diff --git a/llvm/include/llvm/Transforms/Scalar.h b/llvm/include/llvm/Transforms/Scalar.h --- a/llvm/include/llvm/Transforms/Scalar.h +++ b/llvm/include/llvm/Transforms/Scalar.h @@ -133,7 +133,8 @@ // Pass *createLICMPass(); Pass *createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool AllowSpeculation); //===----------------------------------------------------------------------===// // diff --git a/llvm/include/llvm/Transforms/Scalar/LICM.h b/llvm/include/llvm/Transforms/Scalar/LICM.h --- a/llvm/include/llvm/Transforms/Scalar/LICM.h +++ b/llvm/include/llvm/Transforms/Scalar/LICM.h @@ -46,14 +46,18 @@ class LICMPass : public PassInfoMixin { unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; public: LICMPass() : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) + LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(true) {} + LICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} PreservedAnalyses run(Loop &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; @@ -62,14 +66,18 @@ class LNICMPass : public PassInfoMixin { unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; public: LNICMPass() : LicmMssaOptCap(SetLicmMssaOptCap), - LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap) {} - LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap) + LicmMssaNoAccForPromotionCap(SetLicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(true) {} + LNICMPass(unsigned LicmMssaOptCap, unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} PreservedAnalyses run(LoopNest &L, LoopAnalysisManager &AM, LoopStandardAnalysisResults &AR, LPMUpdater &U); }; diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -171,10 +171,13 @@ /// BlockFrequencyInfo, TargetLibraryInfo, Loop, AliasSet information for all /// instructions of the loop and loop safety information as arguments. /// Diagnostics is emitted via \p ORE. It returns changed status. +/// \p AllowSpeculation is whether values should be hoisted even if they are not +/// guaranteed to execute in the loop, but are safe to speculatively execute. bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, BlockFrequencyInfo *, TargetLibraryInfo *, Loop *, MemorySSAUpdater *, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool); + SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool, + bool AllowSpeculation); /// This function deletes dead loops. The caller of this function needs to /// guarantee that the loop is infact dead. @@ -204,12 +207,14 @@ /// LoopInfo, DominatorTree, Loop, AliasSet information for all instructions /// of the loop and loop safety information as arguments. /// Diagnostics is emitted via \p ORE. It returns changed status. +/// \p AllowSpeculation is whether values should be hoisted even if they are not +/// guaranteed to execute in the loop, but are safe to speculatively execute. bool promoteLoopAccessesToScalars( const SmallSetVector &, SmallVectorImpl &, SmallVectorImpl &, SmallVectorImpl &, PredIteratorCache &, LoopInfo *, DominatorTree *, const TargetLibraryInfo *, Loop *, MemorySSAUpdater *, ICFLoopSafetyInfo *, - OptimizationRemarkEmitter *); + OptimizationRemarkEmitter *, bool AllowSpeculation); /// Does a BFS from a given node to all of its children inside a given loop. /// The returned vector of nodes includes the starting point. diff --git a/llvm/lib/Passes/PassBuilderPipelines.cpp b/llvm/lib/Passes/PassBuilderPipelines.cpp --- a/llvm/lib/Passes/PassBuilderPipelines.cpp +++ b/llvm/lib/Passes/PassBuilderPipelines.cpp @@ -293,14 +293,19 @@ LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass(SimpleLoopUnswitchPass()); if (EnableLoopFlatten) LPM1.addPass(LoopFlattenPass()); @@ -470,15 +475,20 @@ LPM1.addPass(LoopSimplifyCFGPass()); // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Disable header duplication in loop rotation at -Oz. LPM1.addPass( LoopRotatePass(Level != OptimizationLevel::Oz, isLTOPreLink(Phase))); // TODO: Investigate promotion cap for O1. - LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM1.addPass( SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3 && EnableO3NonTrivialUnswitching)); @@ -575,7 +585,8 @@ FPM.addPass(DSEPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); FPM.addPass(CoroElidePass()); @@ -1019,7 +1030,8 @@ ExtraPasses.addPass(CorrelatedValuePropagationPass()); ExtraPasses.addPass(InstCombinePass()); LoopPassManager LPM; - LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap)); + LPM.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); LPM.addPass(SimpleLoopUnswitchPass(/* NonTrivial */ Level == OptimizationLevel::O3)); ExtraPasses.addPass( @@ -1087,7 +1099,8 @@ FPM.addPass( RequireAnalysisPass()); FPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*UseMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); } @@ -1627,7 +1640,8 @@ FunctionPassManager MainFPM; MainFPM.addPass(createFunctionToLoopPassAdaptor( - LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap), + LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true), /*USeMemorySSA=*/true, /*UseBlockFrequencyInfo=*/true)); if (RunNewGVN) diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp --- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp +++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp @@ -458,13 +458,18 @@ MPM.add(createLoopSimplifyCFGPass()); } // Try to remove as much code from the loop header as possible, - // to reduce amount of IR that will have to be duplicated. + // to reduce amount of IR that will have to be duplicated. However, + // do not perform speculative hoisting the first time as LICM + // will destroy metadata that may not need to be destroyed if run + // after loop rotation. // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/false)); // Rotate Loop - disable header duplication at -Oz MPM.add(createLoopRotatePass(SizeLevel == 2 ? 0 : -1, PrepareForLTO)); // TODO: Investigate promotion cap for O1. - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); if (EnableSimpleLoopUnswitch) MPM.add(createSimpleLoopUnswitchLegacyPass()); else @@ -529,7 +534,8 @@ // TODO: Investigate if this is too expensive at O1. if (OptLevel > 1) { MPM.add(createDeadStoreEliminationPass()); // Delete dead stores - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } addExtensionsToPM(EP_ScalarOptimizerLate, MPM); @@ -588,7 +594,8 @@ PM.add(createEarlyCSEPass()); PM.add(createCorrelatedValuePropagationPass()); PM.add(createInstructionCombiningPass()); - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget)); PM.add(createCFGSimplificationPass( SimplifyCFGOptions().convertSwitchRangeToICmp(true))); @@ -651,7 +658,8 @@ // unrolled loop is a inner loop, then the prologue will be inside the // outer loop. LICM pass can help to promote the runtime check out if the // checked value is loop invariant. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } PM.add(createWarnMissedTransformationsPass()); @@ -898,7 +906,8 @@ // later might get benefit of no-alias assumption in clone loop. if (UseLoopVersioningLICM) { MPM.add(createLoopVersioningLICMPass()); // Do LoopVersioningLICM - MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); } // We add a fresh GlobalsModRef run at this point. This is particularly @@ -1133,7 +1142,8 @@ // Run a few AA driven optimizations here and now, to cleanup the code. PM.add(createGlobalsAAWrapperPass()); // IP alias analysis. - PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap)); + PM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + /*AllowSpeculation=*/true)); PM.add(NewGVN ? createNewGVNPass() : createGVNPass(DisableGVNLoadPRE)); // Remove redundancies. PM.add(createMemCpyOptPass()); // Remove dead memcpys. diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -149,13 +149,11 @@ BlockFrequencyInfo *BFI, const Loop *CurLoop, ICFLoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU, OptimizationRemarkEmitter *ORE); -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI = nullptr); +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation); static bool pointerInvalidatedByLoop(MemoryLocation MemLoc, AliasSetTracker *CurAST, Loop *CurLoop, AAResults *AA); @@ -188,21 +186,26 @@ OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) : LicmMssaOptCap(LicmMssaOptCap), - LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {} + LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap), + LicmAllowSpeculation(LicmAllowSpeculation) {} private: unsigned LicmMssaOptCap; unsigned LicmMssaNoAccForPromotionCap; + bool LicmAllowSpeculation; }; struct LegacyLICMPass : public LoopPass { static char ID; // Pass identification, replacement for typeid LegacyLICMPass( unsigned LicmMssaOptCap = SetLicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap) - : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap) { + unsigned LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation = true) + : LoopPass(ID), LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation) { initializeLegacyLICMPassPass(*PassRegistry::getPassRegistry()); } @@ -265,7 +268,8 @@ // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(L.getHeader()->getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, AR.BFI, &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); @@ -290,7 +294,8 @@ // but ORE cannot be preserved (see comment before the pass definition). OptimizationRemarkEmitter ORE(LN.getParent()); - LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); Loop &OutermostLoop = LN.getOutermostLoop(); bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, AR.BFI, @@ -321,8 +326,10 @@ Pass *llvm::createLICMPass() { return new LegacyLICMPass(); } Pass *llvm::createLICMPass(unsigned LicmMssaOptCap, - unsigned LicmMssaNoAccForPromotionCap) { - return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap); + unsigned LicmMssaNoAccForPromotionCap, + bool LicmAllowSpeculation) { + return new LegacyLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap, + LicmAllowSpeculation); } llvm::SinkAndHoistLICMFlags::SinkAndHoistLICMFlags(bool IsSink, Loop *L, @@ -418,7 +425,8 @@ Flags.setIsSink(false); if (Preheader) Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, BFI, TLI, L, - &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode); + &MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, + LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -460,8 +468,8 @@ for (const SmallSetVector &PointerMustAliases : collectPromotionCandidates(MSSA, AA, L)) { LocalPromoted |= promoteLoopAccessesToScalars( - PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, - LI, DT, TLI, L, &MSSAU, &SafetyInfo, ORE); + PointerMustAliases, ExitBlocks, InsertPts, MSSAInsertPts, PIC, LI, + DT, TLI, L, &MSSAU, &SafetyInfo, ORE, LicmAllowSpeculation); } Promoted |= LocalPromoted; } while (LocalPromoted); @@ -825,7 +833,8 @@ MemorySSAUpdater *MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, - OptimizationRemarkEmitter *ORE, bool LoopNestMode) { + OptimizationRemarkEmitter *ORE, bool LoopNestMode, + bool AllowSpeculation) { // Verify inputs. assert(N != nullptr && AA != nullptr && LI != nullptr && DT != nullptr && CurLoop != nullptr && MSSAU != nullptr && SafetyInfo != nullptr && @@ -877,7 +886,7 @@ true, &Flags, ORE) && isSafeToExecuteUnconditionally( I, DT, TLI, CurLoop, SafetyInfo, ORE, - CurLoop->getLoopPreheader()->getTerminator())) { + CurLoop->getLoopPreheader()->getTerminator(), AllowSpeculation)) { hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); HoistedInstructions.push_back(&I); @@ -1774,14 +1783,12 @@ /// Only sink or hoist an instruction if it is not a trapping instruction, /// or if the instruction is known not to trap when moved to the preheader. /// or if it is a trapping instruction and is guaranteed to execute. -static bool isSafeToExecuteUnconditionally(Instruction &Inst, - const DominatorTree *DT, - const TargetLibraryInfo *TLI, - const Loop *CurLoop, - const LoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE, - const Instruction *CtxI) { - if (isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) +static bool isSafeToExecuteUnconditionally( + Instruction &Inst, const DominatorTree *DT, const TargetLibraryInfo *TLI, + const Loop *CurLoop, const LoopSafetyInfo *SafetyInfo, + OptimizationRemarkEmitter *ORE, const Instruction *CtxI, + bool AllowSpeculation) { + if (AllowSpeculation && isSafeToSpeculativelyExecute(&Inst, CtxI, DT, TLI)) return true; bool GuaranteedToExecute = @@ -1949,7 +1956,7 @@ SmallVectorImpl &MSSAInsertPts, PredIteratorCache &PIC, LoopInfo *LI, DominatorTree *DT, const TargetLibraryInfo *TLI, Loop *CurLoop, MemorySSAUpdater *MSSAU, ICFLoopSafetyInfo *SafetyInfo, - OptimizationRemarkEmitter *ORE) { + OptimizationRemarkEmitter *ORE, bool AllowSpeculation) { // Verify inputs. assert(LI != nullptr && DT != nullptr && CurLoop != nullptr && SafetyInfo != nullptr && @@ -2054,9 +2061,9 @@ // to execute does as well. Thus we can increase our guaranteed // alignment as well. if (!DereferenceableInPH || (InstAlignment > Alignment)) - if (isSafeToExecuteUnconditionally(*Load, DT, TLI, CurLoop, - SafetyInfo, ORE, - Preheader->getTerminator())) { + if (isSafeToExecuteUnconditionally( + *Load, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AllowSpeculation)) { DereferenceableInPH = true; Alignment = std::max(Alignment, InstAlignment); } diff --git a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll --- a/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll +++ b/llvm/test/Transforms/LoopUnroll/AArch64/runtime-unroll-generic.ll @@ -6,100 +6,40 @@ define void @runtime_unroll_generic(i32 %arg_0, i32* %arg_1, i16* %arg_2, i16* %arg_3) { ; CHECK-A55-LABEL: @runtime_unroll_generic( ; CHECK-A55-NEXT: entry: +; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 +; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_LR_PH:%.*]] +; CHECK-A55: for.body6.lr.ph: ; CHECK-A55-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[ARG_2:%.*]], i64 undef ; CHECK-A55-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[ARG_3:%.*]], i64 undef ; CHECK-A55-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[ARG_1:%.*]], i64 undef -; CHECK-A55-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-A55-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_PREHEADER:%.*]] -; CHECK-A55: for.body6.preheader: -; CHECK-A55-NEXT: [[TMP0:%.*]] = add i32 [[ARG_0]], -1 -; CHECK-A55-NEXT: [[XTRAITER:%.*]] = and i32 [[ARG_0]], 3 -; CHECK-A55-NEXT: [[TMP1:%.*]] = icmp ult i32 [[TMP0]], 3 -; CHECK-A55-NEXT: br i1 [[TMP1]], label [[FOR_END_LOOPEXIT_UNR_LCSSA:%.*]], label [[FOR_BODY6_PREHEADER_NEW:%.*]] -; CHECK-A55: for.body6.preheader.new: -; CHECK-A55-NEXT: [[UNROLL_ITER:%.*]] = and i32 [[ARG_0]], -4 ; CHECK-A55-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-A55: for.body6: -; CHECK-A55-NEXT: [[NITER:%.*]] = phi i32 [ 0, [[FOR_BODY6_PREHEADER_NEW]] ], [ [[NITER_NEXT_3:%.*]], [[FOR_BODY6]] ] -; CHECK-A55-NEXT: [[TMP2:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV:%.*]] = sext i16 [[TMP2]] to i32 -; CHECK-A55-NEXT: [[TMP3:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15:%.*]] = sext i16 [[TMP3]] to i32 +; CHECK-A55-NEXT: [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY6]] ] +; CHECK-A55-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 +; CHECK-A55-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 +; CHECK-A55-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 +; CHECK-A55-NEXT: [[CONV15:%.*]] = sext i16 [[TMP1]] to i32 ; CHECK-A55-NEXT: [[MUL16:%.*]] = mul nsw i32 [[CONV15]], [[CONV]] -; CHECK-A55-NEXT: [[TMP4:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP4]] +; CHECK-A55-NEXT: [[TMP2:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 +; CHECK-A55-NEXT: [[ADD21:%.*]] = add nsw i32 [[MUL16]], [[TMP2]] ; CHECK-A55-NEXT: store i32 [[ADD21]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[TMP5:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_1:%.*]] = sext i16 [[TMP5]] to i32 -; CHECK-A55-NEXT: [[TMP6:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_1:%.*]] = sext i16 [[TMP6]] to i32 -; CHECK-A55-NEXT: [[MUL16_1:%.*]] = mul nsw i32 [[CONV15_1]], [[CONV_1]] -; CHECK-A55-NEXT: [[ADD21_1:%.*]] = add nsw i32 [[MUL16_1]], [[ADD21]] -; CHECK-A55-NEXT: store i32 [[ADD21_1]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[TMP7:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_2:%.*]] = sext i16 [[TMP7]] to i32 -; CHECK-A55-NEXT: [[TMP8:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_2:%.*]] = sext i16 [[TMP8]] to i32 -; CHECK-A55-NEXT: [[MUL16_2:%.*]] = mul nsw i32 [[CONV15_2]], [[CONV_2]] -; CHECK-A55-NEXT: [[ADD21_2:%.*]] = add nsw i32 [[MUL16_2]], [[ADD21_1]] -; CHECK-A55-NEXT: store i32 [[ADD21_2]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[TMP9:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_3:%.*]] = sext i16 [[TMP9]] to i32 -; CHECK-A55-NEXT: [[TMP10:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_3:%.*]] = sext i16 [[TMP10]] to i32 -; CHECK-A55-NEXT: [[MUL16_3:%.*]] = mul nsw i32 [[CONV15_3]], [[CONV_3]] -; CHECK-A55-NEXT: [[ADD21_3:%.*]] = add nsw i32 [[MUL16_3]], [[ADD21_2]] -; CHECK-A55-NEXT: store i32 [[ADD21_3]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[NITER_NEXT_3]] = add i32 [[NITER]], 4 -; CHECK-A55-NEXT: [[NITER_NCMP_3_NOT:%.*]] = icmp eq i32 [[NITER_NEXT_3]], [[UNROLL_ITER]] -; CHECK-A55-NEXT: br i1 [[NITER_NCMP_3_NOT]], label [[FOR_END_LOOPEXIT_UNR_LCSSA]], label [[FOR_BODY6]], !llvm.loop [[LOOP0:![0-9]+]] -; CHECK-A55: for.end.loopexit.unr-lcssa: -; CHECK-A55-NEXT: [[LCMP_MOD_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 0 -; CHECK-A55-NEXT: br i1 [[LCMP_MOD_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL:%.*]] -; CHECK-A55: for.body6.epil: -; CHECK-A55-NEXT: [[TMP11:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_EPIL:%.*]] = sext i16 [[TMP11]] to i32 -; CHECK-A55-NEXT: [[TMP12:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_EPIL:%.*]] = sext i16 [[TMP12]] to i32 -; CHECK-A55-NEXT: [[MUL16_EPIL:%.*]] = mul nsw i32 [[CONV15_EPIL]], [[CONV_EPIL]] -; CHECK-A55-NEXT: [[TMP13:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[ADD21_EPIL:%.*]] = add nsw i32 [[MUL16_EPIL]], [[TMP13]] -; CHECK-A55-NEXT: store i32 [[ADD21_EPIL]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 1 -; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_1:%.*]] -; CHECK-A55: for.body6.epil.1: -; CHECK-A55-NEXT: [[TMP14:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_EPIL_1:%.*]] = sext i16 [[TMP14]] to i32 -; CHECK-A55-NEXT: [[TMP15:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_EPIL_1:%.*]] = sext i16 [[TMP15]] to i32 -; CHECK-A55-NEXT: [[MUL16_EPIL_1:%.*]] = mul nsw i32 [[CONV15_EPIL_1]], [[CONV_EPIL_1]] -; CHECK-A55-NEXT: [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[ADD21_EPIL_1:%.*]] = add nsw i32 [[MUL16_EPIL_1]], [[TMP16]] -; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_1]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[EPIL_ITER_CMP_1_NOT:%.*]] = icmp eq i32 [[XTRAITER]], 2 -; CHECK-A55-NEXT: br i1 [[EPIL_ITER_CMP_1_NOT]], label [[FOR_END]], label [[FOR_BODY6_EPIL_2:%.*]] -; CHECK-A55: for.body6.epil.2: -; CHECK-A55-NEXT: [[TMP17:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 -; CHECK-A55-NEXT: [[CONV_EPIL_2:%.*]] = sext i16 [[TMP17]] to i32 -; CHECK-A55-NEXT: [[TMP18:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 -; CHECK-A55-NEXT: [[CONV15_EPIL_2:%.*]] = sext i16 [[TMP18]] to i32 -; CHECK-A55-NEXT: [[MUL16_EPIL_2:%.*]] = mul nsw i32 [[CONV15_EPIL_2]], [[CONV_EPIL_2]] -; CHECK-A55-NEXT: [[TMP19:%.*]] = load i32, i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: [[ADD21_EPIL_2:%.*]] = add nsw i32 [[MUL16_EPIL_2]], [[TMP19]] -; CHECK-A55-NEXT: store i32 [[ADD21_EPIL_2]], i32* [[ARRAYIDX20]], align 4 -; CHECK-A55-NEXT: br label [[FOR_END]] +; CHECK-A55-NEXT: [[INC]] = add nuw i32 [[K_03]], 1 +; CHECK-A55-NEXT: [[CMP5:%.*]] = icmp ult i32 [[INC]], [[ARG_0]] +; CHECK-A55-NEXT: br i1 [[CMP5]], label [[FOR_BODY6]], label [[FOR_END]], !llvm.loop [[LOOP0:![0-9]+]] ; CHECK-A55: for.end: ; CHECK-A55-NEXT: ret void ; ; CHECK-GENERIC-LABEL: @runtime_unroll_generic( ; CHECK-GENERIC-NEXT: entry: +; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 +; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6_LR_PH:%.*]] +; CHECK-GENERIC: for.body6.lr.ph: ; CHECK-GENERIC-NEXT: [[ARRAYIDX10:%.*]] = getelementptr inbounds i16, i16* [[ARG_2:%.*]], i64 undef ; CHECK-GENERIC-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i16, i16* [[ARG_3:%.*]], i64 undef ; CHECK-GENERIC-NEXT: [[ARRAYIDX20:%.*]] = getelementptr inbounds i32, i32* [[ARG_1:%.*]], i64 undef -; CHECK-GENERIC-NEXT: [[CMP52_NOT:%.*]] = icmp eq i32 [[ARG_0:%.*]], 0 -; CHECK-GENERIC-NEXT: br i1 [[CMP52_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY6:%.*]] +; CHECK-GENERIC-NEXT: br label [[FOR_BODY6:%.*]] ; CHECK-GENERIC: for.body6: -; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY6]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-GENERIC-NEXT: [[K_03:%.*]] = phi i32 [ 0, [[FOR_BODY6_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY6]] ] ; CHECK-GENERIC-NEXT: [[TMP0:%.*]] = load i16, i16* [[ARRAYIDX10]], align 2 ; CHECK-GENERIC-NEXT: [[CONV:%.*]] = sext i16 [[TMP0]] to i32 ; CHECK-GENERIC-NEXT: [[TMP1:%.*]] = load i16, i16* [[ARRAYIDX14]], align 2 diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -88,28 +88,28 @@ define void @matrix_extract_insert_loop(i32 %i, [225 x double]* nonnull align 8 dereferenceable(1800) %A, [225 x double]* nonnull align 8 dereferenceable(1800) %B) { ; CHECK-LABEL: @matrix_extract_insert_loop( ; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I:%.*]], 0 ; CHECK-NEXT: [[TMP0:%.*]] = bitcast [225 x double]* [[A:%.*]] to <225 x double>* -; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I:%.*]] to i64 +; CHECK-NEXT: [[CONV6:%.*]] = zext i32 [[I]] to i64 ; CHECK-NEXT: [[TMP1:%.*]] = bitcast [225 x double]* [[B:%.*]] to <225 x double>* -; CHECK-NEXT: [[CMP212_NOT:%.*]] = icmp eq i32 [[I]], 0 -; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US:%.*]] -; CHECK: for.cond1.preheader.us: +; CHECK-NEXT: br i1 [[CMP212_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] +; CHECK: for.cond1.preheader.us.preheader: ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV6]] ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: -; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ] +; CHECK-NEXT: [[K_013_US:%.*]] = phi i32 [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ], [ [[INC_US:%.*]], [[FOR_BODY4_US]] ] ; CHECK-NEXT: [[CONV_US:%.*]] = zext i32 [[K_013_US]] to i64 -; CHECK-NEXT: [[TMP4:%.*]] = icmp ult i32 [[K_013_US]], 225 -; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP4]]) -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]] -; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP5]], align 8 -; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = load double, double* [[TMP3]], align 8 +; CHECK-NEXT: [[TMP3:%.*]] = icmp ult i32 [[K_013_US]], 225 +; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP3]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[CONV_US]] +; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, double* [[TMP4]], align 8 +; CHECK-NEXT: [[TMP5:%.*]] = load <225 x double>, <225 x double>* [[TMP1]], align 8 +; CHECK-NEXT: [[MATRIXEXT8_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[CONV6]] ; CHECK-NEXT: [[MUL_US:%.*]] = fmul double [[MATRIXEXT_US]], [[MATRIXEXT8_US]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]] -; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = load double, double* [[TMP6]], align 8 +; CHECK-NEXT: [[MATRIXEXT11_US:%.*]] = extractelement <225 x double> [[TMP5]], i64 [[CONV_US]] ; CHECK-NEXT: [[SUB_US:%.*]] = fsub double [[MATRIXEXT11_US]], [[MUL_US]] +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP1]], i64 0, i64 [[CONV_US]] ; CHECK-NEXT: store double [[SUB_US]], double* [[TMP6]], align 8 ; CHECK-NEXT: [[INC_US]] = add nuw nsw i32 [[K_013_US]], 1 ; CHECK-NEXT: [[CMP2_US:%.*]] = icmp ult i32 [[INC_US]], [[I]] @@ -145,8 +145,8 @@ ; CHECK-NEXT: br label [[FOR_BODY4_US_2:%.*]] ; CHECK: for.body4.us.2: ; CHECK-NEXT: [[K_013_US_2:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]] ], [ [[INC_US_2:%.*]], [[FOR_BODY4_US_2]] ] -; CHECK-NEXT: [[NARROW16:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 -; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW16]] to i64 +; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_2]], 30 +; CHECK-NEXT: [[TMP17:%.*]] = zext i32 [[NARROW17]] to i64 ; CHECK-NEXT: [[TMP18:%.*]] = icmp ult i32 [[K_013_US_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP18]]) ; CHECK-NEXT: [[TMP19:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP17]] @@ -168,8 +168,8 @@ ; CHECK-NEXT: br label [[FOR_BODY4_US_3:%.*]] ; CHECK: for.body4.us.3: ; CHECK-NEXT: [[K_013_US_3:%.*]] = phi i32 [ 0, [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]] ], [ [[INC_US_3:%.*]], [[FOR_BODY4_US_3]] ] -; CHECK-NEXT: [[NARROW17:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 -; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW17]] to i64 +; CHECK-NEXT: [[NARROW18:%.*]] = add nuw nsw i32 [[K_013_US_3]], 45 +; CHECK-NEXT: [[TMP24:%.*]] = zext i32 [[NARROW18]] to i64 ; CHECK-NEXT: [[TMP25:%.*]] = icmp ult i32 [[K_013_US_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP25]]) ; CHECK-NEXT: [[TMP26:%.*]] = getelementptr inbounds <225 x double>, <225 x double>* [[TMP0]], i64 0, i64 [[TMP24]] @@ -308,16 +308,10 @@ define <4 x float> @reverse_hadd_v4f32(<4 x float> %a, <4 x float> %b) { ; CHECK-LABEL: @reverse_hadd_v4f32( -; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]] -; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <4 x float> [[B]], <4 x float> undef, <2 x i32> -; CHECK-NEXT: [[TMP7:%.*]] = fadd <2 x float> [[TMP5]], [[TMP6]] -; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <2 x float> [[TMP7]], <2 x float> poison, <4 x i32> -; CHECK-NEXT: [[TMP9:%.*]] = shufflevector <4 x float> [[TMP8]], <4 x float> [[TMP4]], <4 x i32> -; CHECK-NEXT: ret <4 x float> [[TMP9]] +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[A:%.*]], <4 x i32> +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> [[A]], <4 x i32> +; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]] +; CHECK-NEXT: ret <4 x float> [[TMP3]] ; %vecext = extractelement <4 x float> %a, i32 0 %vecext1 = extractelement <4 x float> %a, i32 1 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/hoist-load-of-baseptr.ll @@ -44,15 +44,15 @@ ; OLDPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; OLDPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OLDPM_O2-NEXT: entry: -; OLDPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; OLDPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; OLDPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; OLDPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] ; OLDPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; OLDPM_O2: for.cond1.preheader: ; OLDPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; OLDPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] ; OLDPM_O2: for.body4.preheader: ; OLDPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] @@ -97,8 +97,9 @@ ; OLDPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; OLDPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; OLDPM_O3-NEXT: entry: -; OLDPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; OLDPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; OLDPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; OLDPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O3-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; OLDPM_O3: for.cond1.preheader.us.preheader: ; OLDPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 @@ -107,7 +108,6 @@ ; OLDPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; OLDPM_O3: for.cond1.preheader.us: ; OLDPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; OLDPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; OLDPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] ; OLDPM_O3: vector.body: ; OLDPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] @@ -150,12 +150,12 @@ ; NEWPM_O1-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O1-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O1-NEXT: entry: -; NEWPM_O1-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O1-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O1-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; NEWPM_O1: for.cond1.preheader: ; NEWPM_O1-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; NEWPM_O1-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O1-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4:%.*]] ; NEWPM_O1: for.cond.cleanup: ; NEWPM_O1-NEXT: ret void @@ -176,15 +176,15 @@ ; NEWPM_O2-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O2-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O2-NEXT: entry: -; NEWPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O2-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O2-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O2-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 ; NEWPM_O2-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEMS]], -8 ; NEWPM_O2-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEMS]] ; NEWPM_O2-NEXT: br label [[FOR_COND1_PREHEADER:%.*]] ; NEWPM_O2: for.cond1.preheader: ; NEWPM_O2-NEXT: [[I_08:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC7:%.*]], [[FOR_COND_CLEANUP3:%.*]] ] -; NEWPM_O2-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O2-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP3]], label [[FOR_BODY4_PREHEADER:%.*]] ; NEWPM_O2: for.body4.preheader: ; NEWPM_O2-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_PREHEADER11:%.*]], label [[VECTOR_BODY:%.*]] @@ -229,8 +229,9 @@ ; NEWPM_O3-LABEL: define {{[^@]+}}@_Z7computeRSt6vectorIiSaIiEEy ; NEWPM_O3-SAME: (%"class.std::vector"* nocapture noundef nonnull readonly align 8 dereferenceable(24) [[DATA:%.*]], i64 noundef [[NUMELEMS:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { ; NEWPM_O3-NEXT: entry: -; NEWPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 ; NEWPM_O3-NEXT: [[CMP26_NOT:%.*]] = icmp eq i64 [[NUMELEMS]], 0 +; NEWPM_O3-NEXT: [[_M_START_I:%.*]] = getelementptr inbounds %"class.std::vector", %"class.std::vector"* [[DATA]], i64 0, i32 0, i32 0, i32 0, i32 0 +; NEWPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O3-NEXT: br i1 [[CMP26_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_COND1_PREHEADER_US_PREHEADER:%.*]] ; NEWPM_O3: for.cond1.preheader.us.preheader: ; NEWPM_O3-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEMS]], 8 @@ -239,7 +240,6 @@ ; NEWPM_O3-NEXT: br label [[FOR_COND1_PREHEADER_US:%.*]] ; NEWPM_O3: for.cond1.preheader.us: ; NEWPM_O3-NEXT: [[I_08_US:%.*]] = phi i64 [ [[INC7_US:%.*]], [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US:%.*]] ], [ 0, [[FOR_COND1_PREHEADER_US_PREHEADER]] ] -; NEWPM_O3-NEXT: [[TMP0:%.*]] = load i32*, i32** [[_M_START_I]], align 8 ; NEWPM_O3-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_BODY:%.*]] ; NEWPM_O3: vector.body: ; NEWPM_O3-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ], [ 0, [[FOR_COND1_PREHEADER_US]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/speculation-vs-tbaa.ll @@ -14,13 +14,15 @@ define void @licm(double** align 8 dereferenceable(8) %_M_start.i, i64 %numElem) { ; OLDPM_O1-LABEL: @licm( ; OLDPM_O1-NEXT: entry: -; OLDPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; OLDPM_O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; OLDPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; OLDPM_O1: for.body.lr.ph: +; OLDPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O1-NEXT: br label [[FOR_BODY:%.*]] ; OLDPM_O1: for.body: -; OLDPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; OLDPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; OLDPM_O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O1-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; OLDPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] ; OLDPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -29,12 +31,12 @@ ; ; OLDPM_O23-LABEL: @licm( ; OLDPM_O23-NEXT: entry: -; OLDPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; OLDPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; OLDPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; OLDPM_O23: for.body.preheader: +; OLDPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; OLDPM_O23: for.body.lr.ph: +; OLDPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] ; OLDPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; OLDPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; OLDPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; OLDPM_O23: vector.ph: ; OLDPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 ; OLDPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] @@ -42,38 +44,40 @@ ; OLDPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; OLDPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] ; OLDPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 ; OLDPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA8]] ; OLDPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; OLDPM_O23-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; OLDPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; OLDPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; OLDPM_O23: middle.block: ; OLDPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; OLDPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; OLDPM_O23: for.body.preheader3: -; OLDPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; OLDPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] +; OLDPM_O23: for.body.preheader: +; OLDPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; OLDPM_O23-NEXT: br label [[FOR_BODY:%.*]] ; OLDPM_O23: for.body: -; OLDPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; OLDPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER]] ] ; OLDPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; OLDPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; OLDPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8]] ; OLDPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; OLDPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; OLDPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; OLDPM_O23: for.cond.cleanup: ; OLDPM_O23-NEXT: ret void ; ; NEWPM_O1-LABEL: @licm( ; NEWPM_O1-NEXT: entry: -; NEWPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; NEWPM_O1-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY:%.*]] +; NEWPM_O1-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; NEWPM_O1: for.body.lr.ph: +; NEWPM_O1-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O1-NEXT: br label [[FOR_BODY:%.*]] ; NEWPM_O1: for.body: -; NEWPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ] +; NEWPM_O1-NEXT: [[K_02:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; NEWPM_O1-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O1-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O1-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; NEWPM_O1-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] ; NEWPM_O1-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]] @@ -82,12 +86,12 @@ ; ; NEWPM_O23-LABEL: @licm( ; NEWPM_O23-NEXT: entry: -; NEWPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8 ; NEWPM_O23-NEXT: [[CMP1_NOT:%.*]] = icmp eq i64 [[NUMELEM:%.*]], 0 -; NEWPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_PREHEADER:%.*]] -; NEWPM_O23: for.body.preheader: +; NEWPM_O23-NEXT: br i1 [[CMP1_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY_LR_PH:%.*]] +; NEWPM_O23: for.body.lr.ph: +; NEWPM_O23-NEXT: [[TMP0:%.*]] = load double*, double** [[_M_START_I:%.*]], align 8, !tbaa [[TBAA3:![0-9]+]] ; NEWPM_O23-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[NUMELEM]], 4 -; NEWPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER3:%.*]], label [[VECTOR_PH:%.*]] +; NEWPM_O23-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY_PREHEADER:%.*]], label [[VECTOR_PH:%.*]] ; NEWPM_O23: vector.ph: ; NEWPM_O23-NEXT: [[N_VEC:%.*]] = and i64 [[NUMELEM]], -4 ; NEWPM_O23-NEXT: br label [[VECTOR_BODY:%.*]] @@ -95,26 +99,26 @@ ; NEWPM_O23-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; NEWPM_O23-NEXT: [[TMP1:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[INDEX]] ; NEWPM_O23-NEXT: [[TMP2:%.*]] = bitcast double* [[TMP1]] to <2 x double>* -; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA3:![0-9]+]] +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP2]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O23-NEXT: [[TMP3:%.*]] = getelementptr inbounds double, double* [[TMP1]], i64 2 ; NEWPM_O23-NEXT: [[TMP4:%.*]] = bitcast double* [[TMP3]] to <2 x double>* -; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: store <2 x double> , <2 x double>* [[TMP4]], align 8, !tbaa [[TBAA8]] ; NEWPM_O23-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; NEWPM_O23-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; NEWPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]] +; NEWPM_O23-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; NEWPM_O23: middle.block: ; NEWPM_O23-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[NUMELEM]] -; NEWPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER3]] -; NEWPM_O23: for.body.preheader3: -; NEWPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] +; NEWPM_O23-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY_PREHEADER]] +; NEWPM_O23: for.body.preheader: +; NEWPM_O23-NEXT: [[K_02_PH:%.*]] = phi i64 [ 0, [[FOR_BODY_LR_PH]] ], [ [[N_VEC]], [[MIDDLE_BLOCK]] ] ; NEWPM_O23-NEXT: br label [[FOR_BODY:%.*]] ; NEWPM_O23: for.body: -; NEWPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER3]] ] +; NEWPM_O23-NEXT: [[K_02:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_BODY]] ], [ [[K_02_PH]], [[FOR_BODY_PREHEADER]] ] ; NEWPM_O23-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds double, double* [[TMP0]], i64 [[K_02]] -; NEWPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA3]] +; NEWPM_O23-NEXT: store double 2.000000e+00, double* [[ADD_PTR_I]], align 8, !tbaa [[TBAA8]] ; NEWPM_O23-NEXT: [[INC]] = add nuw i64 [[K_02]], 1 ; NEWPM_O23-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INC]], [[NUMELEM]] -; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; NEWPM_O23-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] ; NEWPM_O23: for.cond.cleanup: ; NEWPM_O23-NEXT: ret void ; diff --git a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll --- a/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/spurious-peeling.ll @@ -32,10 +32,10 @@ ; OLDPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; OLDPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; OLDPM_O23: for.body7.lr.ph.i: -; OLDPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; OLDPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; OLDPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I6_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; OLDPM_O23-NEXT: [[ARRAYIDX_I7_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; OLDPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; OLDPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I4_I]], align 8, !tbaa [[TBAA0]] ; OLDPM_O23-NEXT: [[BASE_I2_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; OLDPM_O23-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I2_I]], align 8, !tbaa [[TBAA8]] @@ -64,10 +64,10 @@ ; NEWPM_O1-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; NEWPM_O1-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; NEWPM_O1: for.body7.lr.ph.i: -; NEWPM_O1-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; NEWPM_O1-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O1-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; NEWPM_O1-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O1-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] ; NEWPM_O1-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; NEWPM_O1-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] @@ -95,10 +95,10 @@ ; NEWPM_O23-NEXT: [[CMP510_NOT_I:%.*]] = icmp eq i32 [[TMP1]], 0 ; NEWPM_O23-NEXT: br i1 [[CMP510_NOT_I]], label [[_ZN12FLOATVECPAIR6VECINCEV_EXIT:%.*]], label [[FOR_BODY7_LR_PH_I:%.*]] ; NEWPM_O23: for.body7.lr.ph.i: -; NEWPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O23-NEXT: [[BASE_I4_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP0]], i64 undef, i32 0 ; NEWPM_O23-NEXT: [[TMP2:%.*]] = load float*, float** [[BASE_I4_I]], align 8, !tbaa [[TBAA8:![0-9]+]] ; NEWPM_O23-NEXT: [[ARRAYIDX_I5_I:%.*]] = getelementptr inbounds float, float* [[TMP2]], i64 undef +; NEWPM_O23-NEXT: [[BASE_I6_I:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[FVP]], i64 0, i32 0, i32 0 ; NEWPM_O23-NEXT: [[TMP3:%.*]] = load %class.HomemadeVector.0*, %class.HomemadeVector.0** [[BASE_I6_I]], align 8, !tbaa [[TBAA0]] ; NEWPM_O23-NEXT: [[BASE_I8_I:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0]], %class.HomemadeVector.0* [[TMP3]], i64 undef, i32 0 ; NEWPM_O23-NEXT: [[TMP4:%.*]] = load float*, float** [[BASE_I8_I]], align 8, !tbaa [[TBAA8]] @@ -130,16 +130,18 @@ ; OLDPM_O1-SAME: (%class.FloatVecPair* [[THIS:%.*]]) local_unnamed_addr comdat align 2 { ; OLDPM_O1-NEXT: entry: ; OLDPM_O1-NEXT: [[VSRC23:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR:%.*]], %class.FloatVecPair* [[THIS]], i64 0, i32 1 -; OLDPM_O1-NEXT: [[VSRCDST:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[THIS]], i64 0, i32 0 ; OLDPM_O1-NEXT: [[CALL2:%.*]] = call %class.HomemadeVector.0* @_ZN14HomemadeVectorIS_IfLj8EELj8EEixEj(%class.HomemadeVector* nonnull [[VSRC23]]) ; OLDPM_O1-NEXT: [[SIZE43:%.*]] = getelementptr inbounds [[CLASS_HOMEMADEVECTOR_0:%.*]], %class.HomemadeVector.0* [[CALL2]], i64 0, i32 1 ; OLDPM_O1-NEXT: [[TMP0:%.*]] = load i32, i32* [[SIZE43]], align 8, !tbaa [[TBAA0:![0-9]+]] ; OLDPM_O1-NEXT: [[CMP54_NOT:%.*]] = icmp eq i32 [[TMP0]], 0 -; OLDPM_O1-NEXT: br i1 [[CMP54_NOT]], label [[FOR_COND_CLEANUP6:%.*]], label [[FOR_BODY7:%.*]] +; OLDPM_O1-NEXT: br i1 [[CMP54_NOT]], label [[FOR_COND_CLEANUP6:%.*]], label [[FOR_BODY7_LR_PH:%.*]] +; OLDPM_O1: for.body7.lr.ph: +; OLDPM_O1-NEXT: [[VSRCDST:%.*]] = getelementptr inbounds [[CLASS_FLOATVECPAIR]], %class.FloatVecPair* [[THIS]], i64 0, i32 0 +; OLDPM_O1-NEXT: br label [[FOR_BODY7:%.*]] ; OLDPM_O1: for.cond.cleanup6: ; OLDPM_O1-NEXT: ret void ; OLDPM_O1: for.body7: -; OLDPM_O1-NEXT: [[J_05:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY7]] ], [ 0, [[ENTRY:%.*]] ] +; OLDPM_O1-NEXT: [[J_05:%.*]] = phi i32 [ 0, [[FOR_BODY7_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY7]] ] ; OLDPM_O1-NEXT: [[CALL9:%.*]] = call %class.HomemadeVector.0* @_ZN14HomemadeVectorIS_IfLj8EELj8EEixEj(%class.HomemadeVector* nonnull [[VSRC23]]) ; OLDPM_O1-NEXT: [[CALL10:%.*]] = call float* @_ZN14HomemadeVectorIfLj8EEixEj(%class.HomemadeVector.0* [[CALL9]]) ; OLDPM_O1-NEXT: [[TMP1:%.*]] = load float, float* [[CALL10]], align 4, !tbaa [[TBAA6:![0-9]+]]