diff --git a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp --- a/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp +++ b/llvm/lib/Target/X86/X86LowerAMXIntrinsics.cpp @@ -50,10 +50,41 @@ return false; } -static BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, - Value *Bound, Value *Step, StringRef Name, - IRBuilderBase &B, DomTreeUpdater &DTU, Loop *L, - LoopInfo &LI) { +namespace { +class X86LowerAMXIntrinsics { + Function &Func; + +public: + X86LowerAMXIntrinsics(Function &F, DominatorTree *DomT, LoopInfo *LoopI) + : Func(F), DT(DomT), LI(LoopI) { + DTU.reset(new DomTreeUpdater(DT, DomTreeUpdater::UpdateStrategy::Lazy)); + } + bool visit(); + +private: + DominatorTree *DT; + LoopInfo *LI; + std::unique_ptr DTU; + BasicBlock *createLoop(BasicBlock *Preheader, BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, IRBuilderBase &B, + Loop *L); + template + Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *Ptr, Value *Stride, Value *Tile); + Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, + IRBuilderBase &B, Value *Row, Value *Col, + Value *K, Value *Acc, Value *LHS, Value *RHS); + template + bool lowerTileLoadStore(Instruction *TileLoadStore); + bool lowerTileDPBSSD(Instruction *TileDPBSSD); + bool lowerTileZero(Instruction *TileZero); +}; + +BasicBlock *X86LowerAMXIntrinsics::createLoop(BasicBlock *Preheader, + BasicBlock *Exit, Value *Bound, + Value *Step, StringRef Name, + IRBuilderBase &B, Loop *L) { LLVMContext &Ctx = Preheader->getContext(); BasicBlock *Header = BasicBlock::Create(Ctx, Name + ".header", Preheader->getParent(), Exit); @@ -78,43 +109,47 @@ BranchInst *PreheaderBr = cast(Preheader->getTerminator()); BasicBlock *Tmp = PreheaderBr->getSuccessor(0); PreheaderBr->setSuccessor(0, Header); - DTU.applyUpdatesPermissive({ - {DominatorTree::Delete, Preheader, Tmp}, - {DominatorTree::Insert, Header, Body}, - {DominatorTree::Insert, Body, Latch}, - {DominatorTree::Insert, Latch, Header}, - {DominatorTree::Insert, Latch, Exit}, - {DominatorTree::Insert, Preheader, Header}, - }); - - L->addBasicBlockToLoop(Header, LI); - L->addBasicBlockToLoop(Body, LI); - L->addBasicBlockToLoop(Latch, LI); + if (DT) { + DTU->applyUpdatesPermissive({ + {DominatorTree::Delete, Preheader, Tmp}, + {DominatorTree::Insert, Header, Body}, + {DominatorTree::Insert, Body, Latch}, + {DominatorTree::Insert, Latch, Header}, + {DominatorTree::Insert, Latch, Exit}, + {DominatorTree::Insert, Preheader, Header}, + }); + } + if (LI) { + L->addBasicBlockToLoop(Header, *LI); + L->addBasicBlockToLoop(Body, *LI); + L->addBasicBlockToLoop(Latch, *LI); + } return Body; } template -static Value *createTileLoadStoreLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *Ptr, Value *Stride, Value *Tile) { +Value *X86LowerAMXIntrinsics::createTileLoadStoreLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *Ptr, Value *Stride, Value *Tile) { std::string IntrinName = IsTileLoad ? "tileload" : "tilestore"; - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); + Loop *RowLoop = nullptr; + Loop *ColLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), IntrinName + ".scalarize.rows", - B, DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + IntrinName + ".scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - IntrinName + ".scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + IntrinName + ".scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); @@ -181,35 +216,36 @@ } } -static Value *createTileDPBSSDLoops(BasicBlock *Start, BasicBlock *End, - IRBuilderBase &B, DomTreeUpdater &DTU, - LoopInfo &LI, Value *Row, Value *Col, - Value *K, Value *Acc, Value *LHS, - Value *RHS) { - Loop *RowLoop = LI.AllocateLoop(); - Loop *ColLoop = LI.AllocateLoop(); - Loop *InnerLoop = LI.AllocateLoop(); - ColLoop->addChildLoop(InnerLoop); - RowLoop->addChildLoop(ColLoop); - if (Loop *ParentL = LI.getLoopFor(Start)) - ParentL->addChildLoop(RowLoop); - else - LI.addTopLevelLoop(RowLoop); +Value *X86LowerAMXIntrinsics::createTileDPBSSDLoops( + BasicBlock *Start, BasicBlock *End, IRBuilderBase &B, Value *Row, + Value *Col, Value *K, Value *Acc, Value *LHS, Value *RHS) { + Loop *RowLoop = nullptr; + Loop *ColLoop = nullptr; + Loop *InnerLoop = nullptr; + if (LI) { + RowLoop = LI->AllocateLoop(); + ColLoop = LI->AllocateLoop(); + InnerLoop = LI->AllocateLoop(); + ColLoop->addChildLoop(InnerLoop); + RowLoop->addChildLoop(ColLoop); + if (Loop *ParentL = LI->getLoopFor(Start)) + ParentL->addChildLoop(RowLoop); + else + LI->addTopLevelLoop(RowLoop); + } - BasicBlock *RowBody = - createLoop(Start, End, Row, B.getInt16(1), "tiledpbssd.scalarize.rows", B, - DTU, RowLoop, LI); + BasicBlock *RowBody = createLoop(Start, End, Row, B.getInt16(1), + "tiledpbssd.scalarize.rows", B, RowLoop); BasicBlock *RowLatch = RowBody->getSingleSuccessor(); - BasicBlock *ColBody = - createLoop(RowBody, RowLatch, Col, B.getInt16(1), - "tiledpbssd.scalarize.cols", B, DTU, ColLoop, LI); + BasicBlock *ColBody = createLoop(RowBody, RowLatch, Col, B.getInt16(1), + "tiledpbssd.scalarize.cols", B, ColLoop); BasicBlock *ColLoopLatch = ColBody->getSingleSuccessor(); B.SetInsertPoint(ColBody->getTerminator()); BasicBlock *InnerBody = createLoop(ColBody, ColLoopLatch, K, B.getInt16(1), - "tiledpbssd.scalarize.inner", B, DTU, InnerLoop, LI); + "tiledpbssd.scalarize.inner", B, InnerLoop); BasicBlock *ColLoopHeader = ColBody->getSinglePredecessor(); BasicBlock *RowLoopHeader = RowBody->getSinglePredecessor(); @@ -324,30 +360,11 @@ return NewVecD; } -namespace { -class X86LowerAMXIntrinsics { - Function &Func; - -public: - X86LowerAMXIntrinsics(Function &F, DominatorTree *DT, LoopInfo *LI) - : Func(F), DT(DT), LI(LI) {} - bool visit(); - -private: - DominatorTree *DT; - LoopInfo *LI; - template - bool lowerTileLoadStore(Instruction *TileLoadStore); - bool lowerTileDPBSSD(Instruction *TileDPBSSD); - bool lowerTileZero(Instruction *TileZero); -}; - bool X86LowerAMXIntrinsics::lowerTileDPBSSD(Instruction *TileDPBSSD) { Value *M, *N, *K, *C, *A, *B; match(TileDPBSSD, m_Intrinsic( m_Value(M), m_Value(N), m_Value(K), m_Value(C), m_Value(A), m_Value(B))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileDPBSSD; IRBuilder<> PreBuilder(TileDPBSSD); PreBuilder.SetInsertPoint(TileDPBSSD); @@ -360,8 +377,8 @@ BasicBlock *End = SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); IRBuilder<> Builder(TileDPBSSD); - Value *ResVec = createTileDPBSSDLoops(Start, End, Builder, DTU, *LI, M, - NDWord, KDWord, C, A, B); + Value *ResVec = + createTileDPBSSDLoops(Start, End, Builder, M, NDWord, KDWord, C, A, B); // we cannot assume there always be bitcast after tiledpbssd. So we need to // insert one bitcast as required Builder.SetInsertPoint(End->getFirstNonPHI()); @@ -394,7 +411,6 @@ m_Value(M), m_Value(N), m_Value(Ptr), m_Value(Stride), m_Value(Tile))); - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); Instruction *InsertI = TileLoadStore; IRBuilder<> PreBuilder(TileLoadStore); PreBuilder.SetInsertPoint(TileLoadStore); @@ -405,7 +421,7 @@ SplitBlock(InsertI->getParent(), InsertI, DT, LI, nullptr, "continue"); IRBuilder<> Builder(TileLoadStore); Value *ResVec = createTileLoadStoreLoops( - Start, End, Builder, DTU, *LI, M, NDWord, Ptr, StrideDWord, + Start, End, Builder, M, NDWord, Ptr, StrideDWord, IsTileLoad ? nullptr : Tile); if (IsTileLoad) { // we cannot assume there always be bitcast after tileload. So we need to @@ -505,18 +521,18 @@ TM->getOptLevel() != CodeGenOpt::None) return false; - auto &DT = getAnalysis().getDomTree(); - auto &LI = getAnalysis().getLoopInfo(); + auto *DTWP = getAnalysisIfAvailable(); + auto *DT = DTWP ? &DTWP->getDomTree() : nullptr; + auto *LIWP = getAnalysisIfAvailable(); + auto *LI = LIWP ? &LIWP->getLoopInfo() : nullptr; - X86LowerAMXIntrinsics LAT(F, &DT, &LI); + X86LowerAMXIntrinsics LAT(F, DT, LI); return LAT.visit(); } StringRef getPassName() const override { return "Lower AMX intrinsics"; } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); AU.addPreserved(); - AU.addRequired(); AU.addPreserved(); AU.addRequired(); } @@ -528,8 +544,6 @@ char X86LowerAMXIntrinsicsLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) -INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(TargetPassConfig) INITIALIZE_PASS_END(X86LowerAMXIntrinsicsLegacyPass, DEBUG_TYPE, PassName, false, false) diff --git a/llvm/test/CodeGen/X86/O0-pipeline.ll b/llvm/test/CodeGen/X86/O0-pipeline.ll --- a/llvm/test/CodeGen/X86/O0-pipeline.ll +++ b/llvm/test/CodeGen/X86/O0-pipeline.ll @@ -18,8 +18,6 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll --- a/llvm/test/CodeGen/X86/opt-pipeline.ll +++ b/llvm/test/CodeGen/X86/opt-pipeline.ll @@ -24,12 +24,12 @@ ; CHECK-NEXT: Pre-ISel Intrinsic Lowering ; CHECK-NEXT: FunctionPass Manager ; CHECK-NEXT: Expand Atomic instructions -; CHECK-NEXT: Dominator Tree Construction -; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Lower AMX intrinsics ; CHECK-NEXT: Lower AMX type for load/store ; CHECK-NEXT: Module Verifier +; CHECK-NEXT: Dominator Tree Construction ; CHECK-NEXT: Basic Alias Analysis (stateless AA impl) +; CHECK-NEXT: Natural Loop Information ; CHECK-NEXT: Canonicalize natural loops ; CHECK-NEXT: Scalar Evolution Analysis ; CHECK-NEXT: Loop Pass Manager