diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h
--- a/llvm/include/llvm/CodeGen/LiveIntervals.h
+++ b/llvm/include/llvm/CodeGen/LiveIntervals.h
@@ -377,6 +377,13 @@
     bool checkRegMaskInterference(LiveInterval &LI,
                                   BitVector &UsableRegs);
 
+    /// Collect the slot index and regmask of each regmask slot \p LI overlaps.
+    /// Results are appended. Return false if there is no interference.
+    bool
+    getInterferenceRegMasks(LiveInterval &LI,
+                            SmallVectorImpl<SlotIndex> &RegSlots,
+                            SmallVectorImpl<const uint32_t *> &RegMaskBits);
+
     // Register unit functions.
     //
     // Fixed interference occurs when MachineInstrs use physregs directly
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -952,6 +952,56 @@
   }
 }
 
+bool LiveIntervals::getInterferenceRegMasks(
+    LiveInterval &LI, SmallVectorImpl<SlotIndex> &RegSlots,
+    SmallVectorImpl<const uint32_t *> &RegBits) {
+  if (LI.empty())
+    return false;
+  LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end();
+
+  // Use the smaller per-block arrays when LI lives in a single basic block.
+  ArrayRef<SlotIndex> Slots;
+  ArrayRef<const uint32_t *> Bits;
+  if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) {
+    Slots = getRegMaskSlotsInBlock(MBB->getNumber());
+    Bits = getRegMaskBitsInBlock(MBB->getNumber());
+  } else {
+    Slots = getRegMaskSlots();
+    Bits = getRegMaskBits();
+  }
+
+  // We are going to enumerate all the register mask slots contained in LI.
+  // Start with a binary search of the slots to find a starting point.
+  ArrayRef<SlotIndex>::iterator SlotI = llvm::lower_bound(Slots, LiveI->start);
+  ArrayRef<SlotIndex>::iterator SlotE = Slots.end();
+
+  // No slots in range, LI begins after the last regmask slot.
+  if (SlotI == SlotE)
+    return false;
+
+  bool Found = false;
+  while (true) {
+    assert(*SlotI >= LiveI->start);
+    // Loop over all slots overlapping this segment.
+    while (*SlotI < LiveI->end) {
+      // *SlotI overlaps LI. Record the slot and its mask bits.
+      Found = true;
+      RegSlots.push_back(*SlotI);
+      RegBits.push_back(Bits[SlotI - Slots.begin()]);
+      if (++SlotI == SlotE)
+        return Found;
+    }
+    // *SlotI is beyond the current LI segment.
+    LiveI = LI.advanceTo(LiveI, *SlotI);
+    if (LiveI == LiveE)
+      return Found;
+    // Advance SlotI until it reaches the start of the new segment.
+    while (*SlotI < LiveI->start)
+      if (++SlotI == SlotE)
+        return Found;
+  }
+}
+
 //===----------------------------------------------------------------------===//
 //                         IntervalUpdate class.
 //===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -461,25 +461,13 @@
   case TargetOpcode::ICALL_BRANCH_FUNNEL:
     ExpandICallBranchFunnel(&MBB, MBBI);
     return true;
-  case X86::PLDTILECFG: {
-    MI.RemoveOperand(0);
-    MI.setDesc(TII->get(X86::LDTILECFG));
-    return true;
-  }
-  case X86::PSTTILECFG: {
-    MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg
-    MI.setDesc(TII->get(X86::STTILECFG));
-    return true;
-  }
   case X86::PTILELOADDV: {
-    MI.RemoveOperand(8); // Remove $tmmcfg
     for (unsigned i = 2; i > 0; --i)
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILELOADD));
     return true;
   }
   case X86::PTDPBSSDV: {
-    MI.RemoveOperand(7); // Remove $tmmcfg
     MI.untieRegOperand(4);
     for (unsigned i = 3; i > 0; --i)
       MI.RemoveOperand(i);
@@ -488,14 +476,13 @@
     return true;
   }
   case X86::PTILESTOREDV: {
-    MI.RemoveOperand(8); // Remove $tmmcfg
     for (int i = 1; i >= 0; --i)
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILESTORED));
     return true;
   }
   case X86::PTILEZEROV: {
-    for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg
+    for (int i = 2; i > 0; --i) // Remove row, col
       MI.RemoveOperand(i);
     MI.setDesc(TII->get(X86::TILEZERO));
     return true;
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -2094,8 +2094,14 @@
 
   // Emit tilerelease for AMX kernel.
   const MachineRegisterInfo &MRI = MF.getRegInfo();
-  if (!MRI.reg_nodbg_empty(X86::TMMCFG))
-    BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+  unsigned TileRegNum = RC->getNumRegs();
+  for (unsigned I = 0; I < TileRegNum; I++) {
+    if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) {
+      BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE));
+      break;
+    }
+  }
 }
 
 StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF,
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -4606,7 +4606,6 @@
       SDValue Index = Node->getOperand(5);
       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Chain = Node->getOperand(0);
       MachineSDNode *CNode;
       SDValue Ops[] = {Node->getOperand(2),
@@ -4616,7 +4615,6 @@
                        Index,
                        Disp,
                        Segment,
-                       CFG,
                        Chain};
       CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
       ReplaceNode(Node, CNode);
@@ -4627,14 +4625,12 @@
         break;
       SDValue Chain = Node->getOperand(0);
       unsigned Opc = X86::PTDPBSSDV;
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Ops[] = {Node->getOperand(2),
                        Node->getOperand(3),
                        Node->getOperand(4),
                        Node->getOperand(5),
                        Node->getOperand(6),
                        Node->getOperand(7),
-                       CFG,
                        Chain};
       MachineSDNode *CNode =
           CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
@@ -4646,8 +4642,7 @@
         break;
       unsigned Opc = X86::PTILEZEROV;
       SDValue Chain = Node->getOperand(0);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
-      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain};
+      SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain};
       MachineSDNode *CNode =
           CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops);
       ReplaceNode(Node, CNode);
@@ -4718,7 +4713,6 @@
       SDValue Index = Node->getOperand(5);
       SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
       SDValue Segment = CurDAG->getRegister(0, MVT::i16);
-      SDValue CFG = CurDAG->getRegister(0, MVT::Untyped);
       SDValue Chain = Node->getOperand(0);
       MachineSDNode *CNode;
       SDValue Ops[] = {Node->getOperand(2),
@@ -4729,7 +4723,6 @@
                        Disp,
                        Segment,
                        Node->getOperand(6),
-                       CFG,
                        Chain};
       CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
       ReplaceNode(Node, CNode);
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
--- a/llvm/lib/Target/X86/X86InstrAMX.td
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -48,23 +48,14 @@
                      VEX, T8XD;
 
     // Pseduo instruction for RA.
-    let hasSideEffects = 1, mayLoad = 1,
-        Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
-    def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>;
-
-    let hasSideEffects = 1, mayStore = 1 in
-    def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>;
-
     def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                                                       GR16:$src2,
-                                                      opaquemem:$src3,
-                                                      TILECFG:$cfg), []>;
+                                                      opaquemem:$src3), []>;
     def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1,
                                             GR16:$src2, opaquemem:$src3,
-                                            TILE:$src4, TILECFG:$cfg), []>;
+                                            TILE:$src4), []>;
     def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
-                                                     GR16:$src2,
-                                                     TILECFG:$cfg), []>;
+                                                     GR16:$src2), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
@@ -104,7 +95,7 @@
     let Constraints = "$src4 = $dst" in
     def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1,
                             GR16:$src2, GR16:$src3, TILE:$src4,
-                            TILE:$src5, TILE:$src6, TILECFG:$cfg), []>;
+                            TILE:$src5, TILE:$src6), []>;
 
     let usesCustomInserter = 1 in {
       // Pseudo instructions, using immediates instead of tile registers.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3808,10 +3808,6 @@
     MachineOperand &MO = NewMI->getOperand(2);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if (RC->getID() == X86::TILECFGRegClassID) {
-    unsigned Opc = X86::PSTTILECFG;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
-        .addReg(SrcReg, getKillRegState(isKill));
   } else {
     unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
     bool isAligned =
@@ -3840,10 +3836,6 @@
     MachineOperand &MO = NewMI->getOperand(3);
     MO.setReg(VirtReg);
     MO.setIsKill(true);
-  } else if (RC->getID() == X86::TILECFGRegClassID) {
-    unsigned Opc = X86::PLDTILECFG;
-    addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg),
-                      FrameIdx);
   } else {
     const MachineFunction &MF = *MBB.getParent();
     unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp
--- a/llvm/lib/Target/X86/X86PreTileConfig.cpp
+++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp
@@ -38,6 +38,7 @@
 #include "X86InstrBuilder.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
 #include "llvm/CodeGen/MachineInstr.h"
@@ -62,8 +63,13 @@
   const TargetInstrInfo *TII;
   MachineDominatorTree *DomTree = nullptr;
   MachineRegisterInfo *MRI = nullptr;
+  LiveIntervals *LIS = nullptr;
+  SmallVector<Register, 16> VTileRegs;
+  MachineInstr *TileConfigMI = nullptr;
 
+  void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx);
   MachineInstr *getTileConfigPoint();
+  void reloadTileConfig(int FI);
 
 public:
   X86PreTileConfig() : MachineFunctionPass(ID) {}
@@ -88,20 +94,21 @@
 
 INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig",
                       "Tile Register Configure", false, false)
+INITIALIZE_PASS_DEPENDENCY(LiveIntervals)
 INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
 INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig",
                     "Tile Register Configure", false, false)
 
 void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const {
   AU.setPreservesAll();
+  AU.addRequired<LiveIntervals>();
+  AU.addPreserved<LiveIntervals>();
   AU.addRequired<MachineDominatorTree>();
   MachineFunctionPass::getAnalysisUsage(AU);
 }
 
-static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx,
-                              const TargetInstrInfo *TII,
-                              MachineRegisterInfo *MRI,
-                              const X86Subtarget *ST) {
+void X86PreTileConfig::buildConfigMI(MachineBasicBlock::iterator MI,
+                                     int FrameIdx) {
   auto *MBB = MI->getParent();
 
   // FIXME: AMX should assume AVX512 enabled.
@@ -111,18 +118,15 @@
     BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VPXORDZrr), Zmm)
         .addReg(Zmm, RegState::Undef)
         .addReg(Zmm, RegState::Undef);
-    addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::VMOVUPSZmr)),
-                      FrameIdx)
-        .addReg(Zmm);
+    TileConfigMI = &*addFrameReference(BuildMI(*MBB, MI, DebugLoc(),
+                                               TII->get(X86::VMOVUPSZmr)),
+                                       FrameIdx)
+                         .addReg(Zmm);
   }
 
   // build psuedo ldtilecfg
-  Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass);
-
-  addFrameReference(
-      BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx);
-
-  return VReg;
+  addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)),
+                    FrameIdx);
 }
 
 static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) {
@@ -151,6 +155,7 @@
     const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
     if (RC.getID() != X86::TILERegClassID)
       continue;
+    VTileRegs.push_back(VirtReg);
 
     // Find the common dominator for all MI that define tile register.
     for (const MachineOperand &MO : MRI->def_operands(VirtReg)) {
@@ -219,23 +224,138 @@
   return &*MII;
 }
 
-static void addTileCFGUse(MachineFunction &MF, Register CFG) {
-  for (MachineBasicBlock &MBB : MF) {
+/// Insert an ldtilecfg that reloads the configuration from stack slot \p FI
+/// after every call that may clobber the tile registers, both inside tile
+/// live ranges and between the config point and each tile definition.
+void X86PreTileConfig::reloadTileConfig(int FI) {
+  SmallSet<MachineInstr *, 8> MIVisited;
+  const TargetRegisterClass *RC = TRI->getRegClass(X86::TILERegClassID);
+  auto TileRegNum = RC->getNumRegs();
+
+  for (Register VReg : VTileRegs) {
+    SmallVector<SlotIndex, 8> RegSlots;
+    SmallVector<const uint32_t *, 8> RegMasks;
+    LiveInterval &LI = LIS->getInterval(VReg);
+    if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks))
+      continue;
+    for (unsigned I = 0; I < RegSlots.size(); I++) {
+      MachineInstr *MI = LIS->getInstructionFromIndex(RegSlots[I]);
+      // The tile config has already been reloaded after this instruction.
+      if (MIVisited.count(MI))
+        continue;
+      // For inline assembly, we don't reload tile config register.
+      // If there is any ldtilecfg instruction in inline assembly,
+      // it is user's responsibility to restore everything.
+      if (!MI->isCall())
+        continue;
+      // Start from all tile registers for every call, so that a call which
+      // preserves them cannot hide a later call which clobbers them.
+      BitVector UsableRegs(TRI->getNumRegs());
+      for (unsigned R = 0; R < TileRegNum; R++)
+        UsableRegs.set(X86::TMM0 + R);
+      UsableRegs.clearBitsInMask(RegMasks[I]);
+      MIVisited.insert(MI);
+      // There is no interference in callee. This benefits from IPRA.
+      if (UsableRegs.none())
+        continue;
+
+      // Reload the tile config right after the call.
+      auto *MBB = MI->getParent();
+      auto Pos = std::next(MachineBasicBlock::iterator(MI));
+      addFrameReference(
+          BuildMI(*MBB, Pos, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
+    }
+  }
+  // The loop above only checks calls inside tile data register live ranges.
+  // The config register is not modeled, so we also need to check the calls
+  // that sit between the ldtilecfg and each tile data register def, which no
+  // tile live range covers.
+  //              ldtilecfg
+  //              /       \
+  //             BB1      BB2
+  //             /         \
+  //            call       BB3
+  //            /           \
+  //        %1=tileload   %2=tilezero
+  // We start from the instruction of each tile def and walk backward to the
+  // ldtilecfg. If there is any call instruction that does not preserve the
+  // tile data registers, we insert an ldtilecfg after the call instruction.
+  SmallSet<MachineBasicBlock *, 8> MBBVisited;
+  for (Register VReg : VTileRegs) {
+    for (MachineOperand &MO : MRI->def_operands(VReg)) {
+      if (MO.isUndef())
+        continue;
+      MachineInstr *MI = MO.getParent();
+      // May be PHI instruction.
+      // There must be several def tile before PHI instruction.
+      if (MI->isTransient())
+        continue;
+
+      MachineBasicBlock *MBB = MI->getParent();
+      // Walk backward to see if there is any call after ldtilecfg.
+      std::queue<MachineBasicBlock *> WorkList;
+      WorkList.push(MBB);
+      bool First = true;
+      while (!WorkList.empty()) {
+        MBB = WorkList.front();
+        WorkList.pop();
+        // If we have iterated the basic block before, don't iterate it and
+        // its predecessors again. This may be caused by a loop, or it has a
+        // cross path from several successors, or it has been iterated when
+        // handling another tile register. In the example below, BB1 hits
+        // the condition.
+        //               ldtilecfg
+        //                  |
+        //              ---BB1---
+        //              /        \
+        //            BB2        BB3
+        //            /           \
+        //        %1=tileload   %2=tilezero
+        if (MBBVisited.count(MBB))
+          continue;
+        // For the first MBB, we start from the amx instruction which defines
+        // the tile register.
+        auto I = (First) ? MI->getReverseIterator() : MBB->instr_rbegin();
+        bool ReachedConfig = false;
+        for (auto E = MBB->instr_rend(); I != E; ++I) {
+          // If it is the insertion point for ldtilecfg, then the backward
+          // walk along this path is finished.
+          if (&*I == TileConfigMI) {
+            ReachedConfig = true;
+            break;
+          }
+          if (MIVisited.count(&*I))
+            continue;
+          if (!I->isCall())
+            continue;
+          BitVector UsableRegs(TRI->getNumRegs());
+          for (unsigned R = 0; R < TileRegNum; R++)
+            UsableRegs.set(X86::TMM0 + R);
+          for (MachineOperand &CallMO : I->operands()) {
+            if (CallMO.isRegMask())
+              UsableRegs.clearBitsInMask(CallMO.getRegMask());
+          }
+          // Record the call to avoid double ldtilecfg insert.
+          MIVisited.insert(&*I);
+          if (UsableRegs.none())
+            continue;
+          // Insert ldtilecfg after the call. Use a forward insertion point so
+          // that a call ending the block does not step the iterator off it.
+          auto Pos = std::next(I->getIterator());
+          addFrameReference(
+              BuildMI(*MBB, Pos, DebugLoc(), TII->get(X86::LDTILECFG)), FI);
+        }
+        // Reaching the config point only ends this path; the blocks that are
+        // still queued lie on other paths and must be scanned as well.
+        if (!ReachedConfig) {
+          for (MachineBasicBlock *Pred : MBB->predecessors())
+            WorkList.push(Pred);
+        }
+
-    // Traverse the basic block.
-    for (MachineInstr &MI : MBB) {
-      unsigned Opcode = MI.getOpcode();
-      switch (Opcode) {
-      default:
-        break;
-      case X86::PTILELOADDV:
-      case X86::PTILESTOREDV:
-      case X86::PTDPBSSDV:
-      case X86::PTILEZEROV:
-        unsigned NumOperands = MI.getNumOperands();
-        MI.RemoveOperand(NumOperands - 1);
-        MI.addOperand(MF, MachineOperand::CreateReg(CFG, false));
-        break;
+        // The first MBB may be visited a second time when it is in a loop,
+        // so it is not marked as visited.
+        if (!First)
+          MBBVisited.insert(MBB);
+        First = false;
       }
     }
   }
@@ -248,15 +368,17 @@
   TRI = ST->getRegisterInfo();
   TII = mf.getSubtarget().getInstrInfo();
   DomTree = &getAnalysis<MachineDominatorTree>();
+  LIS = &getAnalysis<LiveIntervals>();
 
-  MachineInstr *MI = getTileConfigPoint();
-  if (!MI)
+  auto *TileConfigPoint = getTileConfigPoint();
+  if (!TileConfigPoint)
     return false;
   unsigned Size = ST->getTileConfigSize();
   Align Alignment = ST->getTileConfigAlignment();
   int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false);
-  Register CFG = buildConfigMI(MI, SS, TII, MRI, ST);
-  addTileCFGUse(mf, CFG);
+  buildConfigMI(TileConfigPoint, SS);
+  reloadTileConfig(SS);
+  VTileRegs.clear();
   return true;
 }
 
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -639,8 +639,3 @@
 let CopyCost = -1 in // Don't allow copying of tile registers
 def TILE : RegisterClass<"X86", [x86amx], 8192,
                          (sequence "TMM%u", 0, 7)> {let Size = 8192;}
-def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> {
-  let CopyCost = -1;  // Don't allow copying of tile config registers.
-  let isAllocatable = 1;
-  let Size = 512;
-}
diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp
--- a/llvm/lib/Target/X86/X86TileConfig.cpp
+++ b/llvm/lib/Target/X86/X86TileConfig.cpp
@@ -22,6 +22,7 @@
 #include "X86MachineFunctionInfo.h"
 #include "X86RegisterInfo.h"
 #include "X86Subtarget.h"
+#include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/CodeGen/LiveIntervals.h"
 #include "llvm/CodeGen/MachineDominators.h"
 #include "llvm/CodeGen/MachineFrameInfo.h"
@@ -130,13 +131,14 @@
 }
 
 MachineInstr *X86TileConfig::getTileConfigPoint() {
-  for (MachineBasicBlock &MBB : *MF) {
-
-    // Traverse the basic block.
-    for (MachineInstr &MI : MBB)
+  MachineBasicBlock *Entry = &*MF->begin();
+  ReversePostOrderTraversal<MachineBasicBlock *> RPOT(Entry);
+  for (MachineBasicBlock *MBB : RPOT) {
+    for (MachineInstr &MI : *MBB)
       // Refer X86PreTileConfig.cpp.
-      // We only support one tile config for now.
-      if (MI.getOpcode() == X86::PLDTILECFG)
+      // We only support one tile config for now. The other ldtilecfg
+      // is for spill purpose and is dominated by the first ldtilecfg.
+      if (MI.getOpcode() == X86::LDTILECFG)
         return &MI;
   }
 
@@ -148,7 +150,7 @@
   if (!MI)
     return;
   MachineBasicBlock *MBB = MI->getParent();
-  int SS = MI->getOperand(1).getIndex();
+  int SS = MI->getOperand(0).getIndex();
   BitVector PhysRegs(TRI->getNumRegs());
 
   // Fill in the palette first.
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll
@@ -1,10 +1,35 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
-
-%struct.__tile_str = type <{ i16, i16, [60 x i8], <256 x i32> }>
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs -enable-ipra | FileCheck -check-prefix=IPRA %s
 
 @buf = dso_local global [3072 x i8] zeroinitializer, align 64
 
+define internal void @foo() #0 {
+; CHECK-LABEL: foo:
+; CHECK:       # %bb.0: # %entry
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movq %rsp, %rbp
+; CHECK-NEXT:    .cfi_def_cfa_register %rbp
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa %rsp, 8
+; CHECK-NEXT:    retq
+;
+; IPRA-LABEL: foo:
+; IPRA:       # %bb.0: # %entry
+; IPRA-NEXT:    pushq %rbp
+; IPRA-NEXT:    .cfi_def_cfa_offset 16
+; IPRA-NEXT:    .cfi_offset %rbp, -16
+; IPRA-NEXT:    movq %rsp, %rbp
+; IPRA-NEXT:    .cfi_def_cfa_register %rbp
+; IPRA-NEXT:    popq %rbp
+; IPRA-NEXT:    .cfi_def_cfa %rsp, 8
+; IPRA-NEXT:    retq
+entry:
+  ret void
+}
+
 define dso_local void @test_api(i16 signext %0, i16 signext %1) nounwind {
 ; CHECK-LABEL: test_api:
 ; CHECK:       # %bb.0:
@@ -25,7 +50,6 @@
 ; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT:    movl $buf, %eax
 ; CHECK-NEXT:    movl $32, %r14d
 ; CHECK-NEXT:    movw $8, %r15w
@@ -36,11 +60,10 @@
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm2
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tilestored %tmm2, 1024(%rsp,%rax) # 1024-byte Folded Spill
-; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl $buf+2048, %eax
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm0
 ; CHECK-NEXT:    movabsq $64, %rcx
 ; CHECK-NEXT:    tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload
@@ -55,17 +78,48 @@
 ; CHECK-NEXT:    popq %rbp
 ; CHECK-NEXT:    tilerelease
 ; CHECK-NEXT:    retq
+;
+; IPRA-LABEL: test_api:
+; IPRA:       # %bb.0:
+; IPRA-NEXT:    pushq %rbp
+; IPRA-NEXT:    subq $64, %rsp
+; IPRA-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; IPRA-NEXT:    vmovdqu64 %zmm0, (%rsp)
+; IPRA-NEXT:    movb $1, (%rsp)
+; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movb %dil, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    movw %si, {{[0-9]+}}(%rsp)
+; IPRA-NEXT:    ldtilecfg (%rsp)
+; IPRA-NEXT:    movl $buf, %eax
+; IPRA-NEXT:    movl $32, %ecx
+; IPRA-NEXT:    movw $8, %dx
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm0
+; IPRA-NEXT:    movl $buf+1024, %eax
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm1
+; IPRA-NEXT:    callq foo
+; IPRA-NEXT:    movl $buf+2048, %eax
+; IPRA-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; IPRA-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; IPRA-NEXT:    tilestored %tmm2, (%rax,%rcx)
+; IPRA-NEXT:    addq $64, %rsp
+; IPRA-NEXT:    popq %rbp
+; IPRA-NEXT:    tilerelease
+; IPRA-NEXT:    vzeroupper
+; IPRA-NEXT:    retq
   %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
   %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
-  tail call void (...) @foo()
+  call void @foo()
   %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
   %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
   tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
   ret void
 }
 
-declare dso_local void @foo(...)
-
 declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
 declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
 declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
+
+attributes #0 = { noinline nounwind optnone uwtable "disable-tail-calls"="false" "frame-pointer"="all" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll
@@ -5,7 +5,6 @@
 ; CHECK-LABEL: test_amx:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    tdpbf16ps %tmm7, %tmm4, %tmm3
-; CHECK-NEXT:    retq
   call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7)
   ret void
 }
diff --git a/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/X86/AMX/amx-ldtilecfg-insert.ll
@@ -0,0 +1,131 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+amx-int8 -mattr=+avx512f -verify-machineinstrs | FileCheck %s
+@buf = dso_local global [3072 x i8] zeroinitializer, align 16
+
+define dso_local void @test1(i16 signext %0, i16 signext %1) local_unnamed_addr {
+; CHECK-LABEL: test1:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %dil, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %si, -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg -{{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movw $8, %dx
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm0
+; CHECK-NEXT:    movl $buf+1024, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm1
+; CHECK-NEXT:    movl $buf+2048, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm1, %tmm0, %tmm2
+; CHECK-NEXT:    tilestored %tmm2, (%rax,%rcx)
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    jmp foo # TAILCALL
+  %3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
+  %4 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %6 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %5, x86_amx %3, x86_amx %4)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %6)
+  tail call void @foo()
+  ret void
+}
+
+define dso_local void @test2(i16 signext %0, i16 signext %1) local_unnamed_addr {
+; CHECK-LABEL: test2:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    pushq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    pushq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    subq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 96
+; CHECK-NEXT:    .cfi_offset %rbx, -24
+; CHECK-NEXT:    .cfi_offset %rbp, -16
+; CHECK-NEXT:    movl %esi, %ebx
+; CHECK-NEXT:    movl %edi, %ebp
+; CHECK-NEXT:    vpxord %zmm0, %zmm0, %zmm0
+; CHECK-NEXT:    vmovdqu64 %zmm0, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $1, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw %bx, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movb %bpl, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    movw $8, {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    vzeroupper
+; CHECK-NEXT:    callq foo
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %al, %al
+; CHECK-NEXT:    jne .LBB1_3
+; CHECK-NEXT:  # %bb.1: # %if.true
+; CHECK-NEXT:    movw $8, %ax
+; CHECK-NEXT:    tilezero %tmm0
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movl $buf+1024, %edx
+; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm1
+; CHECK-NEXT:    movl $buf+2048, %edx
+; CHECK-NEXT:    tileloadd (%rdx,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm1, %tmm0
+; CHECK-NEXT:    tilestored %tmm0, (%rdx,%rcx)
+; CHECK-NEXT:    jmp .LBB1_2
+; CHECK-NEXT:  .LBB1_3: # %if.false
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movl $32, %ecx
+; CHECK-NEXT:    movw $8, %dx
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm3
+; CHECK-NEXT:    movl $buf+1024, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm4
+; CHECK-NEXT:    movl $buf+2048, %eax
+; CHECK-NEXT:    tileloadd (%rax,%rcx), %tmm2
+; CHECK-NEXT:    tdpbssd %tmm2, %tmm4, %tmm3
+; CHECK-NEXT:    tilestored %tmm3, (%rax,%rcx)
+; CHECK-NEXT:  .LBB1_2: # %if.true
+; CHECK-NEXT:    addq $72, %rsp
+; CHECK-NEXT:    .cfi_def_cfa_offset 24
+; CHECK-NEXT:    popq %rbx
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    popq %rbp
+; CHECK-NEXT:    .cfi_def_cfa_offset 8
+; CHECK-NEXT:    tilerelease
+; CHECK-NEXT:    retq
+  call void @foo()
+  br i1 undef, label %if.true, label %if.false
+
+if.true:
+  %t1 = tail call x86_amx @llvm.x86.tilezero.internal(i16 %0, i16 8)
+  %t2 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %t3 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %t4 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t1, x86_amx %t2, x86_amx %t3)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t4)
+  br label %exit
+
+if.false:
+  %t5 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 8, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 0), i64 32)
+  %t6 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 8, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 1024), i64 32)
+  %t7 = tail call x86_amx @llvm.x86.tileloadd64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32)
+  %t8 = tail call x86_amx @llvm.x86.tdpbssd.internal(i16 %0, i16 %1, i16 8, x86_amx %t5, x86_amx %t6, x86_amx %t7)
+  tail call void @llvm.x86.tilestored64.internal(i16 %0, i16 %1, i8* getelementptr inbounds ([3072 x i8], [3072 x i8]* @buf, i64 0, i64 2048), i64 32, x86_amx %t8)
+  br label %exit
+
+exit:
+  ret void
+}
+
+declare dso_local void @foo() local_unnamed_addr
+declare x86_amx @llvm.x86.tilezero.internal(i16, i16)
+declare x86_amx @llvm.x86.tileloadd64.internal(i16, i16, i8*, i64)
+declare x86_amx @llvm.x86.tdpbssd.internal(i16, i16, i16, x86_amx, x86_amx, x86_amx)
+declare void @llvm.x86.tilestored64.internal(i16, i16, i8*, i64, x86_amx)
diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
--- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll
@@ -36,11 +36,10 @@
 ; CHECK-NEXT:    tileloadd (%r15,%r14), %tmm5
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    testb %al, %al
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
-; CHECK-NEXT:    movl $buf, %eax
-; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    jne .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.true
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm0
 ; CHECK-NEXT:    movl $buf+1024, %eax
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm1
@@ -52,11 +51,13 @@
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT:    jmp .LBB0_3
 ; CHECK-NEXT:  .LBB0_2: # %if.false
+; CHECK-NEXT:    movl $buf, %eax
+; CHECK-NEXT:    movw $8, %cx
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm2
 ; CHECK-NEXT:    movl $buf+1024, %eax
 ; CHECK-NEXT:    tileloadd (%rax,%r14), %tmm3
@@ -68,7 +69,7 @@
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movabsq $64, %rax
 ; CHECK-NEXT:    tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload
 ; CHECK-NEXT:    tilestored %tmm6, (%r15,%r14)
@@ -139,7 +140,6 @@
 ; CHECK-NEXT:    movq %rdi, %rbx
 ; CHECK-NEXT:    movl $32, %r14d
 ; CHECK-NEXT:    xorl %ebp, %ebp
-; CHECK-NEXT:    sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill
 ; CHECK-NEXT:    .p2align 4, 0x90
 ; CHECK-NEXT:  .LBB1_2: # %loop.header
 ; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
@@ -149,7 +149,7 @@
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    callq foo
-; CHECK-NEXT:    ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload
+; CHECK-NEXT:    ldtilecfg {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    tilezero %tmm0
 ; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm1
 ; CHECK-NEXT:    tileloadd (%rbx,%r14), %tmm2
diff --git a/llvm/test/CodeGen/X86/opt-pipeline.ll b/llvm/test/CodeGen/X86/opt-pipeline.ll
--- a/llvm/test/CodeGen/X86/opt-pipeline.ll
+++ b/llvm/test/CodeGen/X86/opt-pipeline.ll
@@ -120,6 +120,8 @@
 ; CHECK-NEXT:       X86 EFLAGS copy lowering
 ; CHECK-NEXT:       X86 WinAlloca Expander
 ; CHECK-NEXT:       MachineDominator Tree Construction
+; CHECK-NEXT:       Slot index numbering
+; CHECK-NEXT:       Live Interval Analysis
 ; CHECK-NEXT:       Tile Register Pre-configure
 ; CHECK-NEXT:       Detect Dead Lanes
 ; CHECK-NEXT:       Process Implicit Definitions