diff --git a/llvm/include/llvm/CodeGen/LiveIntervals.h b/llvm/include/llvm/CodeGen/LiveIntervals.h --- a/llvm/include/llvm/CodeGen/LiveIntervals.h +++ b/llvm/include/llvm/CodeGen/LiveIntervals.h @@ -378,6 +378,12 @@ bool checkRegMaskInterference(LiveInterval &LI, BitVector &UsableRegs); + /// Get the interfering slot indexes and their regmasks for a live interval. + /// Return false if there is no interference. + bool getInterferenceRegMasks(LiveInterval &LI, + SmallVector &RegSlots, + SmallVector &RegMaskBits); + // Register unit functions. // // Fixed interference occurs when MachineInstrs use physregs directly diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp --- a/llvm/lib/CodeGen/LiveIntervals.cpp +++ b/llvm/lib/CodeGen/LiveIntervals.cpp @@ -952,6 +952,56 @@ } } +bool LiveIntervals::getInterferenceRegMasks( + LiveInterval &LI, SmallVector &RegSlots, + SmallVector &RegBits) { + if (LI.empty()) + return false; + LiveInterval::iterator LiveI = LI.begin(), LiveE = LI.end(); + + // Use smaller arrays for local live ranges. + ArrayRef Slots; + ArrayRef Bits; + if (MachineBasicBlock *MBB = intervalIsInOneMBB(LI)) { + Slots = getRegMaskSlotsInBlock(MBB->getNumber()); + Bits = getRegMaskBitsInBlock(MBB->getNumber()); + } else { + Slots = getRegMaskSlots(); + Bits = getRegMaskBits(); + } + + // We are going to enumerate all the register mask slots contained in LI. + // Start with a binary search of RegMaskSlots to find a starting point. + ArrayRef::iterator SlotI = llvm::lower_bound(Slots, LiveI->start); + ArrayRef::iterator SlotE = Slots.end(); + + // No slots in range, LI begins after the last call. + if (SlotI == SlotE) + return false; + + bool Found = false; + while (true) { + assert(*SlotI >= LiveI->start); + // Loop over all slots overlapping this segment. + while (*SlotI < LiveI->end) { + // *SlotI overlaps LI. Collect the slot and its mask bits. 
+ Found = true; + RegSlots.push_back(*SlotI); + RegBits.push_back(Bits[SlotI - Slots.begin()]); + if (++SlotI == SlotE) + return Found; + } + // *SlotI is beyond the current LI segment. + LiveI = LI.advanceTo(LiveI, *SlotI); + if (LiveI == LiveE) + return Found; + // Advance SlotI until it overlaps. + while (*SlotI < LiveI->start) + if (++SlotI == SlotE) + return Found; + } +} + //===----------------------------------------------------------------------===// // IntervalUpdate class. //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp --- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp +++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp @@ -461,25 +461,13 @@ case TargetOpcode::ICALL_BRANCH_FUNNEL: ExpandICallBranchFunnel(&MBB, MBBI); return true; - case X86::PLDTILECFG: { - MI.RemoveOperand(0); - MI.setDesc(TII->get(X86::LDTILECFG)); - return true; - } - case X86::PSTTILECFG: { - MI.RemoveOperand(MI.getNumOperands() - 1); // Remove $tmmcfg - MI.setDesc(TII->get(X86::STTILECFG)); - return true; - } case X86::PTILELOADDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (unsigned i = 2; i > 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILELOADD)); return true; } case X86::PTDPBSSDV: { - MI.RemoveOperand(7); // Remove $tmmcfg MI.untieRegOperand(4); for (unsigned i = 3; i > 0; --i) MI.RemoveOperand(i); @@ -488,14 +476,13 @@ return true; } case X86::PTILESTOREDV: { - MI.RemoveOperand(8); // Remove $tmmcfg for (int i = 1; i >= 0; --i) MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILESTORED)); return true; } case X86::PTILEZEROV: { - for (int i = 3; i > 0; --i) // Remove row, col, $tmmcfg + for (int i = 2; i > 0; --i) // Remove row, col MI.RemoveOperand(i); MI.setDesc(TII->get(X86::TILEZERO)); return true; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ 
b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -2094,8 +2094,12 @@ // Emit tilerelease for AMX kernel. const MachineRegisterInfo &MRI = MF.getRegInfo(); - if (!MRI.reg_nodbg_empty(X86::TMMCFG)) - BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + for (unsigned I = 0; I < 8; I++) { + if (!MRI.reg_nodbg_empty(X86::TMM0 + I)) { + BuildMI(MBB, Terminator, DL, TII.get(X86::TILERELEASE)); + break; // One tilerelease is enough. + } + } } StackOffset X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp --- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp +++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp @@ -4606,7 +4606,6 @@ SDValue Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4616,7 +4615,6 @@ Index, Disp, Segment, - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4627,14 +4625,12 @@ break; SDValue Chain = Node->getOperand(0); unsigned Opc = X86::PTDPBSSDV; - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Node->getOperand(4), Node->getOperand(5), Node->getOperand(6), Node->getOperand(7), - CFG, Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); @@ -4646,8 +4642,7 @@ break; unsigned Opc = X86::PTILEZEROV; SDValue Chain = Node->getOperand(0); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); - SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), CFG, Chain}; + SDValue Ops[] = {Node->getOperand(2), Node->getOperand(3), Chain}; MachineSDNode *CNode = CurDAG->getMachineNode(Opc, dl, {MVT::x86amx, MVT::Other}, Ops); ReplaceNode(Node, CNode); @@ -4718,7 +4713,6 @@ SDValue 
Index = Node->getOperand(5); SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32); SDValue Segment = CurDAG->getRegister(0, MVT::i16); - SDValue CFG = CurDAG->getRegister(0, MVT::Untyped); SDValue Chain = Node->getOperand(0); MachineSDNode *CNode; SDValue Ops[] = {Node->getOperand(2), @@ -4729,7 +4723,6 @@ Disp, Segment, Node->getOperand(6), - CFG, Chain}; CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops); ReplaceNode(Node, CNode); diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td --- a/llvm/lib/Target/X86/X86InstrAMX.td +++ b/llvm/lib/Target/X86/X86InstrAMX.td @@ -48,23 +48,14 @@ VEX, T8XD; // Pseduo instruction for RA. - let hasSideEffects = 1, mayLoad = 1, - Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in - def PLDTILECFG : PseudoI <(outs TILECFG:$cfg), (ins opaquemem:$src), []>; - - let hasSideEffects = 1, mayStore = 1 in - def PSTTILECFG : PseudoI<(outs), (ins opaquemem:$dst, TILECFG:$cfg), []>; - def PTILELOADDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, - opaquemem:$src3, - TILECFG:$cfg), []>; + opaquemem:$src3), []>; def PTILESTOREDV : PseudoI<(outs), (ins GR16:$src1, GR16:$src2, opaquemem:$src3, - TILE:$src4, TILECFG:$cfg), []>; + TILE:$src4), []>; def PTILEZEROV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, - GR16:$src2, - TILECFG:$cfg), []>; + GR16:$src2), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. @@ -104,7 +95,7 @@ let Constraints = "$src4 = $dst" in def PTDPBSSDV : PseudoI<(outs TILE: $dst), (ins GR16:$src1, GR16:$src2, GR16:$src3, TILE:$src4, - TILE:$src5, TILE:$src6, TILECFG:$cfg), []>; + TILE:$src5, TILE:$src6), []>; let usesCustomInserter = 1 in { // Pseudo instructions, using immediates instead of tile registers. 
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp --- a/llvm/lib/Target/X86/X86InstrInfo.cpp +++ b/llvm/lib/Target/X86/X86InstrInfo.cpp @@ -3808,10 +3808,6 @@ MachineOperand &MO = NewMI->getOperand(2); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PSTTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx) - .addReg(SrcReg, getKillRegState(isKill)); } else { unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); bool isAligned = @@ -3840,10 +3836,6 @@ MachineOperand &MO = NewMI->getOperand(3); MO.setReg(VirtReg); MO.setIsKill(true); - } else if (RC->getID() == X86::TILECFGRegClassID) { - unsigned Opc = X86::PLDTILECFG; - addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), - FrameIdx); } else { const MachineFunction &MF = *MBB.getParent(); unsigned Alignment = std::max(TRI->getSpillSize(*RC), 16); diff --git a/llvm/lib/Target/X86/X86PreTileConfig.cpp b/llvm/lib/Target/X86/X86PreTileConfig.cpp --- a/llvm/lib/Target/X86/X86PreTileConfig.cpp +++ b/llvm/lib/Target/X86/X86PreTileConfig.cpp @@ -38,6 +38,7 @@ #include "X86InstrBuilder.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineInstr.h" @@ -62,8 +63,11 @@ const TargetInstrInfo *TII; MachineDominatorTree *DomTree = nullptr; MachineRegisterInfo *MRI = nullptr; + LiveIntervals *LIS = nullptr; + SmallVector VTileRegs; MachineInstr *getTileConfigPoint(); + void reloadTileConfig(int FI); public: X86PreTileConfig() : MachineFunctionPass(ID) {} @@ -88,20 +92,22 @@ INITIALIZE_PASS_BEGIN(X86PreTileConfig, "tilepreconfig", "Tile Register Configure", false, false) +INITIALIZE_PASS_DEPENDENCY(LiveIntervals) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree) INITIALIZE_PASS_END(X86PreTileConfig, "tilepreconfig", "Tile 
Register Configure", false, false) void X86PreTileConfig::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -static Register buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, - const TargetInstrInfo *TII, - MachineRegisterInfo *MRI, - const X86Subtarget *ST) { +static void buildConfigMI(MachineBasicBlock::iterator MI, int FrameIdx, + const TargetInstrInfo *TII, MachineRegisterInfo *MRI, + const X86Subtarget *ST) { auto *MBB = MI->getParent(); // FIXME: AMX should assume AVX512 enabled. @@ -117,12 +123,10 @@ } // build psuedo ldtilecfg - Register VReg = MRI->createVirtualRegister(&X86::TILECFGRegClass); + // Emit ldtilecfg directly; no virtual TILECFG register is created. - addFrameReference( - BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::PLDTILECFG), VReg), FrameIdx); - - return VReg; + addFrameReference(BuildMI(*MBB, MI, DebugLoc(), TII->get(X86::LDTILECFG)), + FrameIdx); } static ShapeT getShape(const MachineInstr &MI, MachineRegisterInfo *MRI) { @@ -151,6 +155,7 @@ const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg); if (RC.getID() != X86::TILERegClassID) continue; + VTileRegs.push_back(VirtReg); // Used by reloadTileConfig(). // Find the common dominator for all MI that define tile register. 
for (const MachineOperand &MO : MRI->def_operands(VirtReg)) { @@ -219,6 +224,7 @@ return &*MII; } +#if 0 static void addTileCFGUse(MachineFunction &MF, Register CFG) { for (MachineBasicBlock &MBB : MF) { @@ -240,6 +246,51 @@ } } } +#endif + +/// Insert an ldtilecfg that reloads the tile config from stack slot \p FI +/// after every call that overlaps the live interval of a tile virtual +/// register and clobbers at least one tile register. +void X86PreTileConfig::reloadTileConfig(int FI) { + SmallSet DoneSet; + + for (Register VReg : VTileRegs) { + SmallVector RegSlots; + SmallVector RegMasks; + LiveInterval &LI = LIS->getInterval(VReg); + if (!LIS->getInterferenceRegMasks(LI, RegSlots, RegMasks)) + continue; + for (unsigned I = 0; I < RegSlots.size(); I++) { + SlotIndex &SI = RegSlots[I]; + MachineInstr *MI = LIS->getInstructionFromIndex(SI); + // We have already reloaded the tile config after this instruction. + if (DoneSet.count(SI)) + continue; + // For inline assembly, we don't reload the tile config register. + // If there is any ldtilecfg instruction in inline assembly, + // it is the user's responsibility to restore everything. + if (!MI->isCall()) + continue; + // Start from all tile registers and drop those the call preserves. + BitVector ClobberedRegs(TRI->getNumRegs()); + for (unsigned J = 0; J < 8; J++) + ClobberedRegs.set(X86::TMM0 + J); + ClobberedRegs.clearBitsInMask(RegMasks[I]); + // No tile register is clobbered by the callee. This benefits from IPRA. 
+ if (ClobberedRegs.none()) + continue; + + // Reload the config after the call; MII may be end(), so don't deref it. + auto *MBB = MI->getParent(); + auto MII = MachineBasicBlock::iterator(MI); + MII++; + addFrameReference( + BuildMI(*MBB, MII, DebugLoc(), TII->get(X86::LDTILECFG)), FI); + + DoneSet.insert(SI); + } + } +} bool X86PreTileConfig::runOnMachineFunction(MachineFunction &mf) { MF = &mf; @@ -248,6 +299,7 @@ TRI = ST->getRegisterInfo(); TII = mf.getSubtarget().getInstrInfo(); DomTree = &getAnalysis(); + LIS = &getAnalysis(); MachineInstr *MI = getTileConfigPoint(); if (!MI) @@ -255,8 +307,10 @@ unsigned Size = ST->getTileConfigSize(); Align Alignment = ST->getTileConfigAlignment(); int SS = mf.getFrameInfo().CreateStackObject(Size, Alignment, false); - Register CFG = buildConfigMI(MI, SS, TII, MRI, ST); - addTileCFGUse(mf, CFG); + buildConfigMI(MI, SS, TII, MRI, ST); + // addTileCFGUse(mf, CFG); + reloadTileConfig(SS); + VTileRegs.clear(); return true; } diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td --- a/llvm/lib/Target/X86/X86RegisterInfo.td +++ b/llvm/lib/Target/X86/X86RegisterInfo.td @@ -639,8 +639,3 @@ let CopyCost = -1 in // Don't allow copying of tile registers def TILE : RegisterClass<"X86", [x86amx], 8192, (sequence "TMM%u", 0, 7)> {let Size = 8192;} -def TILECFG : RegisterClass<"X86", [untyped], 512, (add TMMCFG)> { - let CopyCost = -1; // Don't allow copying of tile config registers. 
- let isAllocatable = 1; - let Size = 512; -} diff --git a/llvm/lib/Target/X86/X86TileConfig.cpp b/llvm/lib/Target/X86/X86TileConfig.cpp --- a/llvm/lib/Target/X86/X86TileConfig.cpp +++ b/llvm/lib/Target/X86/X86TileConfig.cpp @@ -22,6 +22,7 @@ #include "X86MachineFunctionInfo.h" #include "X86RegisterInfo.h" #include "X86Subtarget.h" +#include "llvm/ADT/PostOrderIterator.h" #include "llvm/CodeGen/LiveIntervals.h" #include "llvm/CodeGen/MachineDominators.h" #include "llvm/CodeGen/MachineFrameInfo.h" @@ -130,15 +131,28 @@ } MachineInstr *X86TileConfig::getTileConfigPoint() { + MachineBasicBlock *Entry = &*MF->begin(); + ReversePostOrderTraversal RPOT(Entry); + for (MachineBasicBlock *MBB : RPOT) { + for (MachineInstr &MI : *MBB) + // Refer to X86PreTileConfig.cpp. + // We only support one tile config for now. Any other ldtilecfg + // is for spill purposes and is dominated by the first ldtilecfg. + if (MI.getOpcode() == X86::LDTILECFG) + return &MI; + } + +#if 0 for (MachineBasicBlock &MBB : *MF) { // Traverse the basic block. for (MachineInstr &MI : MBB) // Refer X86PreTileConfig.cpp. // We only support one tile config for now. - if (MI.getOpcode() == X86::PLDTILECFG) + if (MI.getOpcode() == X86::LDTILECFG) return &MI; } +#endif return nullptr; } @@ -148,7 +162,7 @@ if (!MI) return; MachineBasicBlock *MBB = MI->getParent(); - int SS = MI->getOperand(1).getIndex(); + int SS = MI->getOperand(0).getIndex(); BitVector PhysRegs(TRI->getNumRegs()); // Fill in the palette first. 
diff --git a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll --- a/llvm/test/CodeGen/X86/AMX/amx-across-func.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-across-func.ll @@ -34,7 +34,6 @@ ; CHECK-NEXT: movb $8, {{[0-9]+}}(%rsp) ; CHECK-NEXT: movw %bx, {{[0-9]+}}(%rsp) ; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill ; CHECK-NEXT: movl $buf, %eax ; CHECK-NEXT: movl $32, %r14d ; CHECK-NEXT: movw $8, %r15w @@ -48,8 +47,8 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movl $buf+2048, %eax -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movabsq $64, %rcx ; CHECK-NEXT: tileloadd 2048(%rsp,%rcx), %tmm1 # 1024-byte Folded Reload diff --git a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll --- a/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-bf16-intrinsics.ll @@ -5,7 +5,6 @@ ; CHECK-LABEL: test_amx: ; CHECK: # %bb.0: ; CHECK-NEXT: tdpbf16ps %tmm7, %tmm4, %tmm3 -; CHECK-NEXT: retq call void @llvm.x86.tdpbf16ps(i8 3, i8 4, i8 7) ret void } diff --git a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll --- a/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-spill-merge.ll @@ -48,11 +48,10 @@ ; CHECK-NEXT: tileloadd (%r15,%r14), %tmm5 ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: sttilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Spill -; CHECK-NEXT: movl $buf, %eax -; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: jne .LBB0_2 ; CHECK-NEXT: # %bb.1: # %if.true +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm0 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd 
(%rax,%r14), %tmm1 @@ -64,11 +63,13 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: jmp .LBB0_3 ; CHECK-NEXT: .LBB0_2: # %if.false +; CHECK-NEXT: movl $buf, %eax +; CHECK-NEXT: movw $8, %cx ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm2 ; CHECK-NEXT: movl $buf+1024, %eax ; CHECK-NEXT: tileloadd (%rax,%r14), %tmm3 @@ -80,7 +81,7 @@ ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: vzeroupper ; CHECK-NEXT: callq foo -; CHECK-NEXT: ldtilecfg {{[-0-9]+}}(%r{{[sb]}}p) # 64-byte Folded Reload +; CHECK-NEXT: ldtilecfg {{[0-9]+}}(%rsp) ; CHECK-NEXT: movabsq $64, %rax ; CHECK-NEXT: tileloadd 1024(%rsp,%rax), %tmm6 # 1024-byte Folded Reload ; CHECK-NEXT: tilestored %tmm6, (%r15,%r14)