diff --git a/llvm/lib/CodeGen/Spill2Reg.cpp b/llvm/lib/CodeGen/Spill2Reg.cpp --- a/llvm/lib/CodeGen/Spill2Reg.cpp +++ b/llvm/lib/CodeGen/Spill2Reg.cpp @@ -102,6 +102,9 @@ /// Helper for generateCode(). It replaces stack spills or reloads with movs /// to \p LI.reg(). void replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg); + /// Updates the live-ins of MBBs after we emit the new spill2reg instructions + /// and the vector registers become live from register spills to reloads. + void updateLiveIns(StackSlotDataEntry &Entry, MCRegister VectorReg); /// Updates \p LRU with the liveness of physical registers around the spills /// and reloads in \p Entry. void calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU); @@ -114,6 +117,9 @@ /// Map from a stack slot to the corresponding spills and reloads. DenseMap<int, StackSlotDataEntry> StackSlotData; + /// The registers used by each block (from LiveRegUnits). This is needed for + /// finding free physical registers in generateCode(). + DenseMap<MachineBasicBlock *, LiveRegUnits> LRUs; MachineFunction *MF = nullptr; MachineRegisterInfo *MRI = nullptr; @@ -177,7 +183,16 @@ // If any spill/reload for a stack slot is found not to be eligible for // spill-to-reg, then that stack slot is disabled. for (MachineBasicBlock &MBB : *MF) { - for (MachineInstr &MI : MBB) { + // Initialize AccumMBBLRU for keeping track of physical registers used + // across the whole MBB. + LiveRegUnits AccumMBBLRU(*TRI); + AccumMBBLRU.addLiveOuts(MBB); + + // Collect spills/reloads + for (MachineInstr &MI : llvm::reverse(MBB)) { + // Update the LRU state as we move upwards. + AccumMBBLRU.accumulate(MI); + int StackSlot; if (const MachineOperand *MO = TII->isStoreToStackSlotMO(MI, StackSlot)) { MachineInstr *Spill = &MI; @@ -211,6 +226,8 @@ } } } + + LRUs.insert(std::make_pair(&MBB, AccumMBBLRU)); } } @@ -236,6 +253,25 @@ return None; } +/// Perform a bottom-up depth-first traversal from \p MBB at \p MI towards its +/// predecessor blocks. Visited marks the visited blocks. 
\p Fn is the +/// callback function called in pre-order. If \p Fn returns true we stop the +/// traversal. +static void DFS(MachineBasicBlock *MBB, DenseSet<MachineBasicBlock *> &Visited, + std::function<bool(MachineBasicBlock *)> Fn) { + // Skip visited to avoid infinite loops. + if (Visited.count(MBB)) + return; + Visited.insert(MBB); + + // Preorder. + if (Fn(MBB)) + return; + + // Depth-first across predecessors. + for (MachineBasicBlock *PredMBB : MBB->predecessors()) + DFS(PredMBB, Visited, Fn); +} // Replace stack-based spills/reloads with register-based ones. void Spill2Reg::replaceStackWithReg(StackSlotDataEntry &Entry, Register VectorReg) { @@ -248,6 +284,9 @@ VectorReg, OldReg, SpillData.MemBits, StackSpill->getParent(), /*InsertBeforeIt=*/StackSpill->getIterator(), TRI); + // Mark VectorReg as live in the instr's BB. + LRUs[StackSpill->getParent()].addReg(VectorReg); + // Spill to stack is no longer needed. StackSpill->eraseFromParent(); assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()"); @@ -262,6 +301,9 @@ OldReg, VectorReg, ReloadData.MemBits, StackReload->getParent(), /*InsertBeforeIt=*/StackReload->getIterator(), TRI); + // Mark VectorReg as live in the instr's BB. + LRUs[StackReload->getParent()].addReg(VectorReg); + // Reload from stack is no longer needed. StackReload->eraseFromParent(); assert(OldReg.isPhysical() && "Otherwise we need to removeInterval()"); @@ -270,7 +312,86 @@ void Spill2Reg::calculateLiveRegs(StackSlotDataEntry &Entry, LiveRegUnits &LRU) { - // TODO: Unimplemented + // Collect the parent MBBs of Spills for fast lookup. + DenseSet<MachineBasicBlock *> SpillMBBs(Entry.Spills.size()); + DenseSet<MachineInstr *> Spills(Entry.Spills.size()); + for (const auto &Data : Entry.Spills) { + SpillMBBs.insert(Data.MI->getParent()); + Spills.insert(Data.MI); + } + + /// Walks up the instructions in \p Reload's block, stopping at a spill if + found. \Returns true if a spill was found, false otherwise. 
+ auto AccumulateLRUUntilSpillFn = [&Spills, &SpillMBBs](MachineInstr *Reload, + LiveRegUnits &LRU) { + MachineBasicBlock *MBB = Reload->getParent(); + bool IsSpillBlock = SpillMBBs.count(MBB); + // Add all MBB's live-outs. + LRU.addLiveOuts(*MBB); + // Walk up the BB, starting from Reload, looking for any spill. + for (MachineInstr *CurrMI = Reload; CurrMI != nullptr; + CurrMI = CurrMI->getPrevNode()) { + LRU.accumulate(*CurrMI); + // If a spill is found then return true to end the recursion. + if (IsSpillBlock && Spills.count(CurrMI)) + return true; + } + return false; + }; + + // Helper for the traversal. It accumulates all register units used in \p + // MBB from \p MI upwards. It returns true once a spill is found. + auto AccumulateLRUFn = [&SpillMBBs, &LRU, AccumulateLRUUntilSpillFn, + this](MachineBasicBlock *MBB) { + if (SpillMBBs.count(MBB)) { + // If this is a spill block, then walk bottom-up until the spill. + assert(!MBB->empty() && "How can it be a spill block and empty?"); + bool FoundSpill = AccumulateLRUUntilSpillFn(&*MBB->rbegin(), LRU); + assert(FoundSpill && "Spill block but we couldn't find spill!"); + // We return true to stop the recursion. + return true; + } else { + // Else this is an intermediate block between the spills and reloads and + // there is no spill in it, then use the pre-computed LRU to avoid walking + // it again. This improves compilation time. + LRU.addUnits(LRUs[MBB].getBitVector()); + // We return false to continue the recursion. + return false; + } + }; + + /// \Returns the LiveRegUnits at `Reload` by stepping back the BB. + auto GetReloadLRU = [this](MachineInstr *Reload) { + LiveRegUnits ReloadLRU(*TRI); + MachineBasicBlock *MBB = Reload->getParent(); + ReloadLRU.addLiveOuts(*MBB); + // Start at the bottom of the BB and walk up until we find `Reload`. 
+ for (MachineInstr &MI : llvm::reverse(*MBB)) { + if (&MI == Reload) + break; + ReloadLRU.stepBackward(MI); + } + return ReloadLRU; + }; + + // Start from each Reload and walk up the CFG with a depth-first traversal, + // looking for spills. Upon finding a spill we don't go beyond that point. In + // the meantime we accumulate the registers used. This is then used to find + // free physical registers. + DenseSet<MachineBasicBlock *> Visited; + for (const auto &ReloadData : Entry.Reloads) { + MachineInstr *Reload = ReloadData.MI; + // Add the Reload's LRU to the total LRU for the whole Spill-Reload range. + LiveRegUnits ReloadLRU = GetReloadLRU(Reload); + bool FoundSpill = AccumulateLRUUntilSpillFn(Reload, ReloadLRU); + LRU.addUnits(ReloadLRU.getBitVector()); + + // Traverse the CFG bottom-up accumulating LRUs until we reach the Spills. + if (!FoundSpill) { + for (MachineBasicBlock *PredMBB : Reload->getParent()->predecessors()) + DFS(PredMBB, Visited, AccumulateLRUFn); + } + } } void Spill2Reg::generateCode() { @@ -301,7 +422,10 @@ } } -void Spill2Reg::cleanup() { StackSlotData.clear(); } +void Spill2Reg::cleanup() { + StackSlotData.clear(); + LRUs.clear(); +} bool Spill2Reg::run() { // Walk over each instruction in the code keeping track of the processor's diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_16bit.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s + +; End-to-end check that Spill2Reg works with 16-bit registers. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@D0 = dso_local local_unnamed_addr global i16 0, align 4 +@D1 = dso_local local_unnamed_addr global i16 0, align 4 +@D2 = dso_local local_unnamed_addr global i16 0, align 4 +@D3 = dso_local local_unnamed_addr global i16 0, align 4 +@D4 = dso_local local_unnamed_addr global i16 0, align 4 +@D5 = dso_local local_unnamed_addr global i16 0, align 4 +@D6 = dso_local local_unnamed_addr global i16 0, align 4 +@D7 = dso_local local_unnamed_addr global i16 0, align 4 +@D8 = dso_local local_unnamed_addr global i16 0, align 4 +@D9 = dso_local local_unnamed_addr global i16 0, align 4 +@D10 = dso_local local_unnamed_addr global i16 0, align 4 +@D11 = dso_local local_unnamed_addr global i16 0, align 4 +@D12 = dso_local local_unnamed_addr global i16 0, align 4 +@D13 = dso_local local_unnamed_addr global i16 0, align 4 +@D14 = dso_local local_unnamed_addr global i16 0, align 4 +@D15 = dso_local local_unnamed_addr global i16 0, align 4 +@D16 = dso_local local_unnamed_addr global i16 0, align 4 +@D17 = dso_local local_unnamed_addr global i16 0, align 4 +@D18 = dso_local local_unnamed_addr global i16 0, align 4 +@U0 = dso_local local_unnamed_addr global i16 0, align 4 +@U1 = dso_local local_unnamed_addr global i16 0, align 4 +@U2 = dso_local local_unnamed_addr global i16 0, align 4 +@U3 = dso_local local_unnamed_addr global i16 0, align 4 +@U4 = dso_local local_unnamed_addr global i16 0, align 4 +@U5 = dso_local local_unnamed_addr global i16 0, align 4 +@U6 = dso_local local_unnamed_addr global i16 0, align 4 +@U7 = dso_local local_unnamed_addr global i16 0, align 4 +@U8 = dso_local local_unnamed_addr global i16 0, align 4 +@U9 = dso_local local_unnamed_addr global i16 0, align 4 +@U10 = dso_local local_unnamed_addr global i16 0, align 4 +@U11 = dso_local local_unnamed_addr global i16 0, align 4 +@U12 = dso_local local_unnamed_addr global i16 
0, align 4 +@U13 = dso_local local_unnamed_addr global i16 0, align 4 +@U14 = dso_local local_unnamed_addr global i16 0, align 4 +@U15 = dso_local local_unnamed_addr global i16 0, align 4 +@U16 = dso_local local_unnamed_addr global i16 0, align 4 +@U17 = dso_local local_unnamed_addr global i16 0, align 4 +@U18 = dso_local local_unnamed_addr global i16 0, align 4 + +; Function Attrs: mustprogress noinline nounwind uwtable +define dso_local void @_Z5spillv() local_unnamed_addr #0 { +; CHECK-LABEL: _Z5spillv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movzwl D0(%rip), %eax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movzwl D1(%rip), %ecx +; CHECK-NEXT: movzwl D2(%rip), %edx +; CHECK-NEXT: movzwl D3(%rip), %esi +; CHECK-NEXT: movzwl D4(%rip), %edi +; CHECK-NEXT: movzwl D5(%rip), %r8d +; CHECK-NEXT: movzwl D6(%rip), %r9d +; CHECK-NEXT: movzwl D7(%rip), %r10d +; CHECK-NEXT: movzwl D8(%rip), %r11d +; CHECK-NEXT: movzwl D9(%rip), %ebx +; CHECK-NEXT: movzwl D10(%rip), %ebp +; CHECK-NEXT: movzwl D11(%rip), %r14d +; CHECK-NEXT: movzwl D12(%rip), %r15d +; CHECK-NEXT: movzwl D13(%rip), %r12d +; CHECK-NEXT: movzwl D14(%rip), %r13d +; CHECK-NEXT: movzwl D15(%rip), %eax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movzwl D16(%rip), %eax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: 
movzwl D17(%rip), %eax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: movzwl D18(%rip), %eax +; CHECK-NEXT: movw %ax, {{[-0-9]+}}(%r{{[sb]}}p) # 2-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, U0(%rip) +; CHECK-NEXT: movw %cx, U1(%rip) +; CHECK-NEXT: movw %dx, U2(%rip) +; CHECK-NEXT: movw %si, U3(%rip) +; CHECK-NEXT: movw %di, U4(%rip) +; CHECK-NEXT: movw %r8w, U5(%rip) +; CHECK-NEXT: movw %r9w, U6(%rip) +; CHECK-NEXT: movw %r10w, U7(%rip) +; CHECK-NEXT: movw %r11w, U8(%rip) +; CHECK-NEXT: movw %bx, U9(%rip) +; CHECK-NEXT: movw %bp, U10(%rip) +; CHECK-NEXT: movw %r14w, U11(%rip) +; CHECK-NEXT: movw %r15w, U12(%rip) +; CHECK-NEXT: movw %r12w, U13(%rip) +; CHECK-NEXT: movw %r13w, U14(%rip) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, U15(%rip) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, U16(%rip) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, U17(%rip) +; CHECK-NEXT: movzwl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 2-byte Folded Reload +; CHECK-NEXT: movw %ax, U18(%rip) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %0 = load i16, i16* @D0 + %1 = load i16, i16* @D1 + %2 = load i16, i16* @D2 + %3 = load i16, i16* @D3 + %4 = load i16, i16* @D4 + %5 = load i16, i16* @D5 + %6 = load i16, i16* @D6 + %7 = load i16, i16* @D7 + %8 = load i16, i16* @D8 + %9 = load i16, i16* @D9 + %10 = load i16, i16* @D10 + %11 = load i16, i16* 
@D11 + %12 = load i16, i16* @D12 + %13 = load i16, i16* @D13 + %14 = load i16, i16* @D14 + %15 = load i16, i16* @D15 + %16 = load i16, i16* @D16 + %17 = load i16, i16* @D17 + %18 = load i16, i16* @D18 + call void asm sideeffect "", "~{memory}"() #1 + store i16 %0, i16* @U0 + store i16 %1, i16* @U1 + store i16 %2, i16* @U2 + store i16 %3, i16* @U3 + store i16 %4, i16* @U4 + store i16 %5, i16* @U5 + store i16 %6, i16* @U6 + store i16 %7, i16* @U7 + store i16 %8, i16* @U8 + store i16 %9, i16* @U9 + store i16 %10, i16* @U10 + store i16 %11, i16* @U11 + store i16 %12, i16* @U12 + store i16 %13, i16* @U13 + store i16 %14, i16* @U14 + store i16 %15, i16* @U15 + store i16 %16, i16* @U16 + store i16 %17, i16* @U17 + store i16 %18, i16* @U18 + ret void +} + +attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_32bit.ll @@ -0,0 +1,178 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s + +; End-to-end check that Spill2Reg works with 32-bit registers. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@D0 = dso_local local_unnamed_addr global i32 0, align 4 +@D1 = dso_local local_unnamed_addr global i32 0, align 4 +@D2 = dso_local local_unnamed_addr global i32 0, align 4 +@D3 = dso_local local_unnamed_addr global i32 0, align 4 +@D4 = dso_local local_unnamed_addr global i32 0, align 4 +@D5 = dso_local local_unnamed_addr global i32 0, align 4 +@D6 = dso_local local_unnamed_addr global i32 0, align 4 +@D7 = dso_local local_unnamed_addr global i32 0, align 4 +@D8 = dso_local local_unnamed_addr global i32 0, align 4 +@D9 = dso_local local_unnamed_addr global i32 0, align 4 +@D10 = dso_local local_unnamed_addr global i32 0, align 4 +@D11 = dso_local local_unnamed_addr global i32 0, align 4 +@D12 = dso_local local_unnamed_addr global i32 0, align 4 +@D13 = dso_local local_unnamed_addr global i32 0, align 4 +@D14 = dso_local local_unnamed_addr global i32 0, align 4 +@D15 = dso_local local_unnamed_addr global i32 0, align 4 +@D16 = dso_local local_unnamed_addr global i32 0, align 4 +@D17 = dso_local local_unnamed_addr global i32 0, align 4 +@D18 = dso_local local_unnamed_addr global i32 0, align 4 +@U0 = dso_local local_unnamed_addr global i32 0, align 4 +@U1 = dso_local local_unnamed_addr global i32 0, align 4 +@U2 = dso_local local_unnamed_addr global i32 0, align 4 +@U3 = dso_local local_unnamed_addr global i32 0, align 4 +@U4 = dso_local local_unnamed_addr global i32 0, align 4 +@U5 = dso_local local_unnamed_addr global i32 0, align 4 +@U6 = dso_local local_unnamed_addr global i32 0, align 4 +@U7 = dso_local local_unnamed_addr global i32 0, align 4 +@U8 = dso_local local_unnamed_addr global i32 0, align 4 +@U9 = dso_local local_unnamed_addr global i32 0, align 4 +@U10 = dso_local local_unnamed_addr global i32 0, align 4 +@U11 = dso_local local_unnamed_addr global i32 0, align 4 +@U12 = dso_local local_unnamed_addr global i32 
0, align 4 +@U13 = dso_local local_unnamed_addr global i32 0, align 4 +@U14 = dso_local local_unnamed_addr global i32 0, align 4 +@U15 = dso_local local_unnamed_addr global i32 0, align 4 +@U16 = dso_local local_unnamed_addr global i32 0, align 4 +@U17 = dso_local local_unnamed_addr global i32 0, align 4 +@U18 = dso_local local_unnamed_addr global i32 0, align 4 + +; Function Attrs: mustprogress noinline nounwind uwtable +define dso_local void @_Z5spillv() local_unnamed_addr #0 { +; CHECK-LABEL: _Z5spillv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movl D0(%rip), %eax +; CHECK-NEXT: movd %eax, %xmm3 +; CHECK-NEXT: movl D1(%rip), %ecx +; CHECK-NEXT: movl D2(%rip), %edx +; CHECK-NEXT: movl D3(%rip), %esi +; CHECK-NEXT: movl D4(%rip), %edi +; CHECK-NEXT: movl D5(%rip), %r8d +; CHECK-NEXT: movl D6(%rip), %r9d +; CHECK-NEXT: movl D7(%rip), %r10d +; CHECK-NEXT: movl D8(%rip), %r11d +; CHECK-NEXT: movl D9(%rip), %ebx +; CHECK-NEXT: movl D10(%rip), %ebp +; CHECK-NEXT: movl D11(%rip), %r14d +; CHECK-NEXT: movl D12(%rip), %r15d +; CHECK-NEXT: movl D13(%rip), %r12d +; CHECK-NEXT: movl D14(%rip), %r13d +; CHECK-NEXT: movl D15(%rip), %eax +; CHECK-NEXT: movd %eax, %xmm0 +; CHECK-NEXT: movl D16(%rip), %eax +; CHECK-NEXT: movd %eax, %xmm1 +; CHECK-NEXT: movl D17(%rip), %eax +; CHECK-NEXT: movd %eax, %xmm4 +; CHECK-NEXT: movl D18(%rip), %eax +; CHECK-NEXT: movd %eax, %xmm2 +; 
CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movd %xmm3, %eax +; CHECK-NEXT: movl %eax, U0(%rip) +; CHECK-NEXT: movl %ecx, U1(%rip) +; CHECK-NEXT: movl %edx, U2(%rip) +; CHECK-NEXT: movl %esi, U3(%rip) +; CHECK-NEXT: movl %edi, U4(%rip) +; CHECK-NEXT: movl %r8d, U5(%rip) +; CHECK-NEXT: movl %r9d, U6(%rip) +; CHECK-NEXT: movl %r10d, U7(%rip) +; CHECK-NEXT: movl %r11d, U8(%rip) +; CHECK-NEXT: movl %ebx, U9(%rip) +; CHECK-NEXT: movl %ebp, U10(%rip) +; CHECK-NEXT: movl %r14d, U11(%rip) +; CHECK-NEXT: movl %r15d, U12(%rip) +; CHECK-NEXT: movl %r12d, U13(%rip) +; CHECK-NEXT: movl %r13d, U14(%rip) +; CHECK-NEXT: movd %xmm0, %eax +; CHECK-NEXT: movl %eax, U15(%rip) +; CHECK-NEXT: movd %xmm1, %eax +; CHECK-NEXT: movl %eax, U16(%rip) +; CHECK-NEXT: movd %xmm4, %eax +; CHECK-NEXT: movl %eax, U17(%rip) +; CHECK-NEXT: movd %xmm2, %eax +; CHECK-NEXT: movl %eax, U18(%rip) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %0 = load i32, i32* @D0 + %1 = load i32, i32* @D1 + %2 = load i32, i32* @D2 + %3 = load i32, i32* @D3 + %4 = load i32, i32* @D4 + %5 = load i32, i32* @D5 + %6 = load i32, i32* @D6 + %7 = load i32, i32* @D7 + %8 = load i32, i32* @D8 + %9 = load i32, i32* @D9 + %10 = load i32, i32* @D10 + %11 = load i32, i32* @D11 + %12 = load i32, i32* @D12 + %13 = load i32, i32* @D13 + %14 = load i32, i32* @D14 + %15 = load i32, i32* @D15 + %16 = load i32, i32* @D16 + %17 = load i32, i32* @D17 + %18 = load i32, i32* @D18 + call void asm sideeffect "", "~{memory}"() #1 + store i32 %0, i32* @U0 + store i32 %1, i32* @U1 + store i32 %2, i32* @U2 + store i32 %3, i32* @U3 + store i32 %4, i32* @U4 + store i32 %5, i32* @U5 + 
store i32 %6, i32* @U6 + store i32 %7, i32* @U7 + store i32 %8, i32* @U8 + store i32 %9, i32* @U9 + store i32 %10, i32* @U10 + store i32 %11, i32* @U11 + store i32 %12, i32* @U12 + store i32 %13, i32* @U13 + store i32 %14, i32* @U14 + store i32 %15, i32* @U15 + store i32 %16, i32* @U16 + store i32 %17, i32* @U17 + store i32 %18, i32* @U18 + ret void +} + +attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind } + diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_64bit.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s + +; End-to-end check that Spill2Reg works with 64-bit registers. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@D0 = dso_local local_unnamed_addr global i64 0, align 4 +@D1 = dso_local local_unnamed_addr global i64 0, align 4 +@D2 = dso_local local_unnamed_addr global i64 0, align 4 +@D3 = dso_local local_unnamed_addr global i64 0, align 4 +@D4 = dso_local local_unnamed_addr global i64 0, align 4 +@D5 = dso_local local_unnamed_addr global i64 0, align 4 +@D6 = dso_local local_unnamed_addr global i64 0, align 4 +@D7 = dso_local local_unnamed_addr global i64 0, align 4 +@D8 = dso_local local_unnamed_addr global i64 0, align 4 +@D9 = dso_local local_unnamed_addr global i64 0, align 4 +@D10 = dso_local local_unnamed_addr global i64 0, align 4 +@D11 = dso_local local_unnamed_addr global i64 0, align 4 +@D12 = dso_local local_unnamed_addr global i64 0, align 4 +@D13 = dso_local local_unnamed_addr global i64 0, align 4 +@D14 = dso_local local_unnamed_addr global i64 0, align 4 +@D15 = dso_local local_unnamed_addr global i64 0, align 4 +@D16 = dso_local local_unnamed_addr global i64 0, align 4 +@D17 = dso_local local_unnamed_addr global i64 0, align 4 +@D18 = dso_local local_unnamed_addr global i64 0, align 4 +@U0 = dso_local local_unnamed_addr global i64 0, align 4 +@U1 = dso_local local_unnamed_addr global i64 0, align 4 +@U2 = dso_local local_unnamed_addr global i64 0, align 4 +@U3 = dso_local local_unnamed_addr global i64 0, align 4 +@U4 = dso_local local_unnamed_addr global i64 0, align 4 +@U5 = dso_local local_unnamed_addr global i64 0, align 4 +@U6 = dso_local local_unnamed_addr global i64 0, align 4 +@U7 = dso_local local_unnamed_addr global i64 0, align 4 +@U8 = dso_local local_unnamed_addr global i64 0, align 4 +@U9 = dso_local local_unnamed_addr global i64 0, align 4 +@U10 = dso_local local_unnamed_addr global i64 0, align 4 +@U11 = dso_local local_unnamed_addr global i64 0, align 4 +@U12 = dso_local local_unnamed_addr global i64 
0, align 4 +@U13 = dso_local local_unnamed_addr global i64 0, align 4 +@U14 = dso_local local_unnamed_addr global i64 0, align 4 +@U15 = dso_local local_unnamed_addr global i64 0, align 4 +@U16 = dso_local local_unnamed_addr global i64 0, align 4 +@U17 = dso_local local_unnamed_addr global i64 0, align 4 +@U18 = dso_local local_unnamed_addr global i64 0, align 4 + +; Function Attrs: mustprogress noinline nounwind uwtable +define dso_local void @_Z5spillv() local_unnamed_addr #0 { +; CHECK-LABEL: _Z5spillv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movq D0(%rip), %rax +; CHECK-NEXT: movq %rax, %xmm3 +; CHECK-NEXT: movq D1(%rip), %rcx +; CHECK-NEXT: movq D2(%rip), %rdx +; CHECK-NEXT: movq D3(%rip), %rsi +; CHECK-NEXT: movq D4(%rip), %rdi +; CHECK-NEXT: movq D5(%rip), %r8 +; CHECK-NEXT: movq D6(%rip), %r9 +; CHECK-NEXT: movq D7(%rip), %r10 +; CHECK-NEXT: movq D8(%rip), %r11 +; CHECK-NEXT: movq D9(%rip), %rbx +; CHECK-NEXT: movq D10(%rip), %r14 +; CHECK-NEXT: movq D11(%rip), %r15 +; CHECK-NEXT: movq D12(%rip), %r12 +; CHECK-NEXT: movq D13(%rip), %r13 +; CHECK-NEXT: movq D14(%rip), %rbp +; CHECK-NEXT: movq D15(%rip), %rax +; CHECK-NEXT: movq %rax, %xmm0 +; CHECK-NEXT: movq D16(%rip), %rax +; CHECK-NEXT: movq %rax, %xmm1 +; CHECK-NEXT: movq D17(%rip), %rax +; CHECK-NEXT: movq %rax, %xmm4 +; CHECK-NEXT: movq D18(%rip), %rax +; CHECK-NEXT: movq %rax, %xmm2 +; CHECK-NEXT: #APP 
+; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq %xmm3, %rax +; CHECK-NEXT: movq %rax, U0(%rip) +; CHECK-NEXT: movq %rcx, U1(%rip) +; CHECK-NEXT: movq %rdx, U2(%rip) +; CHECK-NEXT: movq %rsi, U3(%rip) +; CHECK-NEXT: movq %rdi, U4(%rip) +; CHECK-NEXT: movq %r8, U5(%rip) +; CHECK-NEXT: movq %r9, U6(%rip) +; CHECK-NEXT: movq %r10, U7(%rip) +; CHECK-NEXT: movq %r11, U8(%rip) +; CHECK-NEXT: movq %rbx, U9(%rip) +; CHECK-NEXT: movq %r14, U10(%rip) +; CHECK-NEXT: movq %r15, U11(%rip) +; CHECK-NEXT: movq %r12, U12(%rip) +; CHECK-NEXT: movq %r13, U13(%rip) +; CHECK-NEXT: movq %rbp, U14(%rip) +; CHECK-NEXT: movq %xmm0, %rax +; CHECK-NEXT: movq %rax, U15(%rip) +; CHECK-NEXT: movq %xmm1, %rax +; CHECK-NEXT: movq %rax, U16(%rip) +; CHECK-NEXT: movq %xmm4, %rax +; CHECK-NEXT: movq %rax, U17(%rip) +; CHECK-NEXT: movq %xmm2, %rax +; CHECK-NEXT: movq %rax, U18(%rip) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %0 = load i64, i64* @D0 + %1 = load i64, i64* @D1 + %2 = load i64, i64* @D2 + %3 = load i64, i64* @D3 + %4 = load i64, i64* @D4 + %5 = load i64, i64* @D5 + %6 = load i64, i64* @D6 + %7 = load i64, i64* @D7 + %8 = load i64, i64* @D8 + %9 = load i64, i64* @D9 + %10 = load i64, i64* @D10 + %11 = load i64, i64* @D11 + %12 = load i64, i64* @D12 + %13 = load i64, i64* @D13 + %14 = load i64, i64* @D14 + %15 = load i64, i64* @D15 + %16 = load i64, i64* @D16 + %17 = load i64, i64* @D17 + %18 = load i64, i64* @D18 + call void asm sideeffect "", "~{memory}"() #1 + store i64 %0, i64* @U0 + store i64 %1, i64* @U1 + store i64 %2, i64* @U2 + store i64 %3, i64* @U3 + store i64 %4, i64* @U4 + store i64 %5, i64* @U5 + store i64 %6, i64* @U6 + 
store i64 %7, i64* @U7 + store i64 %8, i64* @U8 + store i64 %9, i64* @U9 + store i64 %10, i64* @U10 + store i64 %11, i64* @U11 + store i64 %12, i64* @U12 + store i64 %13, i64* @U13 + store i64 %14, i64* @U14 + store i64 %15, i64* @U15 + store i64 %16, i64* @U16 + store i64 %17, i64* @U17 + store i64 %18, i64* @U18 + ret void +} + +attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/spill2reg_end_to_end_8bit.ll @@ -0,0 +1,177 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 | FileCheck %s + +; End-to-end check that Spill2Reg works with 8-bit registers. 
+ +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@D0 = dso_local local_unnamed_addr global i8 0, align 4 +@D1 = dso_local local_unnamed_addr global i8 0, align 4 +@D2 = dso_local local_unnamed_addr global i8 0, align 4 +@D3 = dso_local local_unnamed_addr global i8 0, align 4 +@D4 = dso_local local_unnamed_addr global i8 0, align 4 +@D5 = dso_local local_unnamed_addr global i8 0, align 4 +@D6 = dso_local local_unnamed_addr global i8 0, align 4 +@D7 = dso_local local_unnamed_addr global i8 0, align 4 +@D8 = dso_local local_unnamed_addr global i8 0, align 4 +@D9 = dso_local local_unnamed_addr global i8 0, align 4 +@D10 = dso_local local_unnamed_addr global i8 0, align 4 +@D11 = dso_local local_unnamed_addr global i8 0, align 4 +@D12 = dso_local local_unnamed_addr global i8 0, align 4 +@D13 = dso_local local_unnamed_addr global i8 0, align 4 +@D14 = dso_local local_unnamed_addr global i8 0, align 4 +@D15 = dso_local local_unnamed_addr global i8 0, align 4 +@D16 = dso_local local_unnamed_addr global i8 0, align 4 +@D17 = dso_local local_unnamed_addr global i8 0, align 4 +@D18 = dso_local local_unnamed_addr global i8 0, align 4 +@U0 = dso_local local_unnamed_addr global i8 0, align 4 +@U1 = dso_local local_unnamed_addr global i8 0, align 4 +@U2 = dso_local local_unnamed_addr global i8 0, align 4 +@U3 = dso_local local_unnamed_addr global i8 0, align 4 +@U4 = dso_local local_unnamed_addr global i8 0, align 4 +@U5 = dso_local local_unnamed_addr global i8 0, align 4 +@U6 = dso_local local_unnamed_addr global i8 0, align 4 +@U7 = dso_local local_unnamed_addr global i8 0, align 4 +@U8 = dso_local local_unnamed_addr global i8 0, align 4 +@U9 = dso_local local_unnamed_addr global i8 0, align 4 +@U10 = dso_local local_unnamed_addr global i8 0, align 4 +@U11 = dso_local local_unnamed_addr global i8 0, align 4 +@U12 = dso_local local_unnamed_addr global i8 0, align 4 +@U13 = dso_local 
local_unnamed_addr global i8 0, align 4 +@U14 = dso_local local_unnamed_addr global i8 0, align 4 +@U15 = dso_local local_unnamed_addr global i8 0, align 4 +@U16 = dso_local local_unnamed_addr global i8 0, align 4 +@U17 = dso_local local_unnamed_addr global i8 0, align 4 +@U18 = dso_local local_unnamed_addr global i8 0, align 4 + +; Function Attrs: mustprogress noinline nounwind uwtable +define dso_local void @_Z5spillv() local_unnamed_addr #0 { +; CHECK-LABEL: _Z5spillv: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: pushq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: pushq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: pushq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: pushq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: pushq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 56 +; CHECK-NEXT: .cfi_offset %rbx, -56 +; CHECK-NEXT: .cfi_offset %r12, -48 +; CHECK-NEXT: .cfi_offset %r13, -40 +; CHECK-NEXT: .cfi_offset %r14, -32 +; CHECK-NEXT: .cfi_offset %r15, -24 +; CHECK-NEXT: .cfi_offset %rbp, -16 +; CHECK-NEXT: movb D0(%rip), %al +; CHECK-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb D1(%rip), %cl +; CHECK-NEXT: movb D2(%rip), %dl +; CHECK-NEXT: movb D3(%rip), %sil +; CHECK-NEXT: movb D4(%rip), %dil +; CHECK-NEXT: movb D5(%rip), %r8b +; CHECK-NEXT: movb D6(%rip), %r9b +; CHECK-NEXT: movb D7(%rip), %r10b +; CHECK-NEXT: movb D8(%rip), %r11b +; CHECK-NEXT: movb D9(%rip), %bl +; CHECK-NEXT: movb D10(%rip), %bpl +; CHECK-NEXT: movb D11(%rip), %r14b +; CHECK-NEXT: movb D12(%rip), %r15b +; CHECK-NEXT: movb D13(%rip), %r12b +; CHECK-NEXT: movb D14(%rip), %r13b +; CHECK-NEXT: movb D15(%rip), %al +; CHECK-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb D16(%rip), %al +; CHECK-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: movb D17(%rip), %al +; CHECK-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 
1-byte Spill +; CHECK-NEXT: movb D18(%rip), %al +; CHECK-NEXT: movb %al, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-NEXT: movb %al, U0(%rip) +; CHECK-NEXT: movb %cl, U1(%rip) +; CHECK-NEXT: movb %dl, U2(%rip) +; CHECK-NEXT: movb %sil, U3(%rip) +; CHECK-NEXT: movb %dil, U4(%rip) +; CHECK-NEXT: movb %r8b, U5(%rip) +; CHECK-NEXT: movb %r9b, U6(%rip) +; CHECK-NEXT: movb %r10b, U7(%rip) +; CHECK-NEXT: movb %r11b, U8(%rip) +; CHECK-NEXT: movb %bl, U9(%rip) +; CHECK-NEXT: movb %bpl, U10(%rip) +; CHECK-NEXT: movb %r14b, U11(%rip) +; CHECK-NEXT: movb %r15b, U12(%rip) +; CHECK-NEXT: movb %r12b, U13(%rip) +; CHECK-NEXT: movb %r13b, U14(%rip) +; CHECK-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-NEXT: movb %al, U15(%rip) +; CHECK-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-NEXT: movb %al, U16(%rip) +; CHECK-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-NEXT: movb %al, U17(%rip) +; CHECK-NEXT: movb {{[-0-9]+}}(%r{{[sb]}}p), %al # 1-byte Reload +; CHECK-NEXT: movb %al, U18(%rip) +; CHECK-NEXT: popq %rbx +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: popq %r12 +; CHECK-NEXT: .cfi_def_cfa_offset 40 +; CHECK-NEXT: popq %r13 +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: popq %r14 +; CHECK-NEXT: .cfi_def_cfa_offset 24 +; CHECK-NEXT: popq %r15 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: popq %rbp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +entry: + %0 = load i8, i8* @D0 + %1 = load i8, i8* @D1 + %2 = load i8, i8* @D2 + %3 = load i8, i8* @D3 + %4 = load i8, i8* @D4 + %5 = load i8, i8* @D5 + %6 = load i8, i8* @D6 + %7 = load i8, i8* @D7 + %8 = load i8, i8* @D8 + %9 = load i8, i8* @D9 + %10 = load i8, i8* @D10 + %11 = load i8, i8* @D11 + %12 = load i8, i8* @D12 + %13 = load i8, i8* @D13 + %14 = load i8, i8* @D14 + %15 = load i8, i8* @D15 + %16 = load i8, i8* @D16 + %17 = load 
i8, i8* @D17 + %18 = load i8, i8* @D18 + call void asm sideeffect "", "~{memory}"() #1 + store i8 %0, i8* @U0 + store i8 %1, i8* @U1 + store i8 %2, i8* @U2 + store i8 %3, i8* @U3 + store i8 %4, i8* @U4 + store i8 %5, i8* @U5 + store i8 %6, i8* @U6 + store i8 %7, i8* @U7 + store i8 %8, i8* @U8 + store i8 %9, i8* @U9 + store i8 %10, i8* @U10 + store i8 %11, i8* @U11 + store i8 %12, i8* @U12 + store i8 %13, i8* @U13 + store i8 %14, i8* @U14 + store i8 %15, i8* @U15 + store i8 %16, i8* @U16 + store i8 %17, i8* @U17 + store i8 %18, i8* @U18 + ret void +} + +attributes #0 = { mustprogress noinline nounwind uwtable "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="x86-64" "target-features"="+cx8,+fxsr,+mmx,+sse,+sse2,+x87" "tune-cpu"="generic" } +attributes #1 = { nounwind } diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir copy from llvm/test/CodeGen/X86/spill2reg_simple_2.mir copy to llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir --- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir +++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_all_live.mir @@ -1,11 +1,7 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s -# Simple test with two overlapping spill-reload pairs. -# spill stack.0 -# spill stack.1 -# reload stack.0 -# reload stack.1 +# Check that spill2reg is not applied when all xmm registers are live. 
--- | @D0 = dso_local local_unnamed_addr global i32 0, align 4 @@ -26,18 +22,21 @@ - { id: 1, type: spill-slot, size: 4, alignment: 4 } machineFunctionInfo: {} body: | - ; CHECK-LABEL: bb.0: - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) - ; CHECK-NEXT: RET 0 - + ; CHECK-LABEL: bb.0: + ; CHECK: liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15 + ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) + ; CHECK-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) + ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) + ; CHECK-NEXT: MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1) + ; CHECK-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + ; CHECK-NEXT: $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) + ; CHECK-NEXT: JMP_1 %bb.1 bb.0: + successors: %bb.1 + liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15 + $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable 
load (s32) from @D0) MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) @@ -47,6 +46,10 @@ MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) + JMP_1 %bb.1 + + bb.1: + liveins: $xmm0, $xmm1, $xmm2, $xmm3, $xmm4, $xmm5, $xmm6, $xmm7, $xmm8, $xmm9, $xmm10, $xmm11, $xmm12, $xmm13, $xmm14, $xmm15 RET 0 ... diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir copy from llvm/test/CodeGen/X86/spill2reg_simple_2.mir copy to llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir --- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir +++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_call.mir @@ -1,17 +1,15 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s -# Simple test with two overlapping spill-reload pairs. -# spill stack.0 -# spill stack.1 -# reload stack.0 -# reload stack.1 +# Check that reg liveness works correctly through calls --- | @D0 = dso_local local_unnamed_addr global i32 0, align 4 @D1 = dso_local local_unnamed_addr global i32 0, align 4 @U0 = dso_local local_unnamed_addr global i32 0, align 4 @U1 = dso_local local_unnamed_addr global i32 0, align 4 + @Cond = dso_local local_unnamed_addr global i32 0, align 4 + declare void @foo() define void @func() { ret void } ... 
--- @@ -28,25 +26,21 @@ body: | ; CHECK-LABEL: bb.0: ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 + ; CHECK-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) + ; CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp + ; CHECK-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) ; CHECK-NEXT: RET 0 bb.0: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) - $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - MOV32mr %stack.1, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.1) + + ; The call may touch all xmm regs, so disable spill2reg across it + CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) - MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) RET 0 ... 
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir copy from llvm/test/CodeGen/X86/spill2reg_simple_2.mir copy to llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir --- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir +++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_defined_in_bb.mir @@ -1,11 +1,9 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s -# Simple test with two overlapping spill-reload pairs. -# spill stack.0 -# spill stack.1 -# reload stack.0 -# reload stack.1 +# Check that spill2reg won't use $xmm0 if it is defined by some other instr +# in the BB and is live. Instead it should use $xmm1 and $xmm2. + --- | @D0 = dso_local local_unnamed_addr global i32 0, align 4 @@ -26,18 +24,23 @@ - { id: 1, type: spill-slot, size: 4, alignment: 4 } machineFunctionInfo: {} body: | - ; CHECK-LABEL: bb.0: - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) - ; CHECK-NEXT: RET 0 - + ; CHECK-LABEL: bb.0: + ; CHECK: liveins: $eax + ; CHECK: $xmm0 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) + ; CHECK-NEXT: $xmm1 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable 
load (s32) from @D1) + ; CHECK-NEXT: $xmm2 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm1 + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, renamable $eax :: (store (s32) into @U0) + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm2 + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, renamable $eax :: (store (s32) into @U1) + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 + ; CHECK-NEXT: RET $eax bb.0: + liveins: $eax + $xmm0 = MOVDI2PDIrr $eax + $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) @@ -47,6 +50,7 @@ MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) - RET 0 + $eax = MOVPDI2DIrr $xmm0 + RET $eax ... diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir copy from llvm/test/CodeGen/X86/spill2reg_simple_2.mir copy to llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir --- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir +++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_livein.mir @@ -1,11 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py # RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s -# Simple test with two overlapping spill-reload pairs. -# spill stack.0 -# spill stack.1 -# reload stack.0 -# reload stack.1 +# Check that spill2reg doesn't use $xmm0 and $xmm2 if they are livein. +# Instead it should use $xmm1 and $xmm3. 
--- | @D0 = dso_local local_unnamed_addr global i32 0, align 4 @@ -26,18 +23,22 @@ - { id: 1, type: spill-slot, size: 4, alignment: 4 } machineFunctionInfo: {} body: | - ; CHECK-LABEL: bb.0: - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 - ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) - ; CHECK-NEXT: RET 0 - + ; CHECK-LABEL: name: func + ; CHECK: bb.0: + ; CHECK: liveins: $xmm0, $xmm2 + ; CHECK: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) + ; CHECK-NEXT: $xmm1 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) + ; CHECK-NEXT: $xmm3 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm1 + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm3 + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) + ; CHECK-NEXT: JMP_1 %bb.1 bb.0: + successors: %bb.1 + liveins: $xmm0, $xmm2 + $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) @@ -47,6 +48,10 @@ MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) $eax = MOV32rm %stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %stack.1) MOV32mr $rip, 1, $noreg, @U1, $noreg, killed 
renamable $eax :: (store (s32) into @U1) + JMP_1 %bb.1 + + bb.1: + liveins: $xmm0, $xmm2 RET 0 ... diff --git a/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir b/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/spill2reg_liveregs_reload_mbb_and_intermediate.mir @@ -0,0 +1,71 @@ +# RUN: llc %s -o - -mtriple=x86_64-unknown-linux -enable-spill2reg -mattr=+sse4.1 --run-pass=spill2reg -simplify-mir -spill2reg-mem-instrs=0 -spill2reg-vec-instrs=99999 | FileCheck %s + +# Check that the traversal works correctly when an MBB (like BB1) contains +# a reload (reload1), but it is also an intermediate block in the path from +# another reload (reload2) to a spill (spill1). +# In the following example we need to make sure that we don't skip the +# instructions of BB1 below reload1 during the bottom-up traversal from +# reload2 to the spill. + +# BB0: +# [stack.0] = ... ; spill +# BB1: +# ... = [stack.0] ; reload1 +# call ; clobbers xmm regs +# BB2: +# ... = [stack.0] ; reload2 + +--- | + @D0 = dso_local local_unnamed_addr global i32 0, align 4 + @U0 = dso_local local_unnamed_addr global i32 0, align 4 + declare void @foo() + define void @func() { ret void } +...
+--- +name: func +alignment: 16 +tracksRegLiveness: true +tracksDebugUserValues: true +frameInfo: + maxAlignment: 4 +stack: + - { id: 0, type: spill-slot, size: 4, alignment: 4 } +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: bb.0: + ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) + ; CHECK-NEXT: MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) + + ; CHECK-LABEL: bb.1: + ; CHECK-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + ; CHECK-NEXT: CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp + ; CHECK-NEXT: JMP_1 %bb.2 + + ; CHECK-LABEL: bb.2: + ; CHECK-NEXT: $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + ; CHECK-NEXT: RET 0 + + bb.0: + successors: %bb.1 + $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) + ; spill + MOV32mr %stack.0, 1, $noreg, 0, $noreg, killed renamable $eax :: (store (s32) into %stack.0) + + bb.1: + successors: %bb.2 + ; reload1 + $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + + ; The call clobbers all xmm regs + CALL64pcrel32 @foo, csr_64, implicit $rsp, implicit-def $rsp + JMP_1 %bb.2 + + bb.2: + ; reload2 + $eax = MOV32rm %stack.0, 1, $noreg, 0, $noreg :: (load (s32) from %stack.0) + MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) + RET 0 +... 
diff --git a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir --- a/llvm/test/CodeGen/X86/spill2reg_simple_2.mir +++ b/llvm/test/CodeGen/X86/spill2reg_simple_2.mir @@ -30,10 +30,10 @@ ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D0, $noreg :: (dereferenceable load (s32) from @D0) ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax ; CHECK-NEXT: $eax = MOV32rm $rip, 1, $noreg, @D1, $noreg :: (dereferenceable load (s32) from @D1) - ; CHECK-NEXT: $xmm0 = MOVDI2PDIrr $eax + ; CHECK-NEXT: $xmm1 = MOVDI2PDIrr $eax ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U0, $noreg, killed renamable $eax :: (store (s32) into @U0) - ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm0 + ; CHECK-NEXT: $eax = MOVPDI2DIrr $xmm1 ; CHECK-NEXT: MOV32mr $rip, 1, $noreg, @U1, $noreg, killed renamable $eax :: (store (s32) into @U1) ; CHECK-NEXT: RET 0