Index: include/llvm/CodeGen/CommandFlags.h =================================================================== --- include/llvm/CodeGen/CommandFlags.h +++ include/llvm/CodeGen/CommandFlags.h @@ -177,6 +177,11 @@ cl::desc("Never emit tail calls"), cl::init(false)); +cl::opt +StackSymbolOrdering("stack-symbol-ordering", + cl::desc("Order local stack symbols."), + cl::init(true)); + cl::opt OverrideStackAlignment("stack-alignment", cl::desc("Override default stack alignment"), @@ -273,6 +278,7 @@ Options.NoZerosInBSS = DontPlaceZerosInBSS; Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt; Options.StackAlignmentOverride = OverrideStackAlignment; + Options.StackSymbolOrdering = StackSymbolOrdering; Options.PositionIndependentExecutable = EnablePIE; Options.UseInitArray = !UseCtors; Options.DataSections = DataSections; Index: include/llvm/Target/TargetFrameLowering.h =================================================================== --- include/llvm/Target/TargetFrameLowering.h +++ include/llvm/Target/TargetFrameLowering.h @@ -311,6 +311,17 @@ virtual bool canUseAsEpilogue(const MachineBasicBlock &MBB) const { return true; } + + /// Order the symbols in the local stack frame. + /// The list of objects that we want to order is in \p objectsToAllocate as + /// indices into the MachineFrameInfo. The array can be reordered in any way + /// upon return. The contents of the array, however, may not be modified (i.e. + /// only their order may be changed). + /// By default, just maintain the original order. 
+ virtual void orderFrameObjects(const MachineFunction &MF, + SmallVectorImpl &objectsToAllocate) const { + return; + } }; } // End llvm namespace Index: include/llvm/Target/TargetOptions.h =================================================================== --- include/llvm/Target/TargetOptions.h +++ include/llvm/Target/TargetOptions.h @@ -73,6 +73,7 @@ UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false), HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false), GuaranteedTailCallOpt(false), StackAlignmentOverride(0), + StackSymbolOrdering(true), EnableFastISel(false), PositionIndependentExecutable(false), UseInitArray(false), DisableIntegratedAS(false), CompressDebugSections(false), FunctionSections(false), @@ -142,6 +143,12 @@ /// as their parent function, etc.), using an alternate ABI if necessary. unsigned GuaranteedTailCallOpt : 1; + /// StackSymbolOrdering - When true, this will allow CodeGen to order + /// the local stack symbols (for code size, code locality, or any other + /// heuristics). When false, the local symbols are left in whatever order + /// they were generated. Default is true. + unsigned StackSymbolOrdering : 1; + /// StackAlignmentOverride - Override default stack alignment for target. unsigned StackAlignmentOverride; Index: lib/CodeGen/PrologEpilogInserter.cpp =================================================================== --- lib/CodeGen/PrologEpilogInserter.cpp +++ lib/CodeGen/PrologEpilogInserter.cpp @@ -705,8 +705,10 @@ Offset, MaxAlign, Skew); } - // Then assign frame offsets to stack objects that are not used to spill - // callee saved registers. + SmallVector ObjectsToAllocate; + + // Then prepare to assign frame offsets to stack objects that are not used to + // spill callee saved registers. 
for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) { if (MFI->isObjectPreAllocated(i) && MFI->getUseLocalStackAllocationBlock()) @@ -722,8 +724,17 @@ if (ProtectedObjs.count(i)) continue; - AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew); + // Add the objects that we need to allocate to our working set. + ObjectsToAllocate.push_back(i); } + // Give the targets a chance to order the objects the way they like it. + if (Fn.getTarget().getOptLevel() != CodeGenOpt::None && + Fn.getTarget().Options.StackSymbolOrdering) + TFI.orderFrameObjects(Fn, ObjectsToAllocate); + + // Now walk the objects and actually assign base offsets to them. + for (auto &Object : ObjectsToAllocate) + AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew); // Make sure the special register scavenging spill slot is closest to the // stack pointer. Index: lib/Target/X86/X86FrameLowering.h =================================================================== --- lib/Target/X86/X86FrameLowering.h +++ lib/Target/X86/X86FrameLowering.h @@ -134,6 +134,13 @@ /// \p MBB will be correctly handled by the target. bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override; + /// Order the symbols in the local stack. + /// We want to place the local stack objects in some sort of sensible order. + /// The heuristic we use is to try and pack them according to static number + /// of uses and size in order to minimize code size. + void orderFrameObjects(const MachineFunction &MF, + SmallVectorImpl &objectsToAllocate) const override; + /// convertArgMovsToPushes - This method tries to convert a call sequence /// that uses sub and mov instructions to put the argument onto the stack /// into a series of pushes. 
Index: lib/Target/X86/X86FrameLowering.cpp =================================================================== --- lib/Target/X86/X86FrameLowering.cpp +++ lib/Target/X86/X86FrameLowering.cpp @@ -2657,6 +2657,148 @@ return MBBI; } +namespace { + // Struct used by orderFrameObjects to help sort the stack objects. + struct X86FrameSortingObject { + bool IsValid{false}; // true if we care about this Object. + unsigned ObjectIndex{0}; // Index of Object into MFI list. + unsigned ObjectSize{0}; // Size of Object in bytes. + unsigned ObjectAlignment{1}; // Alignment of Object in bytes. + unsigned ObjectNumUses{0}; // Object static number of uses. + }; + + // The comparison function we use for std::sort to order our local + // stack symbols. The current algorithm is to use an estimated + // "density". This takes into consideration the size and number of + // uses each object has in order to roughly minimize code size. + // So, for example, an object of size 16B that is referenced 5 times + // will get higher priority than 4 4B objects referenced 1 time each. + // It's not perfect and we may be able to squeeze a few more bytes out of + // it (for example : 0(esp) requires fewer bytes, symbols allocated at the + // fringe end can have special consideration, given their size is less + // important, etc.), but the algorithmic complexity grows too much to be + // worth the extra gains we get. This gets us pretty close. + // The final order leaves us with objects with highest priority going + // at the end of our list. + struct X86FrameSortingComparator { + inline bool operator() (const X86FrameSortingObject& a, + const X86FrameSortingObject& b) + { + int64_t DensityAScaled, DensityBScaled; + + // For consistency in our comparison, all invalid objects are placed + // at the end. This also allows us to stop walking when we hit the + // first invalid item after it's all sorted. 
+ if (!a.IsValid) + return false; + if (!b.IsValid) + return true; + + // The density is calculated by doing : + // (double)DensityA = a.ObjectNumUses / a.ObjectSize + // (double)DensityB = b.ObjectNumUses / b.ObjectSize + // Since this approach may cause inconsistencies in + // the floating point <, >, == comparisons, depending on the floating + // point model with which the compiler was built, we're going + // to scale both sides by multiplying with + // a.ObjectSize * b.ObjectSize. This ends up factoring away + // the division and, with it, the need for any floating point + // arithmetic. + DensityAScaled = static_cast(a.ObjectNumUses) * + static_cast(b.ObjectSize); + DensityBScaled = static_cast(b.ObjectNumUses) * + static_cast(a.ObjectSize); + + // If the two densities are equal, prioritize highest alignment + // objects. This allows for similar alignment objects + // to be packed together (given the same density). + // There's room for improvement here, also, since we can pack + // similar alignment (different density) objects next to each + // other to save padding. This will also require further + // complexity/iterations, and the overall gain isn't worth it, + // in general. Something to keep in mind, though. + if (DensityAScaled == DensityBScaled) + return a.ObjectAlignment < b.ObjectAlignment; + + return DensityAScaled < DensityBScaled; + } + }; +} + +// Order the symbols in the local stack. +// We want to place the local stack objects in some sort of sensible order. +// The heuristic we use is to try and pack them according to static number +// of uses and size of object in order to minimize code size. +void X86FrameLowering::orderFrameObjects( + const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const { + const MachineFrameInfo *MFI = MF.getFrameInfo(); + + // Don't waste time if there's nothing to do. + if (ObjectsToAllocate.empty()) + return; + + // Create an array of all MFI objects. 
We won't need all of these + // objects, but we're going to create a full array of them to make + // it easier to index into when we're counting "uses" down below. + // We want to be able to easily/cheaply access an object by simply + // indexing into it, instead of having to search for it every time. + std::vector SortingObjects(MFI->getObjectIndexEnd()); + + // Walk the objects we care about and mark them as such in our working + // struct. + for (auto &Obj : ObjectsToAllocate) { + SortingObjects[Obj].IsValid = true; + SortingObjects[Obj].ObjectIndex = Obj; + SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj); + // Set the size. + int ObjectSize = MFI->getObjectSize(Obj); + if (ObjectSize == 0) + // Variable size. Just use 4. + SortingObjects[Obj].ObjectSize = 4; + else + SortingObjects[Obj].ObjectSize = ObjectSize; + } + + // Count the number of uses for each object. + for (auto &MBB : MF) { + for (auto &MI : MBB) { + for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) { + // Check to see if it's a local stack symbol. + if (!MI.getOperand(i).isFI()) + continue; + int Index = MI.getOperand(i).getIndex(); + // Check to see if it falls within our range, and is tagged + // to require ordering. + if (Index >= 0 && Index < MFI->getObjectIndexEnd() && + SortingObjects[Index].IsValid) + SortingObjects[Index].ObjectNumUses++; + } + } + } + + // Sort the objects using X86FrameSortingComparator (see its comment for + // info). + std::stable_sort(SortingObjects.begin(), SortingObjects.end(), + X86FrameSortingComparator()); + + // Now modify the original list to represent the final order that + // we want. The order will depend on whether we're going to access them + // from the stack pointer or the frame pointer. For SP, the list should + // end up with the END containing objects that we want with smaller offsets. + // For FP, it should be flipped.
+ int i = 0; + for (auto &Obj : SortingObjects) { + // All invalid items are sorted at the end, so it's safe to stop. + if (!Obj.IsValid) + break; + ObjectsToAllocate[i++] = Obj.ObjectIndex; + } + + // Flip it if we're accessing off of the FP. + if (!TRI->needsStackRealignment(MF) && hasFP(MF)) + std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end()); +} + unsigned X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const { // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue. unsigned Offset = 16; Index: test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll =================================================================== --- test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll +++ test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll @@ -1,5 +1,5 @@ ; RUN: llc < %s -; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -verify-machineinstrs | FileCheck %s ; PR3538 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128" target triple = "i386-apple-darwin9" Index: test/CodeGen/X86/aligned-variadic.ll =================================================================== --- test/CodeGen/X86/aligned-variadic.ll +++ test/CodeGen/X86/aligned-variadic.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64 -; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32 +; RUN: llc < %s -mtriple=x86_64-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X32 %struct.Baz = type { [17 x i8] } %struct.__va_list_tag = type { i32, i32, i8*, i8* } Index: test/CodeGen/X86/cleanuppad-realign.ll =================================================================== --- test/CodeGen/X86/cleanuppad-realign.ll +++ 
test/CodeGen/X86/cleanuppad-realign.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s -; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s +; RUN: llc -mtriple=i686-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X86 %s +; RUN: llc -mtriple=x86_64-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X64 %s declare i32 @__CxxFrameHandler3(...) declare void @Dtor(i64* %o) Index: test/CodeGen/X86/dynamic-allocas-VLAs.ll =================================================================== --- test/CodeGen/X86/dynamic-allocas-VLAs.ll +++ test/CodeGen/X86/dynamic-allocas-VLAs.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -; RUN: llc < %s -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN +; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN ; rdar://11496434 ; no VLAs or dynamic alignment Index: test/CodeGen/X86/hipe-cc.ll =================================================================== --- test/CodeGen/X86/hipe-cc.ll +++ test/CodeGen/X86/hipe-cc.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s ; Check the HiPE calling convention works (x86-32) Index: test/CodeGen/X86/hipe-cc64.ll =================================================================== --- test/CodeGen/X86/hipe-cc64.ll +++ 
test/CodeGen/X86/hipe-cc64.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s ; Check the HiPE calling convention works (x86-64) Index: test/CodeGen/X86/local-stack-symbol-ordering.ll =================================================================== --- test/CodeGen/X86/local-stack-symbol-ordering.ll +++ test/CodeGen/X86/local-stack-symbol-ordering.ll @@ -0,0 +1,184 @@ +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64 +; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X32 + +; CHECK-LABEL: foo + +; Check the functionality of the local stack symbol table ordering +; heuristics. +; The test has a bunch of locals of various sizes that are referenced a +; different number of times. +; +; a : 120B, 9 uses, density = 0.075 +; aa : 4000B, 1 use, density = 0.00025 +; b : 4B, 1 use, density = 0.25 +; cc : 4000B, 2 uses, density = 0.0005 +; d : 4B, 2 uses, density = 0.5 +; e : 4B, 3 uses, density = 0.75 +; f : 4B, 4 uses, density = 1 +; +; Given the size, number of uses and calculated density (uses / size), we're +; going to hope that f gets allocated closest to the stack pointer, +; followed by e, d, b, then a (to check for just a few). +; We use gnu-inline asm between calls to prevent registerization of addresses +; so that we get exact counts. +; +; The test is taken from something like this: +; void foo() +; { +; int f; // 4 uses. 4 / 4 = 1 +; int a[30]; // 9 uses. 9 / 120 = 0.075 +; int aa[1000]; // 1 use. 1 / 4000 = 0.00025 +; int e; // 3 uses. 3 / 4 = 0.75 +; int cc[1000]; // 2 uses. 2 / 4000 = 0.0005 +; int b; // 1 use. 1 / 4 = 0.25 +; int d; // 2 uses. 2 / 4 = 0.5 +; int aaa[1000]; // 2 uses.
2 / 4000 +; +; +; check_a(&a); +; bar1(&aaa); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar1(&a); +; check_f(&f); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar1(&a); +; bar3(&aa, &aaa, &cc); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar2(&a,&cc); +; check_b(&b); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar1(&a); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar2(&a, &f); +; check_e(&e); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar1(&a); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar2(&e, &f); +; check_d(&d); +; bar1(&a); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar3(&d, &e, &f); +; asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp"); +; bar1(&a); +; } +; +; X64: leaq 16(%rsp), %rdi +; X64: callq check_a +; X64: callq bar1 +; X64: callq bar1 +; X64: leaq (%rsp), %rdi +; X64: callq check_f +; X64: callq bar1 +; X64: callq bar3 +; X64: callq bar2 +; X64: leaq 12(%rsp), %rdi +; X64: callq check_b +; X64: callq bar1 +; X64: callq bar2 +; X64: leaq 4(%rsp), %rdi +; X64: callq check_e +; X64: callq bar1 +; X64: callq bar2 +; X64: leaq 8(%rsp), %rdi +; X64: callq check_d + +; X32: leal 32(%esp) +; X32: calll check_a +; X32: calll bar1 +; X32: calll bar1 +; X32: leal 16(%esp) +; X32: calll check_f +; X32: calll bar1 +; X32: calll bar3 +; X32: calll bar2 +; X32: leal 28(%esp) +; X32: calll check_b +; X32: calll bar1 +; X32: calll bar2 +; X32: leal 20(%esp) +; X32: calll check_e +; X32: calll bar1 +; X32: calll bar2 +; X32: leal 24(%esp) +; X32: calll check_d + + +define void @foo() nounwind uwtable { +entry: + %f = alloca i32, align 4 + %a = alloca [30 x i32], align 16 + %aa = alloca [1000 x i32], align 16 + %e = alloca i32, align 4 + %cc = alloca [1000 x i32], align 16 + %b 
= alloca i32, align 4 + %d = alloca i32, align 4 + %aaa = alloca [1000 x i32], align 16 + %0 = bitcast i32* %f to i8* + call void @llvm.lifetime.start(i64 4, i8* %0) #1 + %1 = bitcast [30 x i32]* %a to i8* + call void @llvm.lifetime.start(i64 120, i8* %1) #1 + %2 = bitcast [1000 x i32]* %aa to i8* + call void @llvm.lifetime.start(i64 4000, i8* %2) #1 + %3 = bitcast i32* %e to i8* + call void @llvm.lifetime.start(i64 4, i8* %3) #1 + %4 = bitcast [1000 x i32]* %cc to i8* + call void @llvm.lifetime.start(i64 4000, i8* %4) #1 + %5 = bitcast i32* %b to i8* + call void @llvm.lifetime.start(i64 4, i8* %5) #1 + %6 = bitcast i32* %d to i8* + call void @llvm.lifetime.start(i64 4, i8* %6) #1 + %7 = bitcast [1000 x i32]* %aaa to i8* + call void @llvm.lifetime.start(i64 4000, i8* %7) #1 + %call = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @check_a to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + %call1 = call i32 ([1000 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([1000 x i32]*, ...)*)([1000 x i32]* %aaa) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call2 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + %call3 = call i32 (i32*, ...) bitcast (i32 (...)* @check_f to i32 (i32*, ...)*)(i32* %f) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call4 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + %call5 = call i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar3 to i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...)*)([1000 x i32]* %aa, [1000 x i32]* %aaa, [1000 x i32]* %cc) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call6 = call i32 ([30 x i32]*, [1000 x i32]*, ...) 
bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, [1000 x i32]*, ...)*)([30 x i32]* %a, [1000 x i32]* %cc) + %call7 = call i32 (i32*, ...) bitcast (i32 (...)* @check_b to i32 (i32*, ...)*)(i32* %b) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call8 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call9 = call i32 ([30 x i32]*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, i32*, ...)*)([30 x i32]* %a, i32* %f) + %call10 = call i32 (i32*, ...) bitcast (i32 (...)* @check_e to i32 (i32*, ...)*)(i32* %e) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call11 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call12 = call i32 (i32*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 (i32*, i32*, ...)*)(i32* %e, i32* %f) + %call13 = call i32 (i32*, ...) bitcast (i32 (...)* @check_d to i32 (i32*, ...)*)(i32* %d) + %call14 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call15 = call i32 (i32*, i32*, i32*, ...) bitcast (i32 (...)* @bar3 to i32 (i32*, i32*, i32*, ...)*)(i32* %d, i32* %e, i32* %f) + call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1 + %call16 = call i32 ([30 x i32]*, ...) 
bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a) + call void @llvm.lifetime.end(i64 4000, i8* %7) #1 + call void @llvm.lifetime.end(i64 4, i8* %6) #1 + call void @llvm.lifetime.end(i64 4, i8* %5) #1 + call void @llvm.lifetime.end(i64 4000, i8* %4) #1 + call void @llvm.lifetime.end(i64 4, i8* %3) #1 + call void @llvm.lifetime.end(i64 4000, i8* %2) #1 + call void @llvm.lifetime.end(i64 120, i8* %1) #1 + call void @llvm.lifetime.end(i64 4, i8* %0) #1 + ret void +} + +; Function Attrs: nounwind +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare i32 @check_a(...) #2 +declare i32 @bar1(...) #2 +declare i32 @check_f(...) #2 +declare i32 @bar3(...) #2 +declare i32 @bar2(...) #2 +declare i32 @check_b(...) #2 +declare i32 @check_e(...) #2 +declare i32 @check_d(...) #2 + +; Function Attrs: nounwind +declare void @llvm.lifetime.end(i64, i8* nocapture) #1 + Index: test/CodeGen/X86/phys-reg-local-regalloc.ll =================================================================== --- test/CodeGen/X86/phys-reg-local-regalloc.ll +++ test/CodeGen/X86/phys-reg-local-regalloc.ll @@ -1,6 +1,6 @@ -; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s -; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s -; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s +; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s +; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s ; CHECKed instructions should be the same with or without -O0 except on 
Intel Atom due to instruction scheduling. @.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1] Index: test/CodeGen/X86/seh-catch-all-win32.ll =================================================================== --- test/CodeGen/X86/seh-catch-all-win32.ll +++ test/CodeGen/X86/seh-catch-all-win32.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s +; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s ; 32-bit catch-all has to use a filter function because that's how it saves the ; exception code. Index: test/CodeGen/X86/seh-stack-realign.ll =================================================================== --- test/CodeGen/X86/seh-stack-realign.ll +++ test/CodeGen/X86/seh-stack-realign.ll @@ -1,4 +1,4 @@ -; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s +; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s ; 32-bit catch-all has to use a filter function because that's how it saves the ; exception code. Index: test/CodeGen/X86/ssp-data-layout.ll =================================================================== --- test/CodeGen/X86/ssp-data-layout.ll +++ test/CodeGen/X86/ssp-data-layout.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s ; This test is fairly fragile. The goal is to ensure that "large" stack ; objects are allocated closest to the stack protector (i.e., farthest away ; from the Stack Pointer.) 
In standard SSP mode this means that large (>= Index: test/CodeGen/X86/statepoint-stack-usage.ll =================================================================== --- test/CodeGen/X86/statepoint-stack-usage.ll +++ test/CodeGen/X86/statepoint-stack-usage.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-pc-linux-gnu" Index: test/CodeGen/X86/statepoint-stackmap-format.ll =================================================================== --- test/CodeGen/X86/statepoint-stackmap-format.ll +++ test/CodeGen/X86/statepoint-stackmap-format.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s -; RUN: llc < %s -mtriple="x86_64-pc-win64-coff" | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s +; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-win64-coff" | FileCheck %s ; This test is a sanity check to ensure statepoints are generating StackMap ; sections correctly. 
This is not intended to be a rigorous test of the Index: test/CodeGen/X86/stdarg.ll =================================================================== --- test/CodeGen/X86/stdarg.ll +++ test/CodeGen/X86/stdarg.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s +; RUN: llc -stack-symbol-ordering=0 < %s -mtriple=x86_64-linux | FileCheck %s %struct.__va_list_tag = type { i32, i32, i8*, i8* } Index: test/CodeGen/X86/widen_load-1.ll =================================================================== --- test/CodeGen/X86/widen_load-1.ll +++ test/CodeGen/X86/widen_load-1.ll @@ -1,5 +1,5 @@ -; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE -; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX +; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE +; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX ; PR4891 ; PR5626 Index: test/CodeGen/X86/win-catchpad-varargs.ll =================================================================== --- test/CodeGen/X86/win-catchpad-varargs.ll +++ test/CodeGen/X86/win-catchpad-varargs.ll @@ -1,5 +1,5 @@ -; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64 -; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86 +; RUN: llc -stack-symbol-ordering=0 -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64 +; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86 declare void @llvm.va_start(i8*) declare void @llvm.va_end(i8*) Index: test/CodeGen/X86/win-catchpad.ll =================================================================== --- test/CodeGen/X86/win-catchpad.ll +++ test/CodeGen/X86/win-catchpad.ll @@ -1,5 +1,5 @@ -; RUN: llc 
-verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s -; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s +; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s +; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s ; Loosely based on IR for this C++ source code: ; void f(int p); Index: test/CodeGen/X86/win32-seh-catchpad-realign.ll =================================================================== --- test/CodeGen/X86/win32-seh-catchpad-realign.ll +++ test/CodeGen/X86/win32-seh-catchpad-realign.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s | FileCheck %s +; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s ; The aligned alloca means that we have to realign the stack, which forces the ; use of ESI to address local variables.