Index: llvm/trunk/include/llvm/CodeGen/CommandFlags.h
===================================================================
--- llvm/trunk/include/llvm/CodeGen/CommandFlags.h
+++ llvm/trunk/include/llvm/CodeGen/CommandFlags.h
@@ -177,6 +177,11 @@
                  cl::desc("Never emit tail calls"),
                  cl::init(false));
 
+cl::opt<bool>
+StackSymbolOrdering("stack-symbol-ordering",
+                    cl::desc("Order local stack symbols."),
+                    cl::init(true));
+
 cl::opt<unsigned>
 OverrideStackAlignment("stack-alignment",
                        cl::desc("Override default stack alignment"),
@@ -284,6 +289,7 @@
   Options.NoZerosInBSS = DontPlaceZerosInBSS;
   Options.GuaranteedTailCallOpt = EnableGuaranteedTailCallOpt;
   Options.StackAlignmentOverride = OverrideStackAlignment;
+  Options.StackSymbolOrdering = StackSymbolOrdering;
   Options.PositionIndependentExecutable = EnablePIE;
   Options.UseInitArray = !UseCtors;
   Options.DataSections = DataSections;
Index: llvm/trunk/include/llvm/Target/TargetFrameLowering.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetFrameLowering.h
+++ llvm/trunk/include/llvm/Target/TargetFrameLowering.h
@@ -288,6 +288,18 @@
              "target!");
   }
 
+  /// Order the symbols in the local stack frame.
+  /// The list of objects that we want to order is in \p objectsToAllocate as
+  /// indices into the MachineFrameInfo. The array can be reordered in any way
+  /// upon return. The contents of the array, however, may not be modified
+  /// (i.e. only their order may be changed).
+  /// By default, just maintain the original order.
+  virtual void
+  orderFrameObjects(const MachineFunction &MF,
+                    SmallVectorImpl<int> &objectsToAllocate) const {
+  }
+
   /// Check whether or not the given \p MBB can be used as a prologue
   /// for the target.
   /// The prologue will be inserted first in this basic block.
Index: llvm/trunk/include/llvm/Target/TargetOptions.h
===================================================================
--- llvm/trunk/include/llvm/Target/TargetOptions.h
+++ llvm/trunk/include/llvm/Target/TargetOptions.h
@@ -97,6 +97,7 @@
           UnsafeFPMath(false), NoInfsFPMath(false), NoNaNsFPMath(false),
           HonorSignDependentRoundingFPMathOption(false), NoZerosInBSS(false),
           GuaranteedTailCallOpt(false), StackAlignmentOverride(0),
+          StackSymbolOrdering(true),
           EnableFastISel(false), PositionIndependentExecutable(false),
           UseInitArray(false), DisableIntegratedAS(false),
           CompressDebugSections(false), FunctionSections(false),
@@ -169,6 +170,12 @@
     /// StackAlignmentOverride - Override default stack alignment for target.
     unsigned StackAlignmentOverride;
 
+    /// StackSymbolOrdering - When true, this will allow CodeGen to order
+    /// the local stack symbols (for code size, code locality, or any other
+    /// heuristics). When false, the local symbols are left in whatever order
+    /// they were generated. Default is true.
+    unsigned StackSymbolOrdering : 1;
+
    /// EnableFastISel - This flag enables fast-path instruction selection
    /// which trades away generated code quality in favor of reducing
    /// compile time.
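Note: a minimal sketch (not part of this patch) of how an embedder that builds its own TargetOptions could use the new field. The helper makeTargetOptions and its parameter are hypothetical; TargetOptions::StackSymbolOrdering is the field added above, and the logic mirrors what CommandFlags.h does with the new -stack-symbol-ordering flag:

    #include "llvm/Target/TargetOptions.h"

    // Hypothetical helper: configures codegen options the same way llc's
    // -stack-symbol-ordering flag does.
    llvm::TargetOptions makeTargetOptions(bool OrderStackSymbols) {
      llvm::TargetOptions Options;
      // Defaults to true; clearing it keeps locals in the order they were
      // created, which is useful for tests with hard-coded stack offsets.
      Options.StackSymbolOrdering = OrderStackSymbols;
      return Options;
    }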
Index: llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp
+++ llvm/trunk/lib/CodeGen/PrologEpilogInserter.cpp
@@ -707,8 +707,10 @@
                       Offset, MaxAlign, Skew);
   }
 
-  // Then assign frame offsets to stack objects that are not used to spill
-  // callee saved registers.
+  SmallVector<int, 8> ObjectsToAllocate;
+
+  // Then prepare to assign frame offsets to stack objects that are not used
+  // to spill callee saved registers.
   for (unsigned i = 0, e = MFI->getObjectIndexEnd(); i != e; ++i) {
     if (MFI->isObjectPreAllocated(i) &&
         MFI->getUseLocalStackAllocationBlock())
@@ -724,8 +726,17 @@
     if (ProtectedObjs.count(i))
       continue;
 
-    AdjustStackOffset(MFI, i, StackGrowsDown, Offset, MaxAlign, Skew);
+    // Add the objects that we need to allocate to our working set.
+    ObjectsToAllocate.push_back(i);
   }
 
+  // Give the targets a chance to order the objects the way they like it.
+  if (Fn.getTarget().getOptLevel() != CodeGenOpt::None &&
+      Fn.getTarget().Options.StackSymbolOrdering)
+    TFI.orderFrameObjects(Fn, ObjectsToAllocate);
+
+  // Now walk the objects and actually assign base offsets to them.
+  for (auto &Object : ObjectsToAllocate)
+    AdjustStackOffset(MFI, Object, StackGrowsDown, Offset, MaxAlign, Skew);
+
   // Make sure the special register scavenging spill slot is closest to the
   // stack pointer.
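For reference, a minimal sketch of how a target opts in by overriding the new hook. MyTargetFrameLowering and its sort-by-alignment policy are hypothetical; only the hook's signature and the reorder-only contract come from this patch:

    // Assumes <algorithm> and the usual CodeGen headers are included.
    void MyTargetFrameLowering::orderFrameObjects(
        const MachineFunction &MF,
        SmallVectorImpl<int> &ObjectsToAllocate) const {
      const MachineFrameInfo *MFI = MF.getFrameInfo();
      // Per the contract in TargetFrameLowering.h, we may only permute the
      // frame indices; the entries themselves must stay unchanged.
      std::stable_sort(ObjectsToAllocate.begin(), ObjectsToAllocate.end(),
                       [&](int A, int B) {
                         // Illustrative policy: highest-aligned objects first.
                         return MFI->getObjectAlignment(A) >
                                MFI->getObjectAlignment(B);
                       });
    }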
Index: llvm/trunk/lib/Target/X86/X86FrameLowering.h
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.h
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.h
@@ -137,6 +137,13 @@
   /// Returns true if the target will correctly handle shrink wrapping.
   bool enableShrinkWrapping(const MachineFunction &MF) const override;
 
+  /// Order the symbols in the local stack.
+  /// We want to place the local stack objects in some sort of sensible order.
+  /// The heuristic we use is to try and pack them according to static number
+  /// of uses and size in order to minimize code size.
+  void orderFrameObjects(const MachineFunction &MF,
+                         SmallVectorImpl<int> &ObjectsToAllocate) const override;
+
   /// convertArgMovsToPushes - This method tries to convert a call sequence
   /// that uses sub and mov instructions to put the argument onto the stack
   /// into a series of pushes.
Index: llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86FrameLowering.cpp
@@ -2669,6 +2669,148 @@
   return MBBI;
 }
 
+namespace {
+// Struct used by orderFrameObjects to help sort the stack objects.
+struct X86FrameSortingObject {
+  bool IsValid = false;         // true if we care about this Object.
+  unsigned ObjectIndex = 0;     // Index of Object into MFI list.
+  unsigned ObjectSize = 0;      // Size of Object in bytes.
+  unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+  unsigned ObjectNumUses = 0;   // Object static number of uses.
+};
+
+// The comparison function we use for std::stable_sort to order our local
+// stack symbols. The current algorithm is to use an estimated
+// "density". This takes into consideration the size and number of
+// uses each object has in order to roughly minimize code size.
+// So, for example, an object of size 16B that is referenced 5 times
+// will get higher priority than 4 4B objects referenced 1 time each.
+// It's not perfect and we may be able to squeeze a few more bytes out of
+// it (for example: 0(esp) requires fewer bytes, symbols allocated at the
+// fringe end can have special consideration, given their size is less
+// important, etc.), but the algorithmic complexity grows too much to be
+// worth the extra gains we get. This gets us pretty close.
+// The final order leaves us with objects with highest priority going
+// at the end of our list.
+struct X86FrameSortingComparator {
+  inline bool operator()(const X86FrameSortingObject &A,
+                         const X86FrameSortingObject &B) {
+    uint64_t DensityAScaled, DensityBScaled;
+
+    // For consistency in our comparison, all invalid objects are placed
+    // at the end. This also allows us to stop walking when we hit the
+    // first invalid item after it's all sorted.
+    if (!A.IsValid)
+      return false;
+    if (!B.IsValid)
+      return true;
+
+    // The density is calculated by doing:
+    //     (double)DensityA = A.ObjectNumUses / A.ObjectSize
+    //     (double)DensityB = B.ObjectNumUses / B.ObjectSize
+    // Since this approach may cause inconsistencies in
+    // the floating point <, >, == comparisons, depending on the floating
+    // point model with which the compiler was built, we're going
+    // to scale both sides by multiplying with
+    // A.ObjectSize * B.ObjectSize. This ends up factoring away
+    // the division and, with it, the need for any floating point
+    // arithmetic.
+    DensityAScaled = static_cast<uint64_t>(A.ObjectNumUses) *
+                     static_cast<uint64_t>(B.ObjectSize);
+    DensityBScaled = static_cast<uint64_t>(B.ObjectNumUses) *
+                     static_cast<uint64_t>(A.ObjectSize);
+
+    // If the two densities are equal, prioritize highest alignment
+    // objects. This allows for similar alignment objects
+    // to be packed together (given the same density).
+    // There's room for improvement here, also, since we can pack
+    // similar alignment (different density) objects next to each
+    // other to save padding. This will also require further
+    // complexity/iterations, and the overall gain isn't worth it,
+    // in general. Something to keep in mind, though.
+    if (DensityAScaled == DensityBScaled)
+      return A.ObjectAlignment < B.ObjectAlignment;
+
+    return DensityAScaled < DensityBScaled;
+  }
+};
+} // namespace
+
+// Order the symbols in the local stack.
+// We want to place the local stack objects in some sort of sensible order.
+// The heuristic we use is to try and pack them according to static number
+// of uses and size of object in order to minimize code size.
+void X86FrameLowering::orderFrameObjects(
+    const MachineFunction &MF, SmallVectorImpl<int> &ObjectsToAllocate) const {
+  const MachineFrameInfo *MFI = MF.getFrameInfo();
+
+  // Don't waste time if there's nothing to do.
+  if (ObjectsToAllocate.empty())
+    return;
+
+  // Create an array of all MFI objects. We won't need all of these
+  // objects, but we're going to create a full array of them to make
+  // it easier to index into when we're counting "uses" down below.
+  // We want to be able to easily/cheaply access an object by simply
+  // indexing into it, instead of having to search for it every time.
+  std::vector<X86FrameSortingObject> SortingObjects(MFI->getObjectIndexEnd());
+
+  // Walk the objects we care about and mark them as such in our working
+  // struct.
+  for (auto &Obj : ObjectsToAllocate) {
+    SortingObjects[Obj].IsValid = true;
+    SortingObjects[Obj].ObjectIndex = Obj;
+    SortingObjects[Obj].ObjectAlignment = MFI->getObjectAlignment(Obj);
+    // Set the size.
+    int ObjectSize = MFI->getObjectSize(Obj);
+    if (ObjectSize == 0)
+      // Variable size. Just use 4.
+      SortingObjects[Obj].ObjectSize = 4;
+    else
+      SortingObjects[Obj].ObjectSize = ObjectSize;
+  }
+
+  // Count the number of uses for each object.
+  for (auto &MBB : MF) {
+    for (auto &MI : MBB) {
+      for (const MachineOperand &MO : MI.operands()) {
+        // Check to see if it's a local stack symbol.
+        if (!MO.isFI())
+          continue;
+        int Index = MO.getIndex();
+        // Check to see if it falls within our range, and is tagged
+        // to require ordering.
+        if (Index >= 0 && Index < MFI->getObjectIndexEnd() &&
+            SortingObjects[Index].IsValid)
+          SortingObjects[Index].ObjectNumUses++;
+      }
+    }
+  }
+
+  // Sort the objects using X86FrameSortingComparator (see its comment for
+  // info).
+  std::stable_sort(SortingObjects.begin(), SortingObjects.end(),
+                   X86FrameSortingComparator());
+
+  // Now modify the original list to represent the final order that
+  // we want. The order will depend on whether we're going to access them
+  // from the stack pointer or the frame pointer. For SP, the list should
+  // end up with the objects we want at smaller offsets at the END.
+  // For FP, it should be flipped.
+  int i = 0;
+  for (auto &Obj : SortingObjects) {
+    // All invalid items are sorted at the end, so it's safe to stop.
+    if (!Obj.IsValid)
+      break;
+    ObjectsToAllocate[i++] = Obj.ObjectIndex;
+  }
+
+  // Flip it if we're accessing off of the FP.
+  if (!TRI->needsStackRealignment(MF) && hasFP(MF))
+    std::reverse(ObjectsToAllocate.begin(), ObjectsToAllocate.end());
+}
+
 
 unsigned
 X86FrameLowering::getWinEHParentFrameOffset(const MachineFunction &MF) const {
   // RDX, the parent frame pointer, is homed into 16(%rsp) in the prologue.
   unsigned Offset = 16;
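To make the scaled comparison concrete, here is a small standalone program (illustrative only), using the example from the comment above: a 16-byte object with 5 uses against a 4-byte object with 1 use:

    #include <cassert>
    #include <cstdint>

    // NumUsesA/SizeA < NumUsesB/SizeB is equivalent to
    // NumUsesA*SizeB < NumUsesB*SizeA for positive sizes, so densities can
    // be compared exactly without any floating point arithmetic.
    static bool lowerDensity(uint64_t NumUsesA, uint64_t SizeA,
                             uint64_t NumUsesB, uint64_t SizeB) {
      return NumUsesA * SizeB < NumUsesB * SizeA;
    }

    int main() {
      // 5 uses / 16B = 0.3125 vs. 1 use / 4B = 0.25: the 16B object is
      // denser, so it sorts later, i.e. closer to the stack pointer.
      assert(!lowerDensity(5, 16, 1, 4));
      assert(lowerDensity(1, 4, 5, 16));
      return 0;
    }

Because the densest objects sort to the end of the list, the implementation only needs the final std::reverse when locals are addressed off the frame pointer rather than the stack pointer.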
Index: llvm/trunk/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
+++ llvm/trunk/test/CodeGen/X86/2009-02-12-DebugInfoVLA.ll
@@ -1,5 +1,5 @@
 ; RUN: llc < %s
-; RUN: llc < %s -march=x86-64 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -verify-machineinstrs | FileCheck %s
 ; PR3538
 target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
 target triple = "i386-apple-darwin9"
Index: llvm/trunk/test/CodeGen/X86/aligned-variadic.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/aligned-variadic.ll
+++ llvm/trunk/test/CodeGen/X86/aligned-variadic.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=i686-apple-darwin | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -stack-symbol-ordering=0 | FileCheck %s -check-prefix=X32
 
 %struct.Baz = type { [17 x i8] }
 %struct.__va_list_tag = type { i32, i32, i8*, i8* }
Index: llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
+++ llvm/trunk/test/CodeGen/X86/avx512-mask-op.ll
@@ -1,6 +1,6 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s --check-prefix=CHECK --check-prefix=KNL
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=skx | FileCheck %s --check-prefix=CHECK --check-prefix=SKX
 
 define i16 @mask16(i16 %x) {
 ; CHECK-LABEL: mask16:
Index: llvm/trunk/test/CodeGen/X86/cleanuppad-realign.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/cleanuppad-realign.ll
+++ llvm/trunk/test/CodeGen/X86/cleanuppad-realign.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -mtriple=i686-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -mtriple=x86_64-pc-windows-msvc -stack-symbol-ordering=0 < %s | FileCheck --check-prefix=X64 %s
 
 declare i32 @__CxxFrameHandler3(...)
 declare void @Dtor(i64* %o)
Index: llvm/trunk/test/CodeGen/X86/dynamic-allocas-VLAs.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/dynamic-allocas-VLAs.ll
+++ llvm/trunk/test/CodeGen/X86/dynamic-allocas-VLAs.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
-; RUN: llc < %s -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mcpu=generic -stackrealign -stack-alignment=32 -march=x86-64 -mattr=+avx -mtriple=i686-apple-darwin10 | FileCheck %s -check-prefix=FORCE-ALIGN
 ; rdar://11496434
 
 ; no VLAs or dynamic alignment
Index: llvm/trunk/test/CodeGen/X86/hipe-cc.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/hipe-cc.ll
+++ llvm/trunk/test/CodeGen/X86/hipe-cc.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=4 -mtriple=i686-linux-gnu -mcpu=pentium | FileCheck %s
 
 ; Check the HiPE calling convention works (x86-32)
 
Index: llvm/trunk/test/CodeGen/X86/hipe-cc64.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/hipe-cc64.ll
+++ llvm/trunk/test/CodeGen/X86/hipe-cc64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -tailcallopt -code-model=medium -stack-alignment=8 -mtriple=x86_64-linux-gnu -mcpu=opteron | FileCheck %s
 
 ; Check the HiPE calling convention works (x86-64)
 
Index: llvm/trunk/test/CodeGen/X86/local_stack_symbol_ordering.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/local_stack_symbol_ordering.ll
+++ llvm/trunk/test/CodeGen/X86/local_stack_symbol_ordering.ll
@@ -0,0 +1,184 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s -check-prefix=X32
+
+; CHECK-LABEL: foo
+
+; Check the functionality of the local stack symbol table ordering
+; heuristics.
+; The test has a bunch of locals of various sizes that are referenced a
+; different number of times.
+;
+; a   : 120B, 9 uses, density = 0.075
+; aa  : 4000B, 1 use, density = 0.00025
+; b   : 4B, 1 use, density = 0.25
+; cc  : 4000B, 2 uses, density = 0.0005
+; d   : 4B, 2 uses, density = 0.5
+; e   : 4B, 3 uses, density = 0.75
+; f   : 4B, 4 uses, density = 1
+;
+; Given the size, number of uses and calculated density (uses / size), we
+; expect f to be allocated closest to the stack pointer,
+; followed by e, d, b, then a (to check just a few).
+; We use GNU inline asm between calls to prevent registerization of addresses
+; so that we get exact counts.
+;
+; The test is taken from something like this:
+; void foo()
+; {
+;   int f; // 4 uses. 4 / 4 = 1
+;   int a[30]; // 9 uses. 9 / 120 = 0.075
+;   int aa[1000]; // 1 use. 1 / 4000 = 0.00025
+;   int e; // 3 uses. 3 / 4 = 0.75
+;   int cc[1000]; // 2 uses. 2 / 4000 = 0.0005
+;   int b; // 1 use. 1 / 4 = 0.25
+;   int d; // 2 uses. 2 / 4 = 0.5
+;   int aaa[1000]; // 2 uses. 2 / 4000 = 0.0005
+;
+;
+;   check_a(&a);
+;   bar1(&aaa);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar1(&a);
+;   check_f(&f);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar1(&a);
+;   bar3(&aa, &aaa, &cc);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar2(&a,&cc);
+;   check_b(&b);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar1(&a);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar2(&a, &f);
+;   check_e(&e);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar1(&a);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar2(&e, &f);
+;   check_d(&d);
+;   bar1(&a);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar3(&d, &e, &f);
+;   asm ("":::"esi","edi","ebp","ebx","rbx","r12","r13","r14","r15","rbp");
+;   bar1(&a);
+; }
+;
+; X64: leaq 16(%rsp), %rdi
+; X64: callq check_a
+; X64: callq bar1
+; X64: callq bar1
+; X64: leaq (%rsp), %rdi
+; X64: callq check_f
+; X64: callq bar1
+; X64: callq bar3
+; X64: callq bar2
+; X64: leaq 12(%rsp), %rdi
+; X64: callq check_b
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 4(%rsp), %rdi
+; X64: callq check_e
+; X64: callq bar1
+; X64: callq bar2
+; X64: leaq 8(%rsp), %rdi
+; X64: callq check_d
+
+; X32: leal 32(%esp)
+; X32: calll check_a
+; X32: calll bar1
+; X32: calll bar1
+; X32: leal 16(%esp)
+; X32: calll check_f
+; X32: calll bar1
+; X32: calll bar3
+; X32: calll bar2
+; X32: leal 28(%esp)
+; X32: calll check_b
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 20(%esp)
+; X32: calll check_e
+; X32: calll bar1
+; X32: calll bar2
+; X32: leal 24(%esp)
+; X32: calll check_d
+
+
+define void @foo() nounwind uwtable {
+entry:
+  %f = alloca i32, align 4
+  %a = alloca [30 x i32], align 16
+  %aa = alloca [1000 x i32], align 16
+  %e = alloca i32, align 4
+  %cc = alloca [1000 x i32], align 16
+  %b = alloca i32, align 4
+  %d = alloca i32, align 4
+  %aaa = alloca [1000 x i32], align 16
+  %0 = bitcast i32* %f to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %0) #1
+  %1 = bitcast [30 x i32]* %a to i8*
+  call void @llvm.lifetime.start(i64 120, i8* %1) #1
+  %2 = bitcast [1000 x i32]* %aa to i8*
+  call void @llvm.lifetime.start(i64 4000, i8* %2) #1
+  %3 = bitcast i32* %e to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %3) #1
+  %4 = bitcast [1000 x i32]* %cc to i8*
+  call void @llvm.lifetime.start(i64 4000, i8* %4) #1
+  %5 = bitcast i32* %b to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %5) #1
+  %6 = bitcast i32* %d to i8*
+  call void @llvm.lifetime.start(i64 4, i8* %6) #1
+  %7 = bitcast [1000 x i32]* %aaa to i8*
+  call void @llvm.lifetime.start(i64 4000, i8* %7) #1
+  %call = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @check_a to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  %call1 = call i32 ([1000 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([1000 x i32]*, ...)*)([1000 x i32]* %aaa)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call2 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  %call3 = call i32 (i32*, ...) bitcast (i32 (...)* @check_f to i32 (i32*, ...)*)(i32* %f)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call4 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  %call5 = call i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar3 to i32 ([1000 x i32]*, [1000 x i32]*, [1000 x i32]*, ...)*)([1000 x i32]* %aa, [1000 x i32]* %aaa, [1000 x i32]* %cc)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call6 = call i32 ([30 x i32]*, [1000 x i32]*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, [1000 x i32]*, ...)*)([30 x i32]* %a, [1000 x i32]* %cc)
+  %call7 = call i32 (i32*, ...) bitcast (i32 (...)* @check_b to i32 (i32*, ...)*)(i32* %b)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call8 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call9 = call i32 ([30 x i32]*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 ([30 x i32]*, i32*, ...)*)([30 x i32]* %a, i32* %f)
+  %call10 = call i32 (i32*, ...) bitcast (i32 (...)* @check_e to i32 (i32*, ...)*)(i32* %e)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call11 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call12 = call i32 (i32*, i32*, ...) bitcast (i32 (...)* @bar2 to i32 (i32*, i32*, ...)*)(i32* %e, i32* %f)
+  %call13 = call i32 (i32*, ...) bitcast (i32 (...)* @check_d to i32 (i32*, ...)*)(i32* %d)
+  %call14 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call15 = call i32 (i32*, i32*, i32*, ...) bitcast (i32 (...)* @bar3 to i32 (i32*, i32*, i32*, ...)*)(i32* %d, i32* %e, i32* %f)
+  call void asm sideeffect "", "~{esi},~{edi},~{ebp},~{ebx},~{rbx},~{r12},~{r13},~{r14},~{r15},~{rbp},~{dirflag},~{fpsr},~{flags}"() #1
+  %call16 = call i32 ([30 x i32]*, ...) bitcast (i32 (...)* @bar1 to i32 ([30 x i32]*, ...)*)([30 x i32]* %a)
+  call void @llvm.lifetime.end(i64 4000, i8* %7) #1
+  call void @llvm.lifetime.end(i64 4, i8* %6) #1
+  call void @llvm.lifetime.end(i64 4, i8* %5) #1
+  call void @llvm.lifetime.end(i64 4000, i8* %4) #1
+  call void @llvm.lifetime.end(i64 4, i8* %3) #1
+  call void @llvm.lifetime.end(i64 4000, i8* %2) #1
+  call void @llvm.lifetime.end(i64 120, i8* %1) #1
+  call void @llvm.lifetime.end(i64 4, i8* %0) #1
+  ret void
+}
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+declare i32 @check_a(...) #2
+declare i32 @bar1(...) #2
+declare i32 @check_f(...) #2
+declare i32 @bar3(...) #2
+declare i32 @bar2(...) #2
+declare i32 @check_b(...) #2
+declare i32 @check_e(...) #2
+declare i32 @check_d(...) #2
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
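As a cross-check on the CHECK lines above, a standalone sketch (illustrative, not part of the test) that replays the density heuristic over these locals:

    #include <algorithm>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    struct Local { const char *Name; uint64_t Size, Uses; };

    int main() {
      // Sizes and static use counts from the comment block in the test,
      // listed in allocation order.
      std::vector<Local> Locals = {{"f", 4, 4},     {"a", 120, 9},
                                   {"aa", 4000, 1}, {"e", 4, 3},
                                   {"cc", 4000, 2}, {"b", 4, 1},
                                   {"d", 4, 2},     {"aaa", 4000, 2}};
      // Ascending scaled density; ties keep their relative order under
      // stable_sort (the real comparator also breaks ties by alignment).
      std::stable_sort(Locals.begin(), Locals.end(),
                       [](const Local &A, const Local &B) {
                         return A.Uses * B.Size < B.Uses * A.Size;
                       });
      // Prints aa, cc, aaa, a, b, d, e, f.
      for (const Local &L : Locals)
        std::printf("%s\n", L.Name);
      return 0;
    }

The end of the sorted list (f, preceded by e, d, b, a) receives the smallest stack-pointer offsets, which is exactly the layout the X64 and X32 CHECK lines assert.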
Index: llvm/trunk/test/CodeGen/X86/phys-reg-local-regalloc.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/phys-reg-local-regalloc.ll
+++ llvm/trunk/test/CodeGen/X86/phys-reg-local-regalloc.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc -O0 < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
-; RUN: llc < %s -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast -optimize-regalloc=0 | FileCheck %s
+; RUN: llc -O0 < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=generic -regalloc=fast | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -march=x86 -mtriple=i386-apple-darwin9 -mcpu=atom -regalloc=fast -optimize-regalloc=0 | FileCheck -check-prefix=ATOM %s
 ; CHECKed instructions should be the same with or without -O0 except on Intel Atom due to instruction scheduling.
 
 @.str = private constant [12 x i8] c"x + y = %i\0A\00", align 1 ; <[12 x i8]*> [#uses=1]
Index: llvm/trunk/test/CodeGen/X86/seh-catch-all-win32.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/seh-catch-all-win32.ll
+++ llvm/trunk/test/CodeGen/X86/seh-catch-all-win32.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
 
 ; 32-bit catch-all has to use a filter function because that's how it saves the
 ; exception code.
Index: llvm/trunk/test/CodeGen/X86/seh-stack-realign.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/seh-stack-realign.ll
+++ llvm/trunk/test/CodeGen/X86/seh-stack-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s
 
 ; 32-bit catch-all has to use a filter function because that's how it saves the
 ; exception code.
Index: llvm/trunk/test/CodeGen/X86/ssp-data-layout.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/ssp-data-layout.ll
+++ llvm/trunk/test/CodeGen/X86/ssp-data-layout.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
 ;  This test is fairly fragile. The goal is to ensure that "large" stack
 ;  objects are allocated closest to the stack protector (i.e., farthest away
 ;  from the Stack Pointer.) In standard SSP mode this means that large (>=
Index: llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-stack-usage.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
 
 target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-pc-linux-gnu"
Index: llvm/trunk/test/CodeGen/X86/statepoint-stackmap-format.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-stackmap-format.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-stackmap-format.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
-; RUN: llc < %s -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-linux-gnu" | FileCheck %s
+; RUN: llc < %s -stack-symbol-ordering=0 -mtriple="x86_64-pc-unknown-elf" | FileCheck %s
 
 ; This test is a sanity check to ensure statepoints are generating StackMap
 ; sections correctly. This is not intended to be a rigorous test of the
Index: llvm/trunk/test/CodeGen/X86/statepoint-vector.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/statepoint-vector.ll
+++ llvm/trunk/test/CodeGen/X86/statepoint-vector.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mcpu=core-avx -debug-only=stackmaps < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 -mcpu=core-avx -debug-only=stackmaps < %s | FileCheck %s
 ; REQUIRES: asserts
 
 target triple = "x86_64-pc-linux-gnu"
Index: llvm/trunk/test/CodeGen/X86/stdarg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/stdarg.ll
+++ llvm/trunk/test/CodeGen/X86/stdarg.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s -mtriple=x86_64-linux | FileCheck %s
 
 %struct.__va_list_tag = type { i32, i32, i8*, i8* }
 
Index: llvm/trunk/test/CodeGen/X86/widen_load-1.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/widen_load-1.ll
+++ llvm/trunk/test/CodeGen/X86/widen_load-1.ll
@@ -1,5 +1,5 @@
-; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
-; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc -stack-symbol-ordering=0 %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
 ; PR4891
 ; PR5626
 
Index: llvm/trunk/test/CodeGen/X86/win-catchpad-varargs.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/win-catchpad-varargs.ll
+++ llvm/trunk/test/CodeGen/X86/win-catchpad-varargs.ll
@@ -1,5 +1,5 @@
-; RUN: llc -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
-; RUN: llc -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
+; RUN: llc -stack-symbol-ordering=0 -mtriple=x86_64-windows-msvc < %s | FileCheck %s --check-prefix=X64
+; RUN: llc -stack-symbol-ordering=0 -mtriple=i686-windows-msvc < %s | FileCheck %s --check-prefix=X86
 
 declare void @llvm.va_start(i8*)
 declare void @llvm.va_end(i8*)
Index: llvm/trunk/test/CodeGen/X86/win-catchpad.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/win-catchpad.ll
+++ llvm/trunk/test/CodeGen/X86/win-catchpad.ll
@@ -1,5 +1,5 @@
-; RUN: llc -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
-; RUN: llc -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=i686-pc-windows-msvc < %s | FileCheck --check-prefix=X86 %s
+; RUN: llc -stack-symbol-ordering=0 -verify-machineinstrs -mtriple=x86_64-pc-windows-msvc < %s | FileCheck --check-prefix=X64 %s
 
 ; Loosely based on IR for this C++ source code:
 ;   void f(int p);
Index: llvm/trunk/test/CodeGen/X86/win32-seh-catchpad-realign.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/win32-seh-catchpad-realign.ll
+++ llvm/trunk/test/CodeGen/X86/win32-seh-catchpad-realign.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -stack-symbol-ordering=0 < %s | FileCheck %s
 
 ; The aligned alloca means that we have to realign the stack, which forces the
 ; use of ESI to address local variables.