Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7135,6 +7135,10 @@
     return N0.getOperand(0);
   }
 
+  // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
+  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
+    return SDValue();
+
   // Fold extract-and-trunc into a narrow extract. For example:
   //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
   //   i32 y = TRUNCATE(i64 x)
Index: llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -10,10 +10,7 @@
 ; CHECK-NEXT: cmpordps %xmm0, %xmm0
 ; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: psrad $31, %xmm0
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; CHECK-NEXT: pslld $31, %xmm0
 ; CHECK-NEXT: blendvps %xmm0, %xmm0
 ; CHECK-NEXT: movlps %xmm0, (%rax)
Index: llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -8,15 +8,12 @@
 ; for when this is necessary. Typically, we chose ESI for the base register,
 ; which all of the X86 string instructions use.
 
-; The pattern of vector icmp and extractelement is used in these tests because
-; it forces creation of an aligned stack temporary. Perhaps such temporaries
-; shouldn't be aligned.
-
 declare void @escape_vla_and_icmp(i8*, i1 zeroext)
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
 
 define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
 
@@ -42,6 +39,7 @@
 ; CHECK: rep;movsl
 
 define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
 
@@ -70,6 +68,7 @@
 ; stosd doesn't clobber esi, so we can use it.
 
 define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
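
For readers skimming the combiner change: below is a minimal, hypothetical IR sketch of the extract-and-trunc shape that the surrounding fold (quoted in the context lines above) rewrites into a narrower extract. The function name and types are illustrative assumptions and are not taken from this patch or its tests; the new early exit simply skips that rewrite when the truncate's only user is an ISD::ANY_EXTEND node, so the extend/truncate pair can be folded away instead.

; Hypothetical example, not part of the patch: a truncate whose operand is a
; vector-element extract. The existing combine can turn this into an extract
; of a narrower element from a bitcast vector; the new guard leaves the
; truncate alone when its sole user is an any_extend.
define i32 @extract_then_trunc(<2 x i64> %v) {
  %x = extractelement <2 x i64> %v, i32 1
  %y = trunc i64 %x to i32
  ret i32 %y
}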