Index: llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -7135,6 +7135,10 @@
     return N0.getOperand(0);
   }
 
+  // If this is anyext(trunc), don't fold it, allow ourselves to be folded.
+  if (N->hasOneUse() && (N->use_begin()->getOpcode() == ISD::ANY_EXTEND))
+    return SDValue();
+
   // Fold extract-and-trunc into a narrow extract. For example:
   //   i64 x = EXTRACT_VECTOR_ELT(v2i64 val, i32 1)
   //   i32 y = TRUNCATE(i64 x)
Index: llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
+++ llvm/trunk/test/CodeGen/X86/2011-10-21-widen-cmp.ll
@@ -10,10 +10,7 @@
 ; CHECK-NEXT: cmpordps %xmm0, %xmm0
 ; CHECK-NEXT: pmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
 ; CHECK-NEXT: psllq $32, %xmm0
-; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; CHECK-NEXT: psrad $31, %xmm0
-; CHECK-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3],xmm1[4,5],xmm0[6,7]
-; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
 ; CHECK-NEXT: pslld $31, %xmm0
 ; CHECK-NEXT: blendvps %xmm0, %xmm0
 ; CHECK-NEXT: movlps %xmm0, (%rax)
Index: llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
+++ llvm/trunk/test/CodeGen/X86/mem-intrin-base-reg.ll
@@ -8,15 +8,12 @@
 ; for when this is necessary. Typically, we chose ESI for the base register,
 ; which all of the X86 string instructions use.
 
-; The pattern of vector icmp and extractelement is used in these tests because
-; it forces creation of an aligned stack temporary. Perhaps such temporaries
-; shouldn't be aligned.
-
 declare void @escape_vla_and_icmp(i8*, i1 zeroext)
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1)
 declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1)
 
 define i32 @memcpy_novla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
 
@@ -42,6 +39,7 @@
 ; CHECK: rep;movsl
 
 define i32 @memcpy_vla_vector(<4 x i32>* %vp0, i8* %a, i8* %b, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memcpy.p0i8.p0i8.i32(i8* %a, i8* %b, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
 
@@ -70,6 +68,7 @@
 ; stosd doesn't clobber esi, so we can use it.
 
 define i32 @memset_vla_vector(<4 x i32>* %vp0, i8* %a, i32 %n, i1 zeroext %cond) {
+  %foo = alloca <4 x i32>, align 16
   call void @llvm.memset.p0i8.i32(i8* %a, i8 42, i32 128, i32 4, i1 false)
   br i1 %cond, label %spill_vectors, label %no_vectors
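
For readers skimming the combiner change: below is a minimal, hypothetical IR sketch of the extract-and-trunc shape that the surrounding fold (quoted in the context lines above) rewrites into a narrower extract. The function name and types are illustrative assumptions and are not taken from this patch or its tests; the new early exit simply skips that rewrite when the truncate's only user is an ISD::ANY_EXTEND node, so the extend/truncate pair can be folded away instead.

; Hypothetical example, not part of the patch: a truncate whose operand is a
; vector-element extract. The existing combine can turn this into an extract
; of a narrower element from a bitcast vector; the new guard leaves the
; truncate alone when its sole user is an any_extend.
define i32 @extract_then_trunc(<2 x i64> %v) {
  %x = extractelement <2 x i64> %v, i32 1
  %y = trunc i64 %x to i32
  ret i32 %y
}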