Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -13163,9 +13163,12 @@
                              Ptr, ST->getMemoryVT(), ST->getMemOperand());
   }
 
-  // Only perform this optimization before the types are legal, because we
-  // don't want to perform this optimization on every DAGCombine invocation.
-  if (!LegalTypes) {
+  // FIXME: This pass can be expensive and we should do it only once,
+  // ideally just before Instruction Selection, so that we can merge stores
+  // from lowered intrinsics. Currently some LegalizeDAG changes prevent
+  // cases of MergeStores from happening. For now, do the merging twice:
+  // before and after legalization.
+  if (!LegalTypes || (Level == AfterLegalizeDAG)) {
     for (;;) {
       // There can be multiple store sequences on the same chain.
       // Keep trying to merge store sequences until we are unable to do so
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -9420,8 +9420,6 @@
 static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
                            SelectionDAG &DAG,
                            const AArch64Subtarget *Subtarget) {
-  if (!DCI.isBeforeLegalize())
-    return SDValue();
   StoreSDNode *S = cast<StoreSDNode>(N);
 
   if (S->isVolatile())
Index: test/CodeGen/AArch64/arm64-complex-ret.ll
===================================================================
--- test/CodeGen/AArch64/arm64-complex-ret.ll
+++ test/CodeGen/AArch64/arm64-complex-ret.ll
@@ -2,6 +2,7 @@
 
 define { i192, i192, i21, i192 } @foo(i192) {
 ; CHECK-LABEL: foo:
-; CHECK: stp xzr, xzr, [x8]
+; CHECK-DAG: stp xzr, xzr, [x8, #8]
+; CHECK-DAG: str xzr, [x8]
   ret { i192, i192, i21, i192 } {i192 0, i192 1, i21 2, i192 3}
 }
Index: test/CodeGen/AArch64/arm64-narrow-st-merge.ll
===================================================================
--- test/CodeGen/AArch64/arm64-narrow-st-merge.ll
+++ test/CodeGen/AArch64/arm64-narrow-st-merge.ll
@@ -19,7 +19,7 @@
 }
 
 ; CHECK-LABEL: Strh_zero_4
-; CHECK: stp wzr, wzr
+; CHECK: str xzr
 ; CHECK-STRICT-LABEL: Strh_zero_4
 ; CHECK-STRICT: strh wzr
 ; CHECK-STRICT: strh wzr
@@ -137,7 +137,7 @@
 }
 
 ; CHECK-LABEL: Sturh_zero_4
-; CHECK: stp wzr, wzr
+; CHECK: stur xzr
 ; CHECK-STRICT-LABEL: Sturh_zero_4
 ; CHECK-STRICT: sturh wzr
 ; CHECK-STRICT: sturh wzr
Index: test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -32,11 +32,9 @@
 ; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #128
 ; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
 
-; CHECK: mov [[GR_OFFS:w[0-9]+]], #-56
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-
-; CHECK: orr [[VR_OFFS:w[0-9]+]], wzr, #0xffffff80
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+; CHECK: mov [[GRVR:x[0-9]+]], #-545460846720
+; CHECK: movk [[GRVR]], #65480
+; CHECK: str [[GRVR]], [x[[VA_LIST]], #24]
 
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_start(i8* %addr)
@@ -70,11 +68,9 @@
 ; CHECK: add [[VR_TOP:x[0-9]+]], [[VR_TOPTMP]], #112
 ; CHECK: str [[VR_TOP]], [x[[VA_LIST]], #16]
 
-; CHECK: mov [[GR_OFFS:w[0-9]+]], #-40
-; CHECK: str [[GR_OFFS]], [x[[VA_LIST]], #24]
-
-; CHECK: mov [[VR_OFFS:w[0-9]+]], #-112
-; CHECK: str [[VR_OFFS]], [x[[VA_LIST]], #28]
+; CHECK: mov [[GRVR_OFFS:x[0-9]+]], #-40
+; CHECK: movk [[GRVR_OFFS]], #65424, lsl #32
+; CHECK: str [[GRVR_OFFS]], [x[[VA_LIST]], #24]
 
   %addr = bitcast %va_list* @var to i8*
   call void @llvm.va_start(i8* %addr)
Index: test/CodeGen/AArch64/merge-store-dependency.ll
===================================================================
--- test/CodeGen/AArch64/merge-store-dependency.ll
+++ test/CodeGen/AArch64/merge-store-dependency.ll
@@ -9,10 +9,9 @@
 ;CHECK-LABEL: test
 entry:
 ; A53: mov [[DATA:w[0-9]+]], w1
-; A53: str q{{[0-9]+}}, {{.*}}
-; A53: str q{{[0-9]+}}, {{.*}}
-; A53: str [[DATA]], {{.*}}
-
+; A53-DAG: stp xzr, xzr
+; A53-DAG: str q0
+; A53-DAG: str [[DATA]]
   %0 = bitcast %struct1* %fde to i8*
   tail call void @llvm.memset.p0i8.i64(i8* %0, i8 0, i64 40, i32 8, i1 false)
   %state = getelementptr inbounds %struct1, %struct1* %fde, i64 0, i32 4
Index: test/CodeGen/AArch64/tailcall-explicit-sret.ll
===================================================================
--- test/CodeGen/AArch64/tailcall-explicit-sret.ll
+++ test/CodeGen/AArch64/tailcall-explicit-sret.ll
@@ -79,7 +79,7 @@
 ; CHECK-DAG: mov [[FPTR:x[0-9]+]], x0
 ; CHECK: mov x0, sp
 ; CHECK-NEXT: blr [[FPTR]]
-; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: ldr [[CALLERSRET1:x[0-9]+]], [sp]
 ; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
 ; CHECK: ret
 define void @test_indirect_tailcall_explicit_sret_nosret_arg(i1024* sret %arg, void (i1024*)* %f) #0 {
@@ -94,7 +94,7 @@
 ; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
 ; CHECK: mov x8, sp
 ; CHECK-NEXT: blr x0
-; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: ldr [[CALLERSRET1:x[0-9]+]], [sp]
 ; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
 ; CHECK: ret
 define void @test_indirect_tailcall_explicit_sret_(i1024* sret %arg, i1024 ()* %f) #0 {
Index: test/CodeGen/AArch64/tailcall-implicit-sret.ll
===================================================================
--- test/CodeGen/AArch64/tailcall-implicit-sret.ll
+++ test/CodeGen/AArch64/tailcall-implicit-sret.ll
@@ -11,7 +11,7 @@
 ; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
 ; CHECK: mov x8, sp
 ; CHECK-NEXT: bl _test_sret
-; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: ldr [[CALLERSRET1:x[0-9]+]], [sp]
 ; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
 ; CHECK: ret
 define i1024 @test_call_sret() #0 {
@@ -23,7 +23,7 @@
 ; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
 ; CHECK: mov x8, sp
 ; CHECK-NEXT: bl _test_sret
-; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: ldr [[CALLERSRET1:x[0-9]+]], [sp]
 ; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
 ; CHECK: ret
 define i1024 @test_tailcall_sret() #0 {
@@ -35,7 +35,7 @@
 ; CHECK: mov x[[CALLERX8NUM:[0-9]+]], x8
 ; CHECK: mov x8, sp
 ; CHECK-NEXT: blr x0
-; CHECK-NEXT: ldr [[CALLERSRET1:x[0-9]+]], [sp]
+; CHECK: ldr [[CALLERSRET1:x[0-9]+]], [sp]
 ; CHECK: str [[CALLERSRET1:x[0-9]+]], [x[[CALLERX8NUM]]]
 ; CHECK: ret
 define i1024 @test_indirect_tailcall_sret(i1024 ()* %f) #0 {
Index: test/CodeGen/X86/MergeConsecutiveStores.ll
===================================================================
--- test/CodeGen/X86/MergeConsecutiveStores.ll
+++ test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -558,8 +558,7 @@
 }
 
 ; This is a minimized test based on real code that was failing.
-; We could merge stores (and loads) like this...
-
+; These stores (and loads) should now be merged.
 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
   %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
   %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
@@ -576,10 +575,8 @@
   ret void
 
 ; CHECK-LABEL: merge_vec_element_and_scalar_load
-; CHECK: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rcx, 40(%rdi)
+; CHECK: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
 ; CHECK-NEXT: retq
 }
 
Index: test/CodeGen/X86/bigstructret.ll
===================================================================
--- test/CodeGen/X86/bigstructret.ll
+++ test/CodeGen/X86/bigstructret.ll
@@ -19,10 +19,9 @@
 }
 
 ; CHECK: ReturnBigStruct2
-; CHECK: movl $48, 4(%ecx)
-; CHECK: movb $1, 2(%ecx)
-; CHECK: movb $1, 1(%ecx)
-; CHECK: movb $0, (%ecx)
+; CHECK-DAG: movl $48, 4(%ecx)
+; CHECK-DAG: movb $1, 2(%ecx)
+; CHECK-DAG: movw $256, (%ecx)
 
 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
 entry:
Index: test/CodeGen/X86/bitcast-i256.ll
===================================================================
--- test/CodeGen/X86/bitcast-i256.ll
+++ test/CodeGen/X86/bitcast-i256.ll
@@ -5,7 +5,6 @@
   ret i256 %r
 ; CHECK: foo
 ; CHECK: vextractf128
-; CHECK: vpextrq
-; CHECK: vpextrq
+; CHECK: vmovups
 ; CHECK: ret
 }
Index: test/CodeGen/X86/constant-combines.ll
===================================================================
--- test/CodeGen/X86/constant-combines.ll
+++ test/CodeGen/X86/constant-combines.ll
@@ -15,12 +15,11 @@
 ;
 ; CHECK-LABEL: PR22524:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $0, 4(%rdi)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
 ; CHECK-NEXT: retq
 entry:
Index: test/CodeGen/X86/fold-vector-sext-crash2.ll
===================================================================
--- test/CodeGen/X86/fold-vector-sext-crash2.ll
+++ test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -53,8 +53,10 @@
   ret <2 x i256> %Shuff
 
   ; X64-LABEL: test_zext1
-  ; X64: movq $0
-  ; X64-NEXT: movq $0
+  ; X64: xorps %xmm0, %xmm0
+  ; X64: movaps %xmm0
+  ; X64: movaps %xmm0
+  ; X64: movaps %xmm0
   ; X64-NEXT: movq $0
   ; X64-NEXT: movq $254
 
@@ -75,8 +77,10 @@
   ret <2 x i256> %Shuff
 
   ; X64-LABEL: test_zext2
-  ; X64: movq $0
-  ; X64-NEXT: movq $0
+  ; X64: xorps %xmm0, %xmm0
+  ; X64-NEXT: movaps %xmm0
+  ; X64-NEXT: movaps %xmm0
+  ; X64-NEXT: movaps %xmm0
   ; X64-NEXT: movq $-1
   ; X64-NEXT: movq $-2
 
Index: test/CodeGen/X86/legalize-shl-vec.ll
===================================================================
--- test/CodeGen/X86/legalize-shl-vec.ll
+++ test/CodeGen/X86/legalize-shl-vec.ll
@@ -26,14 +26,11 @@
 ;
 ; X64-LABEL: test_shl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
@@ -65,14 +62,11 @@
 ;
 ; X64-LABEL: test_srl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -530,44 +530,28 @@
 ;
 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: pushl %edi
 ; X32-SSE1-NEXT: .Lcfi6:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %ebx
+; X32-SSE1-NEXT: pushl %esi
 ; X32-SSE1-NEXT: .Lcfi7:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: pushl %edi
 ; X32-SSE1-NEXT: .Lcfi8:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
-; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
 ; X32-SSE1-NEXT: .Lcfi9:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi10:
-; X32-SSE1-NEXT: .cfi_offset %esi, -20
-; X32-SSE1-NEXT: .Lcfi11:
-; X32-SSE1-NEXT: .cfi_offset %edi, -16
-; X32-SSE1-NEXT: .Lcfi12:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -12
-; X32-SSE1-NEXT: .Lcfi13:
-; X32-SSE1-NEXT: .cfi_offset %ebp, -8
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 4(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %esi
-; X32-SSE1-NEXT: movzwl 10(%ecx), %edi
-; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx
-; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp
+; X32-SSE1-NEXT: movl 4(%ecx), %edx
+; X32-SSE1-NEXT: movl 10(%ecx), %esi
+; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
 ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
-; X32-SSE1-NEXT: movw %bp, 10(%eax)
-; X32-SSE1-NEXT: movw %bx, 8(%eax)
+; X32-SSE1-NEXT: movw %di, 10(%eax)
 ; X32-SSE1-NEXT: movw %cx, 14(%eax)
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
-; X32-SSE1-NEXT: movw %di, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl %esi, 6(%eax)
 ; X32-SSE1-NEXT: popl %esi
 ; X32-SSE1-NEXT: popl %edi
-; X32-SSE1-NEXT: popl %ebx
-; X32-SSE1-NEXT: popl %ebp
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -611,10 +595,8 @@
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx
-; X32-SSE1-NEXT: movw %cx, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
+; X32-SSE1-NEXT: movl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movl %ecx, (%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
@@ -644,24 +626,14 @@
 ;
 ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi14:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: .Lcfi15:
-; X32-SSE1-NEXT: .cfi_offset %esi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 10(%ecx), %esi
+; X32-SSE1-NEXT: movl 8(%ecx), %edx
 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
-; X32-SSE1-NEXT: movw $0, 14(%eax)
-; X32-SSE1-NEXT: movw $0, 12(%eax)
-; X32-SSE1-NEXT: movw $0, 10(%eax)
-; X32-SSE1-NEXT: movw $0, 8(%eax)
-; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: movl $0, 12(%eax)
+; X32-SSE1-NEXT: movl $0, 8(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
@@ -698,64 +670,44 @@
 ;
 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; X32-SSE1: # BB#0:
+; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: .Lcfi10:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT: pushl %ebx
+; X32-SSE1-NEXT: .Lcfi11:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .Lcfi12:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .Lcfi13:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
+; X32-SSE1-NEXT: .Lcfi14:
+; X32-SSE1-NEXT: .cfi_offset %esi, -20
+; X32-SSE1-NEXT: .Lcfi15:
+; X32-SSE1-NEXT: .cfi_offset %edi, -16
 ; X32-SSE1-NEXT: .Lcfi16:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: subl $12, %esp
+; X32-SSE1-NEXT: .cfi_offset %ebx, -12
 ; X32-SSE1-NEXT: .Lcfi17:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi18:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
+; X32-SSE1-NEXT: .cfi_offset %ebp, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 3(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 4(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 5(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 7(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 8(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 9(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 10(%ecx), %bh
-; X32-SSE1-NEXT: movb 11(%ecx), %bl
-; X32-SSE1-NEXT: movb 12(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %ebp
+; X32-SSE1-NEXT: movl 3(%ecx), %esi
+; X32-SSE1-NEXT: movl 7(%ecx), %edi
+; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
 ; X32-SSE1-NEXT: movb 13(%ecx), %dl
 ; X32-SSE1-NEXT: movb 15(%ecx), %cl
 ; X32-SSE1-NEXT: movb %dl, 13(%eax)
-; X32-SSE1-NEXT: movb %dh, 12(%eax)
 ; X32-SSE1-NEXT: movb %cl, 15(%eax)
-; X32-SSE1-NEXT: movb %bl, 11(%eax)
-; X32-SSE1-NEXT: movb %bh, 10(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 9(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 8(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 6(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 5(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 4(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 3(%eax)
-; X32-SSE1-NEXT: addl $12, %esp
+; X32-SSE1-NEXT: movw %bx, 11(%eax)
+; X32-SSE1-NEXT: movl %edi, 7(%eax)
+; X32-SSE1-NEXT: movw %bp, (%eax)
+; X32-SSE1-NEXT: movl %esi, 3(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
 ; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: popl %ebp
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -823,17 +775,13 @@
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %edx
 ; X32-SSE1-NEXT: movb 3(%ecx), %cl
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb %dl, (%eax)
+; X32-SSE1-NEXT: movw %dx, (%eax)
 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
 ; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: movb $0, 7(%eax)
-; X32-SSE1-NEXT: movb $0, 6(%eax)
+; X32-SSE1-NEXT: movw $0, 13(%eax)
+; X32-SSE1-NEXT: movw $0, 6(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
@@ -871,35 +819,14 @@
 ;
 ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebx
-; X32-SSE1-NEXT: .Lcfi19:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %eax
-; X32-SSE1-NEXT: .Lcfi20:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi21:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
-; X32-SSE1-NEXT: movb 2(%ecx), %bl
-; X32-SSE1-NEXT: movb 3(%ecx), %bh
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb 7(%ecx), %cl
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb %dl, 6(%eax)
-; X32-SSE1-NEXT: movb %bh, 3(%eax)
-; X32-SSE1-NEXT: movb %bl, 2(%eax)
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
+; X32-SSE1-NEXT: movl (%ecx), %edx
+; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movw %cx, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: addl $4, %esp
-; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: movw $0, 13(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
@@ -994,14 +921,14 @@
 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: pushl %edi
-; X32-SSE1-NEXT: .Lcfi22:
+; X32-SSE1-NEXT: .Lcfi18:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi23:
+; X32-SSE1-NEXT: .Lcfi19:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi24:
+; X32-SSE1-NEXT: .Lcfi20:
 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
-; X32-SSE1-NEXT: .Lcfi25:
+; X32-SSE1-NEXT: .Lcfi21:
 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
Index: test/CodeGen/X86/no-sse2-avg.ll
===================================================================
--- test/CodeGen/X86/no-sse2-avg.ll
+++ test/CodeGen/X86/no-sse2-avg.ll
@@ -5,22 +5,8 @@
 define <16 x i8> @PR27973() {
 ; CHECK-LABEL: PR27973:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movb $0, 15(%rdi)
-; CHECK-NEXT: movb $0, 14(%rdi)
-; CHECK-NEXT: movb $0, 13(%rdi)
-; CHECK-NEXT: movb $0, 12(%rdi)
-; CHECK-NEXT: movb $0, 11(%rdi)
-; CHECK-NEXT: movb $0, 10(%rdi)
-; CHECK-NEXT: movb $0, 9(%rdi)
-; CHECK-NEXT: movb $0, 8(%rdi)
-; CHECK-NEXT: movb $0, 7(%rdi)
-; CHECK-NEXT: movb $0, 6(%rdi)
-; CHECK-NEXT: movb $0, 5(%rdi)
-; CHECK-NEXT: movb $0, 4(%rdi)
-; CHECK-NEXT: movb $0, 3(%rdi)
-; CHECK-NEXT: movb $0, 2(%rdi)
-; CHECK-NEXT: movb $0, 1(%rdi)
-; CHECK-NEXT: movb $0, (%rdi)
+; CHECK-NEXT: movq $0, 8(%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: retq
 ;