Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -2727,6 +2727,9 @@
                                bool foldBooleans, DAGCombinerInfo &DCI,
                                const SDLoc &dl) const;
 
+  // For targets which wrap addresses, unwrap for analysis.
+  virtual SDValue unwrapAddress(SDValue N) const { return N; }
+
   /// Returns true (and the GlobalValue and the offset) if the node is a
   /// GlobalAddress + offset.
   virtual bool
Index: lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp
@@ -14,6 +14,7 @@
 #include "llvm/CodeGen/MachineFrameInfo.h"
 #include "llvm/CodeGen/SelectionDAG.h"
 #include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/Target/TargetLowering.h"
 
 namespace llvm {
 
@@ -55,7 +56,7 @@
 /// Parses tree in Ptr for base, index, offset addresses.
 BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) {
   // (((B + I*M) + c)) + c ...
-  SDValue Base = Ptr;
+  SDValue Base = DAG.getTargetLoweringInfo().unwrapAddress(Ptr);
   SDValue Index = SDValue();
   int64_t Offset = 0;
   bool IsIndexSignExt = false;
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -812,6 +812,8 @@
     /// This method returns the name of a target specific DAG node.
     const char *getTargetNodeName(unsigned Opcode) const override;
 
+    bool mergeStoresAfterLegalization() const override { return true; }
+
     bool isCheapToSpeculateCttz() const override;
 
     bool isCheapToSpeculateCtlz() const override;
@@ -867,6 +869,8 @@
                                            const SelectionDAG &DAG,
                                            unsigned Depth) const override;
 
+    SDValue unwrapAddress(SDValue N) const override;
+
     bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA,
                         int64_t &Offset) const override;
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -27019,6 +27019,12 @@
   return 1;
 }
 
+SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
+  if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP)
+    return N->getOperand(0);
+  return N;
+}
+
 /// Returns true (and the GlobalValue and the offset) if the node is a
 /// GlobalAddress + offset.
 bool X86TargetLowering::isGAPlusOffset(SDNode *N,
Index: test/CodeGen/X86/MergeConsecutiveStores.ll
===================================================================
--- test/CodeGen/X86/MergeConsecutiveStores.ll
+++ test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -492,10 +492,15 @@
   store float %vecext7, float* %arrayidx7, align 4
   ret void
 
-; CHECK-LABEL: merge_vec_element_store
-; CHECK: vmovups
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; CHECK: vextractf128 $1, %ymm0, %xmm1
+; CHECK: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK: retq
+
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_element_store
+; FIXME: vmovups
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; PR21711 - Merge vector stores into wider vector stores.
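
A note on the two hooks added above, before the test updates continue.
mergeStoresAfterLegalization() opts x86 into running DAGCombiner's store
merging on legalized DAGs as well, and unwrapAddress() lets
BaseIndexOffset::match() look through X86ISD::Wrapper / X86ISD::WrapperRIP
when decomposing a pointer into base, index, and offset, so two accesses to
the same wrapped global can be recognized as sharing a base. A minimal sketch
of how another target could adopt the hook, mirroring the X86 override above
(FooTargetLowering and FooISD::ADDR_WRAPPER are hypothetical stand-ins, not
part of this patch):

  // Sketch only: strip this target's address-wrapper node so generic
  // address analysis can see the underlying base.
  SDValue FooTargetLowering::unwrapAddress(SDValue N) const {
    // FooISD::ADDR_WRAPPER stands in for whatever node the target uses to
    // wrap TargetGlobalAddress/constant-pool nodes after address lowering.
    if (N->getOpcode() == FooISD::ADDR_WRAPPER)
      return N->getOperand(0); // analyze the wrapped address instead
    return N;                  // anything else is analyzed as-is
  }

On x86, an access such as e+4(%rip) is a WrapperRIP around
TargetGlobalAddress(e, +4); with the wrapper stripped, accesses to e+4 and
e+8 expose the same global base with different offsets, which is what allows
the merges checked in stores-merging.ll at the end of this patch. The rest of
the test churn below is the expected codegen fallout from merging stores
after legalization.
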
@@ -515,11 +520,18 @@
   store <4 x float> %shuffle3, <4 x float>* %idx3, align 16
   ret void
 
-; CHECK-LABEL: merge_vec_extract_stores
-; CHECK: vmovups %ymm0, 48(%rdi)
-; CHECK-NEXT: vmovups %ymm1, 80(%rdi)
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; These vblendpd are obviously redundant.
+; CHECK: vblendpd $12, %ymm0, %ymm0, %ymm0 # ymm0 = ymm0[0,1,2,3]
+; CHECK: vmovupd %ymm0, 48(%rdi)
+; CHECK: vblendpd $12, %ymm1, %ymm1, %ymm0 # ymm0 = ymm1[0,1,2,3]
+; CHECK: vmovupd %ymm0, 80(%rdi)
+
+; This is what should be generated:
+; FIXME-LABEL: merge_vec_extract_stores
+; FIXME: vmovups %ymm0, 48(%rdi)
+; FIXME-NEXT: vmovups %ymm1, 80(%rdi)
+; FIXME-NEXT: vzeroupper
+; FIXME-NEXT: retq
 }
 
 ; Merging vector stores when sourced from vector loads.
@@ -557,8 +569,7 @@
 }
 
 ; This is a minimized test based on real code that was failing.
-; We could merge stores (and loads) like this...
-
+; This should now be merged.
 define void @merge_vec_element_and_scalar_load([6 x i64]* %array) {
   %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0
   %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1
@@ -575,10 +586,8 @@
   ret void
 
 ; CHECK-LABEL: merge_vec_element_and_scalar_load
-; CHECK: movq (%rdi), %rax
-; CHECK-NEXT: movq 8(%rdi), %rcx
-; CHECK-NEXT: movq %rax, 32(%rdi)
-; CHECK-NEXT: movq %rcx, 40(%rdi)
+; CHECK: vmovups (%rdi), %xmm0
+; CHECK-NEXT: vmovups %xmm0, 32(%rdi)
 ; CHECK-NEXT: retq
 }
Index: test/CodeGen/X86/bigstructret.ll
===================================================================
--- test/CodeGen/X86/bigstructret.ll
+++ test/CodeGen/X86/bigstructret.ll
@@ -31,13 +31,13 @@
   ret %0 %3
 }
 
+
 define fastcc %1 @ReturnBigStruct2() nounwind readnone {
 ; X86-LABEL: ReturnBigStruct2:
 ; X86: # BB#0: # %entry
 ; X86-NEXT: movl $48, 4(%ecx)
 ; X86-NEXT: movb $1, 2(%ecx)
-; X86-NEXT: movb $1, 1(%ecx)
-; X86-NEXT: movb $0, (%ecx)
+; X86-NEXT: movw $256, (%ecx) # imm = 0x100
 ; X86-NEXT: movl %ecx, %eax
 ; X86-NEXT: retl
 ;
@@ -45,8 +45,7 @@
 ; X64: # BB#0: # %entry
 ; X64-NEXT: movl $48, 4(%rdi)
 ; X64-NEXT: movb $1, 2(%rdi)
-; X64-NEXT: movb $1, 1(%rdi)
-; X64-NEXT: movb $0, (%rdi)
+; X64-NEXT: movw $256, (%rdi) # imm = 0x100
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
 entry:
Index: test/CodeGen/X86/bitcast-i256.ll
===================================================================
--- test/CodeGen/X86/bitcast-i256.ll
+++ test/CodeGen/X86/bitcast-i256.ll
@@ -12,11 +12,8 @@
 ;
 ; SLOW-LABEL: foo:
 ; SLOW: # BB#0:
-; SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
-; SLOW-NEXT: vpextrq $1, %xmm1, 24(%rdi)
-; SLOW-NEXT: vmovq %xmm1, 16(%rdi)
-; SLOW-NEXT: vpextrq $1, %xmm0, 8(%rdi)
-; SLOW-NEXT: vmovq %xmm0, (%rdi)
+; SLOW-NEXT: vextractf128 $1, %ymm0, 16(%rdi)
+; SLOW-NEXT: vmovups %xmm0, (%rdi)
 ; SLOW-NEXT: movq %rdi, %rax
 ; SLOW-NEXT: vzeroupper
 ; SLOW-NEXT: retq
Index: test/CodeGen/X86/constant-combines.ll
===================================================================
--- test/CodeGen/X86/constant-combines.ll
+++ test/CodeGen/X86/constant-combines.ll
@@ -15,12 +15,11 @@
 ;
 ; CHECK-LABEL: PR22524:
 ; CHECK: # BB#0: # %entry
-; CHECK-NEXT: movl $0, 4(%rdi)
 ; CHECK-NEXT: xorl %eax, %eax
 ; CHECK-NEXT: movd %eax, %xmm0
 ; CHECK-NEXT: xorps %xmm1, %xmm1
 ; CHECK-NEXT: mulss %xmm0, %xmm1
-; CHECK-NEXT: movl $0, (%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movss %xmm1, 4(%rdi)
 ; CHECK-NEXT: retq
 entry:
Index: test/CodeGen/X86/extract-store.ll
===================================================================
--- test/CodeGen/X86/extract-store.ll
+++ test/CodeGen/X86/extract-store.ll
@@ -510,22 +510,22 @@
 }
 
 define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_0:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_0:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_0:
 ; SSE2-X64: # BB#0:
@@ -539,6 +539,13 @@
 ; SSE41-X64-NEXT: movq %rsi, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_0:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_0:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %rdx, 8(%rdi)
@@ -555,22 +562,22 @@
 }
 
 define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind {
-; X32-LABEL: extract_f128_1:
-; X32: # BB#0:
-; X32-NEXT: pushl %edi
-; X32-NEXT: pushl %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X32-NEXT: movl %esi, 12(%edi)
-; X32-NEXT: movl %edx, 8(%edi)
-; X32-NEXT: movl %ecx, 4(%edi)
-; X32-NEXT: movl %eax, (%edi)
-; X32-NEXT: popl %esi
-; X32-NEXT: popl %edi
-; X32-NEXT: retl
+; SSE-X32-LABEL: extract_f128_1:
+; SSE-X32: # BB#0:
+; SSE-X32-NEXT: pushl %edi
+; SSE-X32-NEXT: pushl %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; SSE-X32-NEXT: movl %esi, 12(%edi)
+; SSE-X32-NEXT: movl %edx, 8(%edi)
+; SSE-X32-NEXT: movl %ecx, 4(%edi)
+; SSE-X32-NEXT: movl %eax, (%edi)
+; SSE-X32-NEXT: popl %esi
+; SSE-X32-NEXT: popl %edi
+; SSE-X32-NEXT: retl
 ;
 ; SSE2-X64-LABEL: extract_f128_1:
 ; SSE2-X64: # BB#0:
@@ -584,6 +591,13 @@
 ; SSE41-X64-NEXT: movq %rcx, (%rdi)
 ; SSE41-X64-NEXT: retq
 ;
+; AVX-X32-LABEL: extract_f128_1:
+; AVX-X32: # BB#0:
+; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0
+; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; AVX-X32-NEXT: vmovups %xmm0, (%eax)
+; AVX-X32-NEXT: retl
+;
 ; AVX-X64-LABEL: extract_f128_1:
 ; AVX-X64: # BB#0:
 ; AVX-X64-NEXT: movq %r8, 8(%rdi)
Index: test/CodeGen/X86/fold-vector-sext-crash2.ll
===================================================================
--- test/CodeGen/X86/fold-vector-sext-crash2.ll
+++ test/CodeGen/X86/fold-vector-sext-crash2.ll
@@ -53,8 +53,10 @@
   ret <2 x i256> %Shuff
 
   ; X64-LABEL: test_zext1
-  ; X64: movq $0
-  ; X64-NEXT: movq $0
+  ; X64: xorps %xmm0, %xmm0
+  ; X64: movaps %xmm0
+  ; X64: movaps %xmm0
+  ; X64: movaps %xmm0
   ; X64-NEXT: movq $0
   ; X64-NEXT: movq $254
@@ -75,8 +77,10 @@
   ret <2 x i256> %Shuff
 
   ; X64-LABEL: test_zext2
-  ; X64: movq $0
-  ; X64-NEXT: movq $0
+  ; X64: xorps %xmm0, %xmm0
+  ; X64-NEXT: movaps %xmm0
+  ; X64-NEXT: movaps %xmm0
+  ; X64-NEXT: movaps %xmm0
   ; X64-NEXT: movq $-1
   ; X64-NEXT: movq $-2
Index: test/CodeGen/X86/legalize-shl-vec.ll
===================================================================
--- test/CodeGen/X86/legalize-shl-vec.ll
+++ test/CodeGen/X86/legalize-shl-vec.ll
@@ -26,14 +26,11 @@
 ;
 ; X64-LABEL: test_shl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
@@ -65,14 +62,11 @@
 ;
 ; X64-LABEL: test_srl:
 ; X64: # BB#0:
-; X64-NEXT: movq $0, 56(%rdi)
-; X64-NEXT: movq $0, 48(%rdi)
-; X64-NEXT: movq $0, 40(%rdi)
-; X64-NEXT: movq $0, 32(%rdi)
-; X64-NEXT: movq $0, 24(%rdi)
-; X64-NEXT: movq $0, 16(%rdi)
-; X64-NEXT: movq $0, 8(%rdi)
-; X64-NEXT: movq $0, (%rdi)
+; X64-NEXT: xorps %xmm0, %xmm0
+; X64-NEXT: movaps %xmm0, 48(%rdi)
+; X64-NEXT: movaps %xmm0, 32(%rdi)
+; X64-NEXT: movaps %xmm0, 16(%rdi)
+; X64-NEXT: movaps %xmm0, (%rdi)
 ; X64-NEXT: movq %rdi, %rax
 ; X64-NEXT: retq
   %Amt = insertelement <2 x i256> undef, i256 -1, i32 0
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -526,44 +526,28 @@
 ;
 ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: pushl %edi
 ; X32-SSE1-NEXT: .Lcfi6:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %ebx
+; X32-SSE1-NEXT: pushl %esi
 ; X32-SSE1-NEXT: .Lcfi7:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: pushl %edi
 ; X32-SSE1-NEXT: .Lcfi8:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
-; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .cfi_offset %esi, -12
 ; X32-SSE1-NEXT: .Lcfi9:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi10:
-; X32-SSE1-NEXT: .cfi_offset %esi, -20
-; X32-SSE1-NEXT: .Lcfi11:
-; X32-SSE1-NEXT: .cfi_offset %edi, -16
-; X32-SSE1-NEXT: .Lcfi12:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -12
-; X32-SSE1-NEXT: .Lcfi13:
-; X32-SSE1-NEXT: .cfi_offset %ebp, -8
+; X32-SSE1-NEXT: .cfi_offset %edi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 4(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %esi
-; X32-SSE1-NEXT: movzwl 10(%ecx), %edi
-; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx
-; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp
+; X32-SSE1-NEXT: movl 4(%ecx), %edx
+; X32-SSE1-NEXT: movl 10(%ecx), %esi
+; X32-SSE1-NEXT: movzwl 14(%ecx), %edi
 ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx
-; X32-SSE1-NEXT: movw %bp, 10(%eax)
-; X32-SSE1-NEXT: movw %bx, 8(%eax)
+; X32-SSE1-NEXT: movw %di, 10(%eax)
 ; X32-SSE1-NEXT: movw %cx, 14(%eax)
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
-; X32-SSE1-NEXT: movw %di, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
+; X32-SSE1-NEXT: movl %esi, 6(%eax)
 ; X32-SSE1-NEXT: popl %esi
 ; X32-SSE1-NEXT: popl %edi
-; X32-SSE1-NEXT: popl %ebx
-; X32-SSE1-NEXT: popl %ebp
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9:
@@ -607,10 +591,8 @@
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 6(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx
-; X32-SSE1-NEXT: movw %cx, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
+; X32-SSE1-NEXT: movl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movl %ecx, (%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu:
@@ -640,24 +622,14 @@
 ;
 ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi14:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: .Lcfi15:
-; X32-SSE1-NEXT: .cfi_offset %esi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movzwl 8(%ecx), %edx
-; X32-SSE1-NEXT: movzwl 10(%ecx), %esi
+; X32-SSE1-NEXT: movl 8(%ecx), %edx
 ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx
-; X32-SSE1-NEXT: movw %si, 2(%eax)
-; X32-SSE1-NEXT: movw %dx, (%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movw %cx, 6(%eax)
-; X32-SSE1-NEXT: movw $0, 14(%eax)
-; X32-SSE1-NEXT: movw $0, 12(%eax)
-; X32-SSE1-NEXT: movw $0, 10(%eax)
-; X32-SSE1-NEXT: movw $0, 8(%eax)
-; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: movl $0, 12(%eax)
+; X32-SSE1-NEXT: movl $0, 8(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz:
@@ -694,64 +666,44 @@
 ;
 ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF:
 ; X32-SSE1: # BB#0:
+; X32-SSE1-NEXT: pushl %ebp
+; X32-SSE1-NEXT: .Lcfi10:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT: pushl %ebx
+; X32-SSE1-NEXT: .Lcfi11:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
+; X32-SSE1-NEXT: pushl %edi
+; X32-SSE1-NEXT: .Lcfi12:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 16
+; X32-SSE1-NEXT: pushl %esi
+; X32-SSE1-NEXT: .Lcfi13:
+; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
+; X32-SSE1-NEXT: .Lcfi14:
+; X32-SSE1-NEXT: .cfi_offset %esi, -20
+; X32-SSE1-NEXT: .Lcfi15:
+; X32-SSE1-NEXT: .cfi_offset %edi, -16
 ; X32-SSE1-NEXT: .Lcfi16:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: subl $12, %esp
+; X32-SSE1-NEXT: .cfi_offset %ebx, -12
 ; X32-SSE1-NEXT: .Lcfi17:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 20
-; X32-SSE1-NEXT: .Lcfi18:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
+; X32-SSE1-NEXT: .cfi_offset %ebp, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 3(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 4(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 5(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 7(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 8(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 9(%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 10(%ecx), %bh
-; X32-SSE1-NEXT: movb 11(%ecx), %bl
-; X32-SSE1-NEXT: movb 12(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %ebp
+; X32-SSE1-NEXT: movl 3(%ecx), %esi
+; X32-SSE1-NEXT: movl 7(%ecx), %edi
+; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx
 ; X32-SSE1-NEXT: movb 13(%ecx), %dl
 ; X32-SSE1-NEXT: movb 15(%ecx), %cl
 ; X32-SSE1-NEXT: movb %dl, 13(%eax)
-; X32-SSE1-NEXT: movb %dh, 12(%eax)
 ; X32-SSE1-NEXT: movb %cl, 15(%eax)
-; X32-SSE1-NEXT: movb %bl, 11(%eax)
-; X32-SSE1-NEXT: movb %bh, 10(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 9(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 8(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 6(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 5(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 4(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, 3(%eax)
-; X32-SSE1-NEXT: addl $12, %esp
+; X32-SSE1-NEXT: movw %bx, 11(%eax)
+; X32-SSE1-NEXT: movl %edi, 7(%eax)
+; X32-SSE1-NEXT: movw %bp, (%eax)
+; X32-SSE1-NEXT: movl %esi, 3(%eax)
+; X32-SSE1-NEXT: popl %esi
+; X32-SSE1-NEXT: popl %edi
 ; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: popl %ebp
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF:
@@ -819,17 +771,13 @@
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
+; X32-SSE1-NEXT: movzwl (%ecx), %edx
 ; X32-SSE1-NEXT: movb 3(%ecx), %cl
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb %dl, (%eax)
+; X32-SSE1-NEXT: movw %dx, (%eax)
 ; X32-SSE1-NEXT: movb %cl, 3(%eax)
 ; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: movb $0, 7(%eax)
-; X32-SSE1-NEXT: movb $0, 6(%eax)
+; X32-SSE1-NEXT: movw $0, 13(%eax)
+; X32-SSE1-NEXT: movw $0, 6(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz:
@@ -867,35 +815,14 @@
 ;
 ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
 ; X32-SSE1: # BB#0:
-; X32-SSE1-NEXT: pushl %ebx
-; X32-SSE1-NEXT: .Lcfi19:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
-; X32-SSE1-NEXT: pushl %eax
-; X32-SSE1-NEXT: .Lcfi20:
-; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi21:
-; X32-SSE1-NEXT: .cfi_offset %ebx, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X32-SSE1-NEXT: movb (%ecx), %dl
-; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill
-; X32-SSE1-NEXT: movb 1(%ecx), %dh
-; X32-SSE1-NEXT: movb 2(%ecx), %bl
-; X32-SSE1-NEXT: movb 3(%ecx), %bh
-; X32-SSE1-NEXT: movb 6(%ecx), %dl
-; X32-SSE1-NEXT: movb 7(%ecx), %cl
-; X32-SSE1-NEXT: movb %cl, 7(%eax)
-; X32-SSE1-NEXT: movb %dl, 6(%eax)
-; X32-SSE1-NEXT: movb %bh, 3(%eax)
-; X32-SSE1-NEXT: movb %bl, 2(%eax)
-; X32-SSE1-NEXT: movb %dh, 1(%eax)
-; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload
-; X32-SSE1-NEXT: movb %cl, (%eax)
+; X32-SSE1-NEXT: movl (%ecx), %edx
+; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx
+; X32-SSE1-NEXT: movw %cx, 6(%eax)
+; X32-SSE1-NEXT: movl %edx, (%eax)
 ; X32-SSE1-NEXT: movb $0, 15(%eax)
-; X32-SSE1-NEXT: movb $0, 14(%eax)
-; X32-SSE1-NEXT: movb $0, 13(%eax)
-; X32-SSE1-NEXT: addl $4, %esp
-; X32-SSE1-NEXT: popl %ebx
+; X32-SSE1-NEXT: movw $0, 13(%eax)
 ; X32-SSE1-NEXT: retl $4
 ;
 ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz:
@@ -990,14 +917,14 @@
 ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile:
 ; X32-SSE1: # BB#0:
 ; X32-SSE1-NEXT: pushl %edi
-; X32-SSE1-NEXT: .Lcfi22:
+; X32-SSE1-NEXT: .Lcfi18:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8
 ; X32-SSE1-NEXT: pushl %esi
-; X32-SSE1-NEXT: .Lcfi23:
+; X32-SSE1-NEXT: .Lcfi19:
 ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12
-; X32-SSE1-NEXT: .Lcfi24:
+; X32-SSE1-NEXT: .Lcfi20:
 ; X32-SSE1-NEXT: .cfi_offset %esi, -12
-; X32-SSE1-NEXT: .Lcfi25:
+; X32-SSE1-NEXT: .Lcfi21:
 ; X32-SSE1-NEXT: .cfi_offset %edi, -8
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
Index: test/CodeGen/X86/merge-store-partially-alias-loads.ll
===================================================================
--- test/CodeGen/X86/merge-store-partially-alias-loads.ll
+++ test/CodeGen/X86/merge-store-partially-alias-loads.ll
@@ -13,7 +13,7 @@
 ; X86-NEXT: movb [[HI1]], 3([[BASEREG]])
 ; X86-NEXT: retq
 
-; DBGDAG-LABEL: Optimized lowered selection DAG: BB#0 'merge_store_partial_overlap_load:'
+; DBGDAG-LABEL: Optimized legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
 ; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken
 ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]],
 ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add [[BASEPTR]], Constant:i64<2>
@@ -27,7 +27,7 @@
 
 ; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}},
 
-; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:'
+; DBGDAG-LABEL: Instruction selection begins
 define void @merge_store_partial_overlap_load([4 x i8]* %tmp) {
   %tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0
   %tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 1
Index: test/CodeGen/X86/no-sse2-avg.ll
===================================================================
--- test/CodeGen/X86/no-sse2-avg.ll
+++ test/CodeGen/X86/no-sse2-avg.ll
@@ -5,22 +5,8 @@
 define <16 x i8> @PR27973() {
 ; CHECK-LABEL: PR27973:
 ; CHECK: # BB#0:
-; CHECK-NEXT: movb $0, 15(%rdi)
-; CHECK-NEXT: movb $0, 14(%rdi)
-; CHECK-NEXT: movb $0, 13(%rdi)
-; CHECK-NEXT: movb $0, 12(%rdi)
-; CHECK-NEXT: movb $0, 11(%rdi)
-; CHECK-NEXT: movb $0, 10(%rdi)
-; CHECK-NEXT: movb $0, 9(%rdi)
-; CHECK-NEXT: movb $0, 8(%rdi)
-; CHECK-NEXT: movb $0, 7(%rdi)
-; CHECK-NEXT: movb $0, 6(%rdi)
-; CHECK-NEXT: movb $0, 5(%rdi)
-; CHECK-NEXT: movb $0, 4(%rdi)
-; CHECK-NEXT: movb $0, 3(%rdi)
-; CHECK-NEXT: movb $0, 2(%rdi)
-; CHECK-NEXT: movb $0, 1(%rdi)
-; CHECK-NEXT: movb $0, (%rdi)
+; CHECK-NEXT: movq $0, 8(%rdi)
+; CHECK-NEXT: movq $0, (%rdi)
 ; CHECK-NEXT: movq %rdi, %rax
 ; CHECK-NEXT: retq
   %t0 = zext <16 x i8> zeroinitializer to <16 x i32>
Index: test/CodeGen/X86/stdarg.ll
===================================================================
--- test/CodeGen/X86/stdarg.ll
+++ test/CodeGen/X86/stdarg.ll
@@ -14,8 +14,7 @@
 ;
 ; CHECK-DAG: movq {{.*}}, 192(%rsp)
 ; CHECK-DAG: movq {{.*}}, 184(%rsp)
-; CHECK-DAG: movl {{.*}}, 180(%rsp)
-; CHECK-DAG: movl {{.*}}, 176(%rsp)
+; CHECK-DAG: movq {{.*}}, 176(%rsp)
   %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1]
   call void @bar(%struct.__va_list_tag* %ap3) nounwind
   call void @llvm.va_end(i8* %ap12)
Index: test/CodeGen/X86/stores-merging.ll
===================================================================
--- test/CodeGen/X86/stores-merging.ll
+++ test/CodeGen/X86/stores-merging.ll
@@ -13,9 +13,8 @@
 ;; the same result in memory in the end.
 
 ; CHECK-LABEL: redundant_stores_merging:
-; CHECK: movabsq $528280977409, %rax
-; CHECK: movq %rax, e+4(%rip)
-; CHECK: movl $456, e+8(%rip)
+; CHECK: movabsq $1958505086977, %rax
+; CHECK: movq %rax, e+4(%rip)
 define void @redundant_stores_merging() {
 entry:
   store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4
@@ -26,9 +25,8 @@
 ;; This variant tests PR25154.
 
 ; CHECK-LABEL: redundant_stores_merging_reverse:
-; CHECK: movabsq $528280977409, %rax
-; CHECK: movq %rax, e+4(%rip)
-; CHECK: movl $456, e+8(%rip)
+; CHECK: movabsq $1958505086977, %rax
+; CHECK: movq %rax, e+4(%rip)
 define void @redundant_stores_merging_reverse() {
 entry:
   store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4
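
The new movabsq immediates above can be checked by hand: each function ends
up storing i32 1 to e+4 and i32 456 to e+8 (overwriting an intermediate
store of 123, as the old CHECK constant indicates), and the two surviving
stores now merge into a single little-endian i64 at e+4 with 1 in the low
half and 456 in the high half. A compile-time check of that arithmetic, for
illustration only (not part of the patch):

  #include <cstdint>

  // New immediate: i32 456 at e+8 above i32 1 at e+4, little-endian.
  static_assert((uint64_t{456} << 32 | 1) == 1958505086977ULL,
                "matches the new movabsq constant");
  // The old immediate merged the intermediate store of 123 instead,
  // leaving a separate movl $456, e+8(%rip).
  static_assert((uint64_t{123} << 32 | 1) == 528280977409ULL,
                "matches the previous CHECK constant");
  // Same arithmetic in bigstructret.ll: movb $0 at offset 0 and movb $1
  // at offset 1 merge into a 16-bit store of 0x0100, i.e. movw $256.
  static_assert((1 << 8 | 0) == 256, "matches movw $256");
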