Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -2691,6 +2691,9 @@ bool foldBooleans, DAGCombinerInfo &DCI, const SDLoc &dl) const; + // For targets which wrap address, unwrap for analysis. + virtual SDValue unwrapAddress(SDValue N) const { return N; } + /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. virtual bool Index: lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAGAddressAnalysis.cpp @@ -14,6 +14,7 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGNodes.h" +#include "llvm/Target/TargetLowering.h" namespace llvm { @@ -49,8 +50,10 @@ /// Parses tree in Ptr for base, index, offset addresses. BaseIndexOffset BaseIndexOffset::match(SDValue Ptr, const SelectionDAG &DAG) { + const TargetLowering &TLI = DAG.getTargetLoweringInfo(); + // (((B + I*M) + c)) + c ... - SDValue Base = Ptr; + SDValue Base = TLI.unwrapAddress(Ptr); SDValue Index = SDValue(); int64_t Offset = 0; bool IsIndexSignExt = false; Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -786,6 +786,8 @@ /// This method returns the name of a target specific DAG node. const char *getTargetNodeName(unsigned Opcode) const override; + bool mergeStoresAfterLegalization() const override { return true; } + bool isCheapToSpeculateCttz() const override; bool isCheapToSpeculateCtlz() const override; @@ -841,6 +843,8 @@ const SelectionDAG &DAG, unsigned Depth) const override; + SDValue unwrapAddress(SDValue N) const override; + bool isGAPlusOffset(SDNode *N, const GlobalValue* &GA, int64_t &Offset) const override; Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -13968,11 +13968,8 @@ // result has a single use which is a store or a bitcast to i32. And in // the case of a store, it's not worth it if the index is a constant 0, // because a MOVSSmr can be used instead, which is smaller and faster. - if (!Op.hasOneUse()) - return SDValue(); SDNode *User = *Op.getNode()->use_begin(); - if ((User->getOpcode() != ISD::STORE || - isNullConstant(Op.getOperand(1))) && + if ((User->getOpcode() != ISD::STORE) && (User->getOpcode() != ISD::BITCAST || User->getValueType(0) != MVT::i32)) return SDValue(); @@ -26865,6 +26862,12 @@ return 1; } +SDValue X86TargetLowering::unwrapAddress(SDValue N) const { + if (N->getOpcode() == X86ISD::Wrapper || N->getOpcode() == X86ISD::WrapperRIP) + return N->getOperand(0); + return N; +} + /// Returns true (and the GlobalValue and the offset) if the node is a /// GlobalAddress + offset. bool X86TargetLowering::isGAPlusOffset(SDNode *N, Index: test/CodeGen/X86/2011-10-19-widen_vselect.ll =================================================================== --- test/CodeGen/X86/2011-10-19-widen_vselect.ll +++ test/CodeGen/X86/2011-10-19-widen_vselect.ll @@ -9,7 +9,7 @@ ; X32-LABEL: simple_widen: ; X32: # BB#0: # %entry ; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: extractps $0, %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: simple_widen: @@ -29,7 +29,7 @@ ; X32-NEXT: cmpordps %xmm0, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm1 ; X32-NEXT: extractps $1, %xmm1, (%eax) -; X32-NEXT: movss %xmm1, (%eax) +; X32-NEXT: extractps $0, %xmm1, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: complex_inreg_work: @@ -84,7 +84,7 @@ ; X32-NEXT: movaps %xmm1, %xmm0 ; X32-NEXT: blendvps %xmm0, %xmm2, %xmm4 ; X32-NEXT: extractps $1, %xmm4, {{[0-9]+}}(%esp) -; X32-NEXT: movss %xmm4, {{[0-9]+}}(%esp) +; X32-NEXT: extractps $0, %xmm4, {{[0-9]+}}(%esp) ; X32-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero ; X32-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) ; X32-NEXT: addl $60, %esp Index: test/CodeGen/X86/MergeConsecutiveStores.ll =================================================================== --- test/CodeGen/X86/MergeConsecutiveStores.ll +++ test/CodeGen/X86/MergeConsecutiveStores.ll @@ -558,8 +558,7 @@ } ; This is a minimized test based on real code that was failing. -; We could merge stores (and loads) like this... - +; This should now be merged. define void @merge_vec_element_and_scalar_load([6 x i64]* %array) { %idx0 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 0 %idx1 = getelementptr inbounds [6 x i64], [6 x i64]* %array, i64 0, i64 1 @@ -576,10 +575,8 @@ ret void ; CHECK-LABEL: merge_vec_element_and_scalar_load -; CHECK: movq (%rdi), %rax -; CHECK-NEXT: movq 8(%rdi), %rcx -; CHECK-NEXT: movq %rax, 32(%rdi) -; CHECK-NEXT: movq %rcx, 40(%rdi) +; CHECK: vmovups (%rdi), %xmm0 +; CHECK-NEXT: vmovups %xmm0, 32(%rdi) ; CHECK-NEXT: retq } Index: test/CodeGen/X86/avx1-logical-load-folding.ll =================================================================== --- test/CodeGen/X86/avx1-logical-load-folding.ll +++ test/CodeGen/X86/avx1-logical-load-folding.ll @@ -10,7 +10,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovaps (%ecx), %ymm0 ; X86-NEXT: vandps LCPI0_0, %ymm0, %ymm0 -; X86-NEXT: vmovss %xmm0, (%eax) +; X86-NEXT: vextractps $0, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -18,7 +18,7 @@ ; X64: ## BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 ; X64-NEXT: vandps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovss %xmm0, (%rsi) +; X64-NEXT: vextractps $0, %xmm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp1 = bitcast float* %A to <8 x float>* @@ -39,7 +39,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovaps (%ecx), %ymm0 ; X86-NEXT: vorps LCPI1_0, %ymm0, %ymm0 -; X86-NEXT: vmovss %xmm0, (%eax) +; X86-NEXT: vextractps $0, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -47,7 +47,7 @@ ; X64: ## BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 ; X64-NEXT: vorps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovss %xmm0, (%rsi) +; X64-NEXT: vextractps $0, %xmm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp1 = bitcast float* %A to <8 x float>* @@ -68,7 +68,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovaps (%ecx), %ymm0 ; X86-NEXT: vxorps LCPI2_0, %ymm0, %ymm0 -; X86-NEXT: vmovss %xmm0, (%eax) +; X86-NEXT: vextractps $0, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -76,7 +76,7 @@ ; X64: ## BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 ; X64-NEXT: vxorps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovss %xmm0, (%rsi) +; X64-NEXT: vextractps $0, %xmm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp1 = bitcast float* %A to <8 x float>* @@ -96,7 +96,7 @@ ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: vmovaps (%ecx), %ymm0 ; X86-NEXT: vandnps LCPI3_0, %ymm0, %ymm0 -; X86-NEXT: vmovss %xmm0, (%eax) +; X86-NEXT: vextractps $0, %xmm0, (%eax) ; X86-NEXT: vzeroupper ; X86-NEXT: retl ; @@ -104,7 +104,7 @@ ; X64: ## BB#0: ; X64-NEXT: vmovaps (%rdi), %ymm0 ; X64-NEXT: vandnps {{.*}}(%rip), %ymm0, %ymm0 -; X64-NEXT: vmovss %xmm0, (%rsi) +; X64-NEXT: vextractps $0, %xmm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq %tmp1 = bitcast float* %A to <8 x float>* Index: test/CodeGen/X86/avx512-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512-intrinsics.ll +++ test/CodeGen/X86/avx512-intrinsics.ll @@ -4915,7 +4915,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -4942,7 +4942,7 @@ ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: kmovw %edx, %k1 ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 {%k1} {z} -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 Index: test/CodeGen/X86/bigstructret.ll =================================================================== --- test/CodeGen/X86/bigstructret.ll +++ test/CodeGen/X86/bigstructret.ll @@ -19,10 +19,9 @@ } ; CHECK: ReturnBigStruct2 -; CHECK: movl $48, 4(%ecx) -; CHECK: movb $1, 2(%ecx) -; CHECK: movb $1, 1(%ecx) -; CHECK: movb $0, (%ecx) +; CHECK-DAG: movl $48, 4(%ecx) +; CHECK-DAG: movb $1, 2(%ecx) +; CHECK-DAG: movw $256, (%ecx) define fastcc %1 @ReturnBigStruct2() nounwind readnone { entry: Index: test/CodeGen/X86/bitcast-i256.ll =================================================================== --- test/CodeGen/X86/bitcast-i256.ll +++ test/CodeGen/X86/bitcast-i256.ll @@ -5,7 +5,6 @@ ret i256 %r ; CHECK: foo ; CHECK: vextractf128 -; CHECK: vpextrq -; CHECK: vpextrq +; CHECK: vmovups ; CHECK: ret } Index: test/CodeGen/X86/constant-combines.ll =================================================================== --- test/CodeGen/X86/constant-combines.ll +++ test/CodeGen/X86/constant-combines.ll @@ -15,12 +15,11 @@ ; ; CHECK-LABEL: PR22524: ; CHECK: # BB#0: # %entry -; CHECK-NEXT: movl $0, 4(%rdi) ; CHECK-NEXT: xorl %eax, %eax ; CHECK-NEXT: movd %eax, %xmm0 ; CHECK-NEXT: xorps %xmm1, %xmm1 ; CHECK-NEXT: mulss %xmm0, %xmm1 -; CHECK-NEXT: movl $0, (%rdi) +; CHECK-NEXT: movq $0, (%rdi) ; CHECK-NEXT: movss %xmm1, 4(%rdi) ; CHECK-NEXT: retq entry: Index: test/CodeGen/X86/extract-store.ll =================================================================== --- test/CodeGen/X86/extract-store.ll +++ test/CodeGen/X86/extract-store.ll @@ -45,6 +45,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrb $0, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i8_0: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movd %xmm0, %eax +; SSE-F128-NEXT: movb %al, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <16 x i8> %foo, i32 0 store i8 %vecext, i8* %dst, align 1 ret void @@ -87,6 +93,13 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrb $3, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i8_3: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movd %xmm0, %eax +; SSE-F128-NEXT: shrl $24, %eax +; SSE-F128-NEXT: movb %al, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <16 x i8> %foo, i32 3 store i8 %vecext, i8* %dst, align 1 ret void @@ -127,6 +140,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrb $15, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i8_15: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: pextrw $7, %xmm0, %eax +; SSE-F128-NEXT: movb %ah, (%rdi) # NOREX +; SSE-F128-NEXT: retq %vecext = extractelement <16 x i8> %foo, i32 15 store i8 %vecext, i8* %dst, align 1 ret void @@ -167,6 +186,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrw $0, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i16_0: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movd %xmm0, %eax +; SSE-F128-NEXT: movw %ax, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <8 x i16> %foo, i32 0 store i16 %vecext, i16* %dst, align 1 ret void @@ -207,6 +232,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrw $7, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i16_7: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: pextrw $7, %xmm0, %eax +; SSE-F128-NEXT: movw %ax, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <8 x i16> %foo, i32 7 store i16 %vecext, i16* %dst, align 1 ret void @@ -274,6 +305,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrd $3, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i32_3: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-F128-NEXT: movd %xmm0, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <4 x i32> %foo, i32 3 store i32 %vecext, i32* %dst, align 1 ret void @@ -336,33 +373,55 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vpextrq $1, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_i64_1: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,0,1] +; SSE-F128-NEXT: movq %xmm0, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <2 x i64> %foo, i32 1 store i64 %vecext, i64* %dst, align 1 ret void } define void @extract_f32_0(float* nocapture %dst, <4 x float> %foo) nounwind { -; SSE-X32-LABEL: extract_f32_0: -; SSE-X32: # BB#0: -; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; SSE-X32-NEXT: movss %xmm0, (%eax) -; SSE-X32-NEXT: retl +; SSE2-X32-LABEL: extract_f32_0: +; SSE2-X32: # BB#0: +; SSE2-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE2-X32-NEXT: movss %xmm0, (%eax) +; SSE2-X32-NEXT: retl ; -; SSE-X64-LABEL: extract_f32_0: -; SSE-X64: # BB#0: -; SSE-X64-NEXT: movss %xmm0, (%rdi) -; SSE-X64-NEXT: retq +; SSE2-X64-LABEL: extract_f32_0: +; SSE2-X64: # BB#0: +; SSE2-X64-NEXT: movss %xmm0, (%rdi) +; SSE2-X64-NEXT: retq +; +; SSE41-X32-LABEL: extract_f32_0: +; SSE41-X32: # BB#0: +; SSE41-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE41-X32-NEXT: extractps $0, %xmm0, (%eax) +; SSE41-X32-NEXT: retl +; +; SSE41-X64-LABEL: extract_f32_0: +; SSE41-X64: # BB#0: +; SSE41-X64-NEXT: extractps $0, %xmm0, (%rdi) +; SSE41-X64-NEXT: retq ; ; AVX-X32-LABEL: extract_f32_0: ; AVX-X32: # BB#0: ; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; AVX-X32-NEXT: vmovss %xmm0, (%eax) +; AVX-X32-NEXT: vextractps $0, %xmm0, (%eax) ; AVX-X32-NEXT: retl ; ; AVX-X64-LABEL: extract_f32_0: ; AVX-X64: # BB#0: -; AVX-X64-NEXT: vmovss %xmm0, (%rdi) +; AVX-X64-NEXT: vextractps $0, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_f32_0: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: movss %xmm0, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <4 x float> %foo, i32 0 store float %vecext, float* %dst, align 1 ret void @@ -403,6 +462,12 @@ ; AVX-X64: # BB#0: ; AVX-X64-NEXT: vextractps $3, %xmm0, (%rdi) ; AVX-X64-NEXT: retq +; +; SSE-F128-LABEL: extract_f32_3: +; SSE-F128: # BB#0: +; SSE-F128-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,1,2,3] +; SSE-F128-NEXT: movss %xmm0, (%rdi) +; SSE-F128-NEXT: retq %vecext = extractelement <4 x float> %foo, i32 3 store float %vecext, float* %dst, align 1 ret void @@ -463,9 +528,51 @@ } define void @extract_f128_0(fp128* nocapture %dst, <2 x fp128> %foo) nounwind { +; SSE-X32-LABEL: extract_f128_0: +; SSE-X32: # BB#0: +; SSE-X32-NEXT: pushl %edi +; SSE-X32-NEXT: pushl %esi +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; SSE-X32-NEXT: movl %esi, 12(%edi) +; SSE-X32-NEXT: movl %edx, 8(%edi) +; SSE-X32-NEXT: movl %ecx, 4(%edi) +; SSE-X32-NEXT: movl %eax, (%edi) +; SSE-X32-NEXT: popl %esi +; SSE-X32-NEXT: popl %edi +; SSE-X32-NEXT: retl +; +; SSE2-X64-LABEL: extract_f128_0: +; SSE2-X64: # BB#0: +; SSE2-X64-NEXT: movq %rdx, 8(%rdi) +; SSE2-X64-NEXT: movq %rsi, (%rdi) +; SSE2-X64-NEXT: retq +; +; SSE41-X64-LABEL: extract_f128_0: +; SSE41-X64: # BB#0: +; SSE41-X64-NEXT: movq %rdx, 8(%rdi) +; SSE41-X64-NEXT: movq %rsi, (%rdi) +; SSE41-X64-NEXT: retq +; +; AVX-X32-LABEL: extract_f128_0: +; AVX-X32: # BB#0: +; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-X32-NEXT: vmovups %xmm0, (%eax) +; AVX-X32-NEXT: retl +; +; AVX-X64-LABEL: extract_f128_0: +; AVX-X64: # BB#0: +; AVX-X64-NEXT: movq %rdx, 8(%rdi) +; AVX-X64-NEXT: movq %rsi, (%rdi) +; AVX-X64-NEXT: retq +; ; SSE-F128-LABEL: extract_f128_0: ; SSE-F128: # BB#0: -; SSE-F128-NEXT: movaps %xmm0, (%rdi) +; SSE-F128-NEXT: movaps %xmm0, (%rdi) ; SSE-F128-NEXT: retq %vecext = extractelement <2 x fp128> %foo, i32 0 store fp128 %vecext, fp128* %dst, align 1 @@ -473,9 +580,51 @@ } define void @extract_f128_1(fp128* nocapture %dst, <2 x fp128> %foo) nounwind { +; SSE-X32-LABEL: extract_f128_1: +; SSE-X32: # BB#0: +; SSE-X32-NEXT: pushl %edi +; SSE-X32-NEXT: pushl %esi +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %ecx +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edx +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %esi +; SSE-X32-NEXT: movl {{[0-9]+}}(%esp), %edi +; SSE-X32-NEXT: movl %esi, 12(%edi) +; SSE-X32-NEXT: movl %edx, 8(%edi) +; SSE-X32-NEXT: movl %ecx, 4(%edi) +; SSE-X32-NEXT: movl %eax, (%edi) +; SSE-X32-NEXT: popl %esi +; SSE-X32-NEXT: popl %edi +; SSE-X32-NEXT: retl +; +; SSE2-X64-LABEL: extract_f128_1: +; SSE2-X64: # BB#0: +; SSE2-X64-NEXT: movq %r8, 8(%rdi) +; SSE2-X64-NEXT: movq %rcx, (%rdi) +; SSE2-X64-NEXT: retq +; +; SSE41-X64-LABEL: extract_f128_1: +; SSE41-X64: # BB#0: +; SSE41-X64-NEXT: movq %r8, 8(%rdi) +; SSE41-X64-NEXT: movq %rcx, (%rdi) +; SSE41-X64-NEXT: retq +; +; AVX-X32-LABEL: extract_f128_1: +; AVX-X32: # BB#0: +; AVX-X32-NEXT: vmovups {{[0-9]+}}(%esp), %xmm0 +; AVX-X32-NEXT: movl {{[0-9]+}}(%esp), %eax +; AVX-X32-NEXT: vmovups %xmm0, (%eax) +; AVX-X32-NEXT: retl +; +; AVX-X64-LABEL: extract_f128_1: +; AVX-X64: # BB#0: +; AVX-X64-NEXT: movq %r8, 8(%rdi) +; AVX-X64-NEXT: movq %rcx, (%rdi) +; AVX-X64-NEXT: retq +; ; SSE-F128-LABEL: extract_f128_1: ; SSE-F128: # BB#0: -; SSE-F128-NEXT: movaps %xmm1, (%rdi) +; SSE-F128-NEXT: movaps %xmm1, (%rdi) ; SSE-F128-NEXT: retq %vecext = extractelement <2 x fp128> %foo, i32 1 store fp128 %vecext, fp128* %dst, align 1 Index: test/CodeGen/X86/fma-scalar-memfold.ll =================================================================== --- test/CodeGen/X86/fma-scalar-memfold.ll +++ test/CodeGen/X86/fma-scalar-memfold.ll @@ -20,14 +20,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfmadd213ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fmadd_aab_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfmaddss (%rsi), %xmm0, %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -53,14 +53,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfmadd132ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fmadd_aba_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfmaddss %xmm0, (%rsi), %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -86,14 +86,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfmsub213ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fmsub_aab_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfmsubss (%rsi), %xmm0, %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -119,14 +119,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfmsub132ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fmsub_aba_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfmsubss %xmm0, (%rsi), %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -152,14 +152,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd213ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fnmadd_aab_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfnmaddss (%rsi), %xmm0, %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -185,14 +185,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmadd132ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fnmadd_aba_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfnmaddss %xmm0, (%rsi), %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -218,14 +218,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmsub213ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fnmsub_aab_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfnmsubss (%rsi), %xmm0, %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 @@ -251,14 +251,14 @@ ; CHECK: # BB#0: ; CHECK-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: vfnmsub132ss (%rsi), %xmm0, %xmm0 -; CHECK-NEXT: vmovss %xmm0, (%rdi) +; CHECK-NEXT: vextractps $0, %xmm0, (%rdi) ; CHECK-NEXT: retq ; ; FMA4-LABEL: fnmsub_aba_ss: ; FMA4: # BB#0: ; FMA4-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; FMA4-NEXT: vfnmsubss %xmm0, (%rsi), %xmm0, %xmm0 -; FMA4-NEXT: vmovss %xmm0, (%rdi) +; FMA4-NEXT: vextractps $0, %xmm0, (%rdi) ; FMA4-NEXT: retq %a.val = load float, float* %a %av0 = insertelement <4 x float> undef, float %a.val, i32 0 Index: test/CodeGen/X86/fold-vector-sext-crash2.ll =================================================================== --- test/CodeGen/X86/fold-vector-sext-crash2.ll +++ test/CodeGen/X86/fold-vector-sext-crash2.ll @@ -53,8 +53,10 @@ ret <2 x i256> %Shuff ; X64-LABEL: test_zext1 - ; X64: movq $0 - ; X64-NEXT: movq $0 + ; X64: xorps %xmm0, %xmm0 + ; X64: movaps %xmm0 + ; X64: movaps %xmm0 + ; X64: movaps %xmm0 ; X64-NEXT: movq $0 ; X64-NEXT: movq $254 @@ -75,8 +77,10 @@ ret <2 x i256> %Shuff ; X64-LABEL: test_zext2 - ; X64: movq $0 - ; X64-NEXT: movq $0 + ; X64: xorps %xmm0, %xmm0 + ; X64-NEXT: movaps %xmm0 + ; X64-NEXT: movaps %xmm0 + ; X64-NEXT: movaps %xmm0 ; X64-NEXT: movq $-1 ; X64-NEXT: movq $-2 Index: test/CodeGen/X86/legalize-shl-vec.ll =================================================================== --- test/CodeGen/X86/legalize-shl-vec.ll +++ test/CodeGen/X86/legalize-shl-vec.ll @@ -26,14 +26,11 @@ ; ; X64-LABEL: test_shl: ; X64: # BB#0: -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) -; X64-NEXT: movq $0, 40(%rdi) -; X64-NEXT: movq $0, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 32(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 @@ -65,14 +62,11 @@ ; ; X64-LABEL: test_srl: ; X64: # BB#0: -; X64-NEXT: movq $0, 56(%rdi) -; X64-NEXT: movq $0, 48(%rdi) -; X64-NEXT: movq $0, 40(%rdi) -; X64-NEXT: movq $0, 32(%rdi) -; X64-NEXT: movq $0, 24(%rdi) -; X64-NEXT: movq $0, 16(%rdi) -; X64-NEXT: movq $0, 8(%rdi) -; X64-NEXT: movq $0, (%rdi) +; X64-NEXT: xorps %xmm0, %xmm0 +; X64-NEXT: movaps %xmm0, 48(%rdi) +; X64-NEXT: movaps %xmm0, 32(%rdi) +; X64-NEXT: movaps %xmm0, 16(%rdi) +; X64-NEXT: movaps %xmm0, (%rdi) ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: retq %Amt = insertelement <2 x i256> undef, i256 -1, i32 0 Index: test/CodeGen/X86/merge-consecutive-loads-128.ll =================================================================== --- test/CodeGen/X86/merge-consecutive-loads-128.ll +++ test/CodeGen/X86/merge-consecutive-loads-128.ll @@ -526,44 +526,28 @@ ; ; X32-SSE1-LABEL: merge_8i16_i16_23u567u9: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %ebp +; X32-SSE1-NEXT: pushl %edi ; X32-SSE1-NEXT: .Lcfi6: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: pushl %ebx +; X32-SSE1-NEXT: pushl %esi ; X32-SSE1-NEXT: .Lcfi7: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: pushl %edi ; X32-SSE1-NEXT: .Lcfi8: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 16 -; X32-SSE1-NEXT: pushl %esi +; X32-SSE1-NEXT: .cfi_offset %esi, -12 ; X32-SSE1-NEXT: .Lcfi9: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 -; X32-SSE1-NEXT: .Lcfi10: -; X32-SSE1-NEXT: .cfi_offset %esi, -20 -; X32-SSE1-NEXT: .Lcfi11: -; X32-SSE1-NEXT: .cfi_offset %edi, -16 -; X32-SSE1-NEXT: .Lcfi12: -; X32-SSE1-NEXT: .cfi_offset %ebx, -12 -; X32-SSE1-NEXT: .Lcfi13: -; X32-SSE1-NEXT: .cfi_offset %ebp, -8 +; X32-SSE1-NEXT: .cfi_offset %edi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 4(%ecx), %edx -; X32-SSE1-NEXT: movzwl 6(%ecx), %esi -; X32-SSE1-NEXT: movzwl 10(%ecx), %edi -; X32-SSE1-NEXT: movzwl 12(%ecx), %ebx -; X32-SSE1-NEXT: movzwl 14(%ecx), %ebp +; X32-SSE1-NEXT: movl 4(%ecx), %edx +; X32-SSE1-NEXT: movl 10(%ecx), %esi +; X32-SSE1-NEXT: movzwl 14(%ecx), %edi ; X32-SSE1-NEXT: movzwl 18(%ecx), %ecx -; X32-SSE1-NEXT: movw %bp, 10(%eax) -; X32-SSE1-NEXT: movw %bx, 8(%eax) +; X32-SSE1-NEXT: movw %di, 10(%eax) ; X32-SSE1-NEXT: movw %cx, 14(%eax) -; X32-SSE1-NEXT: movw %si, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) -; X32-SSE1-NEXT: movw %di, 6(%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) +; X32-SSE1-NEXT: movl %esi, 6(%eax) ; X32-SSE1-NEXT: popl %esi ; X32-SSE1-NEXT: popl %edi -; X32-SSE1-NEXT: popl %ebx -; X32-SSE1-NEXT: popl %ebp ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_23u567u9: @@ -607,10 +591,8 @@ ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 6(%ecx), %edx -; X32-SSE1-NEXT: movzwl 8(%ecx), %ecx -; X32-SSE1-NEXT: movw %cx, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) +; X32-SSE1-NEXT: movl 6(%ecx), %ecx +; X32-SSE1-NEXT: movl %ecx, (%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_34uuuuuu: @@ -640,24 +622,14 @@ ; ; X32-SSE1-LABEL: merge_8i16_i16_45u7zzzz: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %esi -; X32-SSE1-NEXT: .Lcfi14: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: .Lcfi15: -; X32-SSE1-NEXT: .cfi_offset %esi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movzwl 8(%ecx), %edx -; X32-SSE1-NEXT: movzwl 10(%ecx), %esi +; X32-SSE1-NEXT: movl 8(%ecx), %edx ; X32-SSE1-NEXT: movzwl 14(%ecx), %ecx -; X32-SSE1-NEXT: movw %si, 2(%eax) -; X32-SSE1-NEXT: movw %dx, (%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movw %cx, 6(%eax) -; X32-SSE1-NEXT: movw $0, 14(%eax) -; X32-SSE1-NEXT: movw $0, 12(%eax) -; X32-SSE1-NEXT: movw $0, 10(%eax) -; X32-SSE1-NEXT: movw $0, 8(%eax) -; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: movl $0, 12(%eax) +; X32-SSE1-NEXT: movl $0, 8(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_8i16_i16_45u7zzzz: @@ -694,64 +666,44 @@ ; ; X32-SSE1-LABEL: merge_16i8_i8_01u3456789ABCDuF: ; X32-SSE1: # BB#0: +; X32-SSE1-NEXT: pushl %ebp +; X32-SSE1-NEXT: .Lcfi10: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: pushl %ebx +; X32-SSE1-NEXT: .Lcfi11: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 +; X32-SSE1-NEXT: pushl %edi +; X32-SSE1-NEXT: .Lcfi12: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 16 +; X32-SSE1-NEXT: pushl %esi +; X32-SSE1-NEXT: .Lcfi13: +; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 +; X32-SSE1-NEXT: .Lcfi14: +; X32-SSE1-NEXT: .cfi_offset %esi, -20 +; X32-SSE1-NEXT: .Lcfi15: +; X32-SSE1-NEXT: .cfi_offset %edi, -16 ; X32-SSE1-NEXT: .Lcfi16: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: subl $12, %esp +; X32-SSE1-NEXT: .cfi_offset %ebx, -12 ; X32-SSE1-NEXT: .Lcfi17: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 20 -; X32-SSE1-NEXT: .Lcfi18: -; X32-SSE1-NEXT: .cfi_offset %ebx, -8 +; X32-SSE1-NEXT: .cfi_offset %ebp, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 1(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 3(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 4(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 5(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 6(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 7(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 8(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 9(%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 10(%ecx), %bh -; X32-SSE1-NEXT: movb 11(%ecx), %bl -; X32-SSE1-NEXT: movb 12(%ecx), %dh +; X32-SSE1-NEXT: movzwl (%ecx), %ebp +; X32-SSE1-NEXT: movl 3(%ecx), %esi +; X32-SSE1-NEXT: movl 7(%ecx), %edi +; X32-SSE1-NEXT: movzwl 11(%ecx), %ebx ; X32-SSE1-NEXT: movb 13(%ecx), %dl ; X32-SSE1-NEXT: movb 15(%ecx), %cl ; X32-SSE1-NEXT: movb %dl, 13(%eax) -; X32-SSE1-NEXT: movb %dh, 12(%eax) ; X32-SSE1-NEXT: movb %cl, 15(%eax) -; X32-SSE1-NEXT: movb %bl, 11(%eax) -; X32-SSE1-NEXT: movb %bh, 10(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 9(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 8(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 7(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 6(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 5(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 4(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 1(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, (%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, 3(%eax) -; X32-SSE1-NEXT: addl $12, %esp +; X32-SSE1-NEXT: movw %bx, 11(%eax) +; X32-SSE1-NEXT: movl %edi, 7(%eax) +; X32-SSE1-NEXT: movw %bp, (%eax) +; X32-SSE1-NEXT: movl %esi, 3(%eax) +; X32-SSE1-NEXT: popl %esi +; X32-SSE1-NEXT: popl %edi ; X32-SSE1-NEXT: popl %ebx +; X32-SSE1-NEXT: popl %ebp ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_01u3456789ABCDuF: @@ -819,17 +771,13 @@ ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb 1(%ecx), %dh +; X32-SSE1-NEXT: movzwl (%ecx), %edx ; X32-SSE1-NEXT: movb 3(%ecx), %cl -; X32-SSE1-NEXT: movb %dh, 1(%eax) -; X32-SSE1-NEXT: movb %dl, (%eax) +; X32-SSE1-NEXT: movw %dx, (%eax) ; X32-SSE1-NEXT: movb %cl, 3(%eax) ; X32-SSE1-NEXT: movb $0, 15(%eax) -; X32-SSE1-NEXT: movb $0, 14(%eax) -; X32-SSE1-NEXT: movb $0, 13(%eax) -; X32-SSE1-NEXT: movb $0, 7(%eax) -; X32-SSE1-NEXT: movb $0, 6(%eax) +; X32-SSE1-NEXT: movw $0, 13(%eax) +; X32-SSE1-NEXT: movw $0, 6(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_01u3uuzzuuuuuzzz: @@ -867,35 +815,14 @@ ; ; X32-SSE1-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: ; X32-SSE1: # BB#0: -; X32-SSE1-NEXT: pushl %ebx -; X32-SSE1-NEXT: .Lcfi19: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 -; X32-SSE1-NEXT: pushl %eax -; X32-SSE1-NEXT: .Lcfi20: -; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: .Lcfi21: -; X32-SSE1-NEXT: .cfi_offset %ebx, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-SSE1-NEXT: movb (%ecx), %dl -; X32-SSE1-NEXT: movb %dl, {{[0-9]+}}(%esp) # 1-byte Spill -; X32-SSE1-NEXT: movb 1(%ecx), %dh -; X32-SSE1-NEXT: movb 2(%ecx), %bl -; X32-SSE1-NEXT: movb 3(%ecx), %bh -; X32-SSE1-NEXT: movb 6(%ecx), %dl -; X32-SSE1-NEXT: movb 7(%ecx), %cl -; X32-SSE1-NEXT: movb %cl, 7(%eax) -; X32-SSE1-NEXT: movb %dl, 6(%eax) -; X32-SSE1-NEXT: movb %bh, 3(%eax) -; X32-SSE1-NEXT: movb %bl, 2(%eax) -; X32-SSE1-NEXT: movb %dh, 1(%eax) -; X32-SSE1-NEXT: movb {{[0-9]+}}(%esp), %cl # 1-byte Reload -; X32-SSE1-NEXT: movb %cl, (%eax) +; X32-SSE1-NEXT: movl (%ecx), %edx +; X32-SSE1-NEXT: movzwl 6(%ecx), %ecx +; X32-SSE1-NEXT: movw %cx, 6(%eax) +; X32-SSE1-NEXT: movl %edx, (%eax) ; X32-SSE1-NEXT: movb $0, 15(%eax) -; X32-SSE1-NEXT: movb $0, 14(%eax) -; X32-SSE1-NEXT: movb $0, 13(%eax) -; X32-SSE1-NEXT: addl $4, %esp -; X32-SSE1-NEXT: popl %ebx +; X32-SSE1-NEXT: movw $0, 13(%eax) ; X32-SSE1-NEXT: retl $4 ; ; X32-SSE41-LABEL: merge_16i8_i8_0123uu67uuuuuzzz: @@ -990,14 +917,14 @@ ; X32-SSE1-LABEL: merge_2i64_i64_12_volatile: ; X32-SSE1: # BB#0: ; X32-SSE1-NEXT: pushl %edi -; X32-SSE1-NEXT: .Lcfi22: +; X32-SSE1-NEXT: .Lcfi18: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 8 ; X32-SSE1-NEXT: pushl %esi -; X32-SSE1-NEXT: .Lcfi23: +; X32-SSE1-NEXT: .Lcfi19: ; X32-SSE1-NEXT: .cfi_def_cfa_offset 12 -; X32-SSE1-NEXT: .Lcfi24: +; X32-SSE1-NEXT: .Lcfi20: ; X32-SSE1-NEXT: .cfi_offset %esi, -12 -; X32-SSE1-NEXT: .Lcfi25: +; X32-SSE1-NEXT: .Lcfi21: ; X32-SSE1-NEXT: .cfi_offset %edi, -8 ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx Index: test/CodeGen/X86/merge-store-partially-alias-loads.ll =================================================================== --- test/CodeGen/X86/merge-store-partially-alias-loads.ll +++ test/CodeGen/X86/merge-store-partially-alias-loads.ll @@ -13,7 +13,7 @@ ; X86-NEXT: movb [[HI1]], 3([[BASEREG]]) ; X86-NEXT: retq -; DBGDAG-LABEL: Optimized lowered selection DAG: BB#0 'merge_store_partial_overlap_load:' +; DBGDAG-LABEL: Optimized legalized selection DAG: BB#0 'merge_store_partial_overlap_load:' ; DBGDAG: [[ENTRYTOKEN:t[0-9]+]]: ch = EntryToken ; DBGDAG-DAG: [[BASEPTR:t[0-9]+]]: i64,ch = CopyFromReg [[ENTRYTOKEN]], ; DBGDAG-DAG: [[ADDPTR:t[0-9]+]]: i64 = add [[BASEPTR]], Constant:i64<2> @@ -27,7 +27,7 @@ ; DBGDAG: X86ISD::RET_FLAG t{{[0-9]+}}, -; DBGDAG: Type-legalized selection DAG: BB#0 'merge_store_partial_overlap_load:' +; DBGDAG-LABEL: Instruction selection begins define void @merge_store_partial_overlap_load([4 x i8]* %tmp) { %tmp8 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 0 %tmp10 = getelementptr inbounds [4 x i8], [4 x i8]* %tmp, i32 0, i8 1 Index: test/CodeGen/X86/no-sse2-avg.ll =================================================================== --- test/CodeGen/X86/no-sse2-avg.ll +++ test/CodeGen/X86/no-sse2-avg.ll @@ -5,22 +5,8 @@ define <16 x i8> @PR27973() { ; CHECK-LABEL: PR27973: ; CHECK: # BB#0: -; CHECK-NEXT: movb $0, 15(%rdi) -; CHECK-NEXT: movb $0, 14(%rdi) -; CHECK-NEXT: movb $0, 13(%rdi) -; CHECK-NEXT: movb $0, 12(%rdi) -; CHECK-NEXT: movb $0, 11(%rdi) -; CHECK-NEXT: movb $0, 10(%rdi) -; CHECK-NEXT: movb $0, 9(%rdi) -; CHECK-NEXT: movb $0, 8(%rdi) -; CHECK-NEXT: movb $0, 7(%rdi) -; CHECK-NEXT: movb $0, 6(%rdi) -; CHECK-NEXT: movb $0, 5(%rdi) -; CHECK-NEXT: movb $0, 4(%rdi) -; CHECK-NEXT: movb $0, 3(%rdi) -; CHECK-NEXT: movb $0, 2(%rdi) -; CHECK-NEXT: movb $0, 1(%rdi) -; CHECK-NEXT: movb $0, (%rdi) +; CHECK-NEXT: movq $0, 8(%rdi) +; CHECK-NEXT: movq $0, (%rdi) ; CHECK-NEXT: movq %rdi, %rax ; CHECK-NEXT: retq %t0 = zext <16 x i8> zeroinitializer to <16 x i32> Index: test/CodeGen/X86/sse4a-upgrade.ll =================================================================== --- test/CodeGen/X86/sse4a-upgrade.ll +++ test/CodeGen/X86/sse4a-upgrade.ll @@ -8,12 +8,14 @@ ; X32-LABEL: test_movntss: ; X32: # BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movntss %xmm0, (%eax) +; X32-NEXT: vextractps $0, %xmm0, %ecx +; X32-NEXT: movntil %ecx, (%eax) ; X32-NEXT: retl ; ; X64-LABEL: test_movntss: ; X64: # BB#0: -; X64-NEXT: movntss %xmm0, (%rdi) +; X64-NEXT: vextractps $0, %xmm0, %eax +; X64-NEXT: movntil %eax, (%rdi) ; X64-NEXT: retq tail call void @llvm.x86.sse4a.movnt.ss(i8* %p, <4 x float> %a) nounwind ret void Index: test/CodeGen/X86/stdarg.ll =================================================================== --- test/CodeGen/X86/stdarg.ll +++ test/CodeGen/X86/stdarg.ll @@ -14,8 +14,7 @@ ; ; CHECK-DAG: movq {{.*}}, 192(%rsp) ; CHECK-DAG: movq {{.*}}, 184(%rsp) -; CHECK-DAG: movl {{.*}}, 180(%rsp) -; CHECK-DAG: movl {{.*}}, 176(%rsp) +; CHECK-DAG: movq {{.*}}, 176(%rsp) %ap3 = getelementptr inbounds [1 x %struct.__va_list_tag], [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0; <%struct.__va_list_tag*> [#uses=1] call void @bar(%struct.__va_list_tag* %ap3) nounwind call void @llvm.va_end(i8* %ap12) Index: test/CodeGen/X86/stores-merging.ll =================================================================== --- test/CodeGen/X86/stores-merging.ll +++ test/CodeGen/X86/stores-merging.ll @@ -13,9 +13,8 @@ ;; the same result in memory in the end. ; CHECK-LABEL: redundant_stores_merging: -; CHECK: movabsq $528280977409, %rax -; CHECK: movq %rax, e+4(%rip) -; CHECK: movl $456, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax +; CHECK: movq %rax, e+4(%rip) define void @redundant_stores_merging() { entry: store i32 1, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 1), align 4 @@ -26,9 +25,8 @@ ;; This variant tests PR25154. ; CHECK-LABEL: redundant_stores_merging_reverse: -; CHECK: movabsq $528280977409, %rax -; CHECK: movq %rax, e+4(%rip) -; CHECK: movl $456, e+8(%rip) +; CHECK: movabsq $1958505086977, %rax +; CHECK: movq %rax, e+4(%rip) define void @redundant_stores_merging_reverse() { entry: store i32 123, i32* getelementptr inbounds (%structTy, %structTy* @e, i64 0, i32 2), align 4 Index: test/CodeGen/X86/vec_fptrunc.ll =================================================================== --- test/CodeGen/X86/vec_fptrunc.ll +++ test/CodeGen/X86/vec_fptrunc.ll @@ -11,7 +11,7 @@ ; X32-SSE-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-SSE-NEXT: cvtpd2ps (%ecx), %xmm0 ; X32-SSE-NEXT: extractps $1, %xmm0, 4(%eax) -; X32-SSE-NEXT: movss %xmm0, (%eax) +; X32-SSE-NEXT: extractps $0, %xmm0, (%eax) ; X32-SSE-NEXT: retl ; ; X32-AVX-LABEL: fptrunc_frommem2: @@ -20,7 +20,7 @@ ; X32-AVX-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-AVX-NEXT: vcvtpd2psx (%ecx), %xmm0 ; X32-AVX-NEXT: vextractps $1, %xmm0, 4(%eax) -; X32-AVX-NEXT: vmovss %xmm0, (%eax) +; X32-AVX-NEXT: vextractps $0, %xmm0, (%eax) ; X32-AVX-NEXT: retl ; ; X64-SSE-LABEL: fptrunc_frommem2: Index: test/CodeGen/X86/vector-shuffle-256-v4.ll =================================================================== --- test/CodeGen/X86/vector-shuffle-256-v4.ll +++ test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -807,10 +807,10 @@ define <4 x i64> @shuffle_v4i64_0412(<4 x i64> %a, <4 x i64> %b) { ; AVX1-LABEL: shuffle_v4i64_0412: ; AVX1: # BB#0: +; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm0[8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 -; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = xmm1[0,0] ; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[2,3] ; AVX1-NEXT: retq ; Index: test/CodeGen/X86/widen_arith-6.ll =================================================================== --- test/CodeGen/X86/widen_arith-6.ll +++ test/CodeGen/X86/widen_arith-6.ll @@ -28,7 +28,7 @@ ; CHECK-NEXT: addps %xmm0, %xmm1 ; CHECK-NEXT: extractps $2, %xmm1, 8(%ecx,%eax) ; CHECK-NEXT: extractps $1, %xmm1, 4(%ecx,%eax) -; CHECK-NEXT: movss %xmm1, (%ecx,%eax) +; CHECK-NEXT: extractps $0, %xmm1, (%ecx,%eax) ; CHECK-NEXT: incl {{[0-9]+}}(%esp) ; CHECK-NEXT: .LBB0_1: # %forcond ; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 Index: test/CodeGen/X86/widen_conv-3.ll =================================================================== --- test/CodeGen/X86/widen_conv-3.ll +++ test/CodeGen/X86/widen_conv-3.ll @@ -27,7 +27,7 @@ ; X86-SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,3,2,3] ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-SSE42-NEXT: movss %xmm0, (%eax) +; X86-SSE42-NEXT: extractps $0, %xmm0, (%eax) ; X86-SSE42-NEXT: retl ; ; X64-LABEL: convert_v2i16_to_v2f32: @@ -98,7 +98,7 @@ ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-SSE42-NEXT: movss %xmm0, (%eax) +; X86-SSE42-NEXT: extractps $0, %xmm0, (%eax) ; X86-SSE42-NEXT: popl %eax ; X86-SSE42-NEXT: retl ; Index: test/CodeGen/X86/widen_conv-4.ll =================================================================== --- test/CodeGen/X86/widen_conv-4.ll +++ test/CodeGen/X86/widen_conv-4.ll @@ -35,8 +35,8 @@ ; X86-SSE42-NEXT: cvtdq2ps %xmm2, %xmm1 ; X86-SSE42-NEXT: extractps $2, %xmm0, 24(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 20(%eax) +; X86-SSE42-NEXT: extractps $0, %xmm0, 16(%eax) ; X86-SSE42-NEXT: movups %xmm1, (%eax) -; X86-SSE42-NEXT: movss %xmm0, 16(%eax) ; X86-SSE42-NEXT: retl ; ; X64-SSE2-LABEL: convert_v7i16_v7f32: @@ -123,7 +123,7 @@ ; X86-SSE42-NEXT: cvtdq2ps %xmm0, %xmm0 ; X86-SSE42-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-SSE42-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-SSE42-NEXT: movss %xmm0, (%eax) +; X86-SSE42-NEXT: extractps $0, %xmm0, (%eax) ; X86-SSE42-NEXT: popl %eax ; X86-SSE42-NEXT: retl ; Index: test/CodeGen/X86/widen_shuffle-1.ll =================================================================== --- test/CodeGen/X86/widen_shuffle-1.ll +++ test/CodeGen/X86/widen_shuffle-1.ll @@ -10,7 +10,7 @@ ; X86-NEXT: addps %xmm1, %xmm0 ; X86-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-NEXT: movss %xmm0, (%eax) +; X86-NEXT: extractps $0, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: shuf: @@ -36,7 +36,7 @@ ; X86-NEXT: addps %xmm1, %xmm0 ; X86-NEXT: extractps $2, %xmm0, 8(%eax) ; X86-NEXT: extractps $1, %xmm0, 4(%eax) -; X86-NEXT: movss %xmm0, (%eax) +; X86-NEXT: extractps $0, %xmm0, (%eax) ; X86-NEXT: retl ; ; X64-LABEL: shuf2: