diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -2056,6 +2056,19 @@
                               Depth + 1))
       return true;
 
+    // If all bits of the extracted element are known, simplify it to a
+    // constant.
+    if (Known2.isConstant()) {
+      APInt KnownVal = Known2.getConstant().zextOrTrunc(BitWidth);
+      if (VT.isInteger())
+        return TLO.CombineTo(Op, TLO.DAG.getConstant(KnownVal, dl, VT));
+      if (VT.isFloatingPoint())
+        return TLO.CombineTo(
+            Op,
+            TLO.DAG.getConstantFP(
+                APFloat(TLO.DAG.EVTToAPFloatSemantics(VT), KnownVal), dl, VT));
+    }
+
     // Attempt to avoid multi-use ops if we don't need anything from them.
     if (!DemandedSrcBits.isAllOnesValue() ||
         !DemandedSrcElts.isAllOnesValue()) {
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -24,17 +24,8 @@
 define void @test2(float * %p1, i32 %v1) {
 ; CHECK-LABEL: test2:
 ; CHECK: ; %bb.0: ; %entry
-; CHECK-NEXT: sub sp, sp, #16 ; =16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: ; kill: def $w1 killed $w1 def $x1
-; CHECK-NEXT: movi.16b v0, #63
-; CHECK-NEXT: and x8, x1, #0x3
-; CHECK-NEXT: mov x9, sp
-; CHECK-NEXT: str q0, [sp]
-; CHECK-NEXT: bfi x9, x8, #2, #2
-; CHECK-NEXT: ldr s0, [x9]
-; CHECK-NEXT: str s0, [x0]
-; CHECK-NEXT: add sp, sp, #16 ; =16
+; CHECK-NEXT: mov w8, #1061109567
+; CHECK-NEXT: str w8, [x0]
 ; CHECK-NEXT: ret
 entry:
   %v2 = extractelement <3 x float> , i32 %v1
diff --git a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
--- a/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-promote-const-complex-initializers.ll
@@ -30,10 +30,10 @@
 define [1 x <4 x float>] @test2() {
 ; CHECK-LABEL: .p2align 4 ; -- Begin function test2
 ; CHECK-NEXT: lCPI1_0:
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x00000000 ; float 0
-; CHECK-NEXT: .long 0x3f800000 ; float 1
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0x80000000 ; float -0
+; CHECK-NEXT: .long 0xbf800000 ; float -1
 ; CHECK-NEXT: .section __TEXT,__text,regular,pure_instructions
 ; CHECK-NEXT: .globl _test2
 ; CHECK-NEXT: .p2align 2
@@ -43,17 +43,7 @@
 ; CHECK-NEXT: Lloh2:
 ; CHECK-NEXT: adrp x8, lCPI1_0@PAGE
 ; CHECK-NEXT: Lloh3:
-; CHECK-NEXT: ldr q1, [x8, lCPI1_0@PAGEOFF]
-; CHECK-NEXT: mov s2, v1[1]
-; CHECK-NEXT: fneg s0, s1
-; CHECK-NEXT: mov s3, v1[2]
-; CHECK-NEXT: fneg s2, s2
-; CHECK-NEXT: mov s1, v1[3]
-; CHECK-NEXT: fneg s3, s3
-; CHECK-NEXT: mov.s v0[1], v2[0]
-; CHECK-NEXT: mov.s v0[2], v3[0]
-; CHECK-NEXT: fneg s1, s1
-; CHECK-NEXT: mov.s v0[3], v1[0]
+; CHECK-NEXT: ldr q0, [x8, lCPI1_0@PAGEOFF]
 ; CHECK-NEXT: ret
 ;
   ret [1 x <4 x float>] [<4 x float>
diff --git a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll
--- a/llvm/test/CodeGen/ARM/func-argpassing-endian.ll
+++ b/llvm/test/CodeGen/ARM/func-argpassing-endian.ll
@@ -102,31 +102,33 @@
 define <4 x i32> @return_v4i32() {
 ; CHECK-LE-LABEL: return_v4i32:
 ; CHECK-LE: @ %bb.0:
-; CHECK-LE-NEXT: adr r0, .LCPI6_0
-; CHECK-LE-NEXT: vld1.64 {d16, d17}, [r0:128]
+; CHECK-LE-NEXT: vldr d16, .LCPI6_0
+; CHECK-LE-NEXT: vldr d17, .LCPI6_1
 ; CHECK-LE-NEXT: vmov r0, r1, d16
 ; CHECK-LE-NEXT: vmov r2, r3, d17
 ; CHECK-LE-NEXT: bx lr
-; CHECK-LE-NEXT: .p2align 4
+; CHECK-LE-NEXT: .p2align 3
 ; CHECK-LE-NEXT: @ %bb.1:
 ; CHECK-LE-NEXT: .LCPI6_0:
 ; CHECK-LE-NEXT: .long 42 @ double 9.1245819032257467E-313
 ; CHECK-LE-NEXT: .long 43
+; CHECK-LE-NEXT: .LCPI6_1:
 ; CHECK-LE-NEXT: .long 44 @ double 9.5489810615176143E-313
 ; CHECK-LE-NEXT: .long 45
 ;
 ; CHECK-BE-LABEL: return_v4i32:
 ; CHECK-BE: @ %bb.0:
-; CHECK-BE-NEXT: adr r0, .LCPI6_0
-; CHECK-BE-NEXT: vld1.64 {d16, d17}, [r0:128]
+; CHECK-BE-NEXT: vldr d16, .LCPI6_0
+; CHECK-BE-NEXT: vldr d17, .LCPI6_1
 ; CHECK-BE-NEXT: vmov r1, r0, d16
 ; CHECK-BE-NEXT: vmov r3, r2, d17
 ; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .p2align 4
+; CHECK-BE-NEXT: .p2align 3
 ; CHECK-BE-NEXT: @ %bb.1:
 ; CHECK-BE-NEXT: .LCPI6_0:
 ; CHECK-BE-NEXT: .long 42 @ double 8.912382324178626E-313
 ; CHECK-BE-NEXT: .long 43
+; CHECK-BE-NEXT: .LCPI6_1:
 ; CHECK-BE-NEXT: .long 44 @ double 9.3367814824704935E-313
 ; CHECK-BE-NEXT: .long 45
   ret < 4 x i32> < i32 42, i32 43, i32 44, i32 45 >
diff --git a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
--- a/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/llvm/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -49,14 +49,12 @@
 define void @zero_test() {
 ; X32-LABEL: zero_test:
 ; X32: # %bb.0: # %entry
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: movlps %xmm0, (%eax)
+; X32-NEXT: movl $0, (%eax)
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: zero_test:
 ; X64: # %bb.0: # %entry
-; X64-NEXT: xorps %xmm0, %xmm0
-; X64-NEXT: movlps %xmm0, (%rax)
+; X64-NEXT: movq $0, (%rax)
 ; X64-NEXT: retq
 entry:
   %0 = select <2 x i1> undef, <2 x float> undef, <2 x float> zeroinitializer
diff --git a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
--- a/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
+++ b/llvm/test/CodeGen/X86/2012-07-10-extload64.ll
@@ -29,8 +29,8 @@
 ; X86-LABEL: store_64:
 ; X86: # %bb.0: # %BB
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: xorps %xmm0, %xmm0
-; X86-NEXT: movlps %xmm0, (%eax)
+; X86-NEXT: movl $0, 4(%eax)
+; X86-NEXT: movl $0, (%eax)
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: store_64:
diff --git a/llvm/test/CodeGen/X86/fold-load-vec.ll b/llvm/test/CodeGen/X86/fold-load-vec.ll
--- a/llvm/test/CodeGen/X86/fold-load-vec.ll
+++ b/llvm/test/CodeGen/X86/fold-load-vec.ll
@@ -10,8 +10,8 @@
 ; CHECK-NEXT: subq $24, %rsp
 ; CHECK-NEXT: movq %rdi, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT: movq %rsi, {{[0-9]+}}(%rsp)
+; CHECK-NEXT: movq $0, (%rsp)
 ; CHECK-NEXT: xorps %xmm0, %xmm0
-; CHECK-NEXT: movlps %xmm0, (%rsp)
 ; CHECK-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
 ; CHECK-NEXT: movlps %xmm0, (%rsp)
 ; CHECK-NEXT: movlps %xmm0, (%rsi)
diff --git a/llvm/test/CodeGen/X86/nontemporal-3.ll b/llvm/test/CodeGen/X86/nontemporal-3.ll
--- a/llvm/test/CodeGen/X86/nontemporal-3.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-3.ll
@@ -195,33 +195,14 @@
 }
 
 define void @test_zero_v8f32_align1(<8 x float>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8f32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8f32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorl %eax, %eax
-; SSE4A-NEXT: movntiq %rax, 8(%rdi)
-; SSE4A-NEXT: movntiq %rax, 24(%rdi)
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8f32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v8f32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v8f32_align1:
 ; AVX: # %bb.0:
@@ -245,32 +226,14 @@
 }
 
 define void @test_zero_v4i64_align1(<4 x i64>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v4i64_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v4i64_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v4i64_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v4i64_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v4i64_align1:
 ; AVX: # %bb.0:
@@ -294,32 +257,14 @@
 }
 
 define void @test_zero_v8i32_align1(<8 x i32>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8i32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8i32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v8i32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v8i32_align1:
 ; AVX: # %bb.0:
@@ -343,32 +288,14 @@
 }
 
 define void @test_zero_v16i16_align1(<16 x i16>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i16_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16i16_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16i16_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16i16_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16i16_align1:
 ; AVX: # %bb.0:
@@ -392,32 +319,14 @@
 }
 
 define void @test_zero_v32i8_align1(<32 x i8>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i8_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v32i8_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v32i8_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v32i8_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i8_align1:
 ; AVX: # %bb.0:
@@ -636,45 +545,18 @@
 }
 
 define void @test_zero_v16f32_align1(<16 x float>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16f32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16f32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorl %eax, %eax
-; SSE4A-NEXT: movntiq %rax, 24(%rdi)
-; SSE4A-NEXT: movntiq %rax, 8(%rdi)
-; SSE4A-NEXT: movntiq %rax, 56(%rdi)
-; SSE4A-NEXT: movntiq %rax, 40(%rdi)
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16f32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16f32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16f32_align1:
 ; AVX: # %bb.0:
@@ -706,44 +588,18 @@
 }
 
 define void @test_zero_v8i64_align1(<8 x i64>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v8i64_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v8i64_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v8i64_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v8i64_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v8i64_align1:
 ; AVX: # %bb.0:
@@ -775,44 +631,18 @@
 }
 
 define void @test_zero_v16i32_align1(<16 x i32>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v16i32_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v16i32_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v16i32_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v16i32_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v16i32_align1:
 ; AVX: # %bb.0:
@@ -844,44 +674,18 @@
 }
 
 define void @test_zero_v32i16_align1(<32 x i16>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v32i16_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v32i16_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v32i16_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v32i16_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v32i16_align1:
 ; AVX: # %bb.0:
@@ -913,44 +717,18 @@
 }
 
 define void @test_zero_v64i8_align1(<64 x i8>* %dst) nounwind {
-; SSE2-LABEL: test_zero_v64i8_align1:
-; SSE2: # %bb.0:
-; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movntiq %rax, 24(%rdi)
-; SSE2-NEXT: movntiq %rax, 16(%rdi)
-; SSE2-NEXT: movntiq %rax, 8(%rdi)
-; SSE2-NEXT: movntiq %rax, (%rdi)
-; SSE2-NEXT: movntiq %rax, 56(%rdi)
-; SSE2-NEXT: movntiq %rax, 48(%rdi)
-; SSE2-NEXT: movntiq %rax, 40(%rdi)
-; SSE2-NEXT: movntiq %rax, 32(%rdi)
-; SSE2-NEXT: retq
-;
-; SSE4A-LABEL: test_zero_v64i8_align1:
-; SSE4A: # %bb.0:
-; SSE4A-NEXT: xorps %xmm0, %xmm0
-; SSE4A-NEXT: movntsd %xmm0, 24(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 16(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 8(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, (%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 56(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 48(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 40(%rdi)
-; SSE4A-NEXT: movntsd %xmm0, 32(%rdi)
-; SSE4A-NEXT: retq
-;
-; SSE41-LABEL: test_zero_v64i8_align1:
-; SSE41: # %bb.0:
-; SSE41-NEXT: xorl %eax, %eax
-; SSE41-NEXT: movntiq %rax, 24(%rdi)
-; SSE41-NEXT: movntiq %rax, 16(%rdi)
-; SSE41-NEXT: movntiq %rax, 8(%rdi)
-; SSE41-NEXT: movntiq %rax, (%rdi)
-; SSE41-NEXT: movntiq %rax, 56(%rdi)
-; SSE41-NEXT: movntiq %rax, 48(%rdi)
-; SSE41-NEXT: movntiq %rax, 40(%rdi)
-; SSE41-NEXT: movntiq %rax, 32(%rdi)
-; SSE41-NEXT: retq
+; SSE-LABEL: test_zero_v64i8_align1:
+; SSE: # %bb.0:
+; SSE-NEXT: xorl %eax, %eax
+; SSE-NEXT: movntiq %rax, 24(%rdi)
+; SSE-NEXT: movntiq %rax, 16(%rdi)
+; SSE-NEXT: movntiq %rax, 8(%rdi)
+; SSE-NEXT: movntiq %rax, (%rdi)
+; SSE-NEXT: movntiq %rax, 56(%rdi)
+; SSE-NEXT: movntiq %rax, 48(%rdi)
+; SSE-NEXT: movntiq %rax, 40(%rdi)
+; SSE-NEXT: movntiq %rax, 32(%rdi)
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: test_zero_v64i8_align1:
 ; AVX: # %bb.0:
diff --git a/llvm/test/CodeGen/X86/pr41619.ll b/llvm/test/CodeGen/X86/pr41619.ll
--- a/llvm/test/CodeGen/X86/pr41619.ll
+++ b/llvm/test/CodeGen/X86/pr41619.ll
@@ -7,10 +7,9 @@
 ; CHECK: ## %bb.0: ## %bb
 ; CHECK-NEXT: vmovq %xmm0, %rax
 ; CHECK-NEXT: vmovd %eax, %xmm0
-; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1
 ; CHECK-NEXT: vmovq %xmm0, %rax
 ; CHECK-NEXT: movl %eax, (%rax)
-; CHECK-NEXT: vmovlps %xmm1, (%rax)
+; CHECK-NEXT: movq $0, (%rax)
 ; CHECK-NEXT: retq
 bb:
   %tmp = bitcast double %arg to i64
diff --git a/llvm/test/CodeGen/X86/vec_zero_cse.ll b/llvm/test/CodeGen/X86/vec_zero_cse.ll
--- a/llvm/test/CodeGen/X86/vec_zero_cse.ll
+++ b/llvm/test/CodeGen/X86/vec_zero_cse.ll
@@ -15,8 +15,8 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movl $0, M1+4
 ; X32-NEXT: movl $0, M1
-; X32-NEXT: xorps %xmm0, %xmm0
-; X32-NEXT: movlps %xmm0, M2
+; X32-NEXT: movl $0, M2+4
+; X32-NEXT: movl $0, M2
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test1:
@@ -34,8 +34,8 @@
 ; X32: # %bb.0:
 ; X32-NEXT: movl $-1, M1+4
 ; X32-NEXT: movl $-1, M1
-; X32-NEXT: pcmpeqd %xmm0, %xmm0
-; X32-NEXT: movq %xmm0, M2
+; X32-NEXT: movl $-1, M2+4
+; X32-NEXT: movl $-1, M2
 ; X32-NEXT: retl
 ;
 ; X64-LABEL: test2:
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining.ll
@@ -3061,52 +3061,18 @@
 }
 
 define void @PR43024() {
-; SSE2-LABEL: PR43024:
-; SSE2: # %bb.0:
-; SSE2-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE2-NEXT: movaps %xmm0, (%rax)
-; SSE2-NEXT: movaps %xmm0, %xmm1
-; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,1],xmm0[1,1]
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: xorps %xmm0, %xmm0
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: addss %xmm0, %xmm1
-; SSE2-NEXT: movss %xmm1, (%rax)
-; SSE2-NEXT: retq
-;
-; SSSE3-LABEL: PR43024:
-; SSSE3: # %bb.0:
-; SSSE3-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSSE3-NEXT: movaps %xmm0, (%rax)
-; SSSE3-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: xorps %xmm0, %xmm0
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: addss %xmm0, %xmm1
-; SSSE3-NEXT: movss %xmm1, (%rax)
-; SSSE3-NEXT: retq
-;
-; SSE41-LABEL: PR43024:
-; SSE41: # %bb.0:
-; SSE41-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
-; SSE41-NEXT: movaps %xmm0, (%rax)
-; SSE41-NEXT: movshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: xorps %xmm0, %xmm0
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: addss %xmm0, %xmm1
-; SSE41-NEXT: movss %xmm1, (%rax)
-; SSE41-NEXT: retq
+; SSE-LABEL: PR43024:
+; SSE: # %bb.0:
+; SSE-NEXT: movaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
+; SSE-NEXT: movaps %xmm0, (%rax)
+; SSE-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000
+; SSE-NEXT: retq
 ;
 ; AVX-LABEL: PR43024:
 ; AVX: # %bb.0:
 ; AVX-NEXT: vmovaps {{.*#+}} xmm0 = [NaN,NaN,0.0E+0,0.0E+0]
 ; AVX-NEXT: vmovaps %xmm0, (%rax)
-; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT: vaddss %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vaddss {{\.LCPI.*}}+{{.*}}(%rip), %xmm0, %xmm0
-; AVX-NEXT: vmovss %xmm0, (%rax)
+; AVX-NEXT: movl $2143289344, (%rax) # imm = 0x7FC00000
 ; AVX-NEXT: retq
   store <4 x float> , <4 x float>* undef, align 16
   %1 = load <4 x float>, <4 x float>* undef, align 16
diff --git a/llvm/test/CodeGen/X86/widen_shuffle-1.ll b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
--- a/llvm/test/CodeGen/X86/widen_shuffle-1.ll
+++ b/llvm/test/CodeGen/X86/widen_shuffle-1.ll
@@ -105,8 +105,8 @@
 ; X86-LABEL: shuf5:
 ; X86: # %bb.0:
 ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero
-; X86-NEXT: movsd %xmm0, (%eax)
+; X86-NEXT: movl $555819297, 4(%eax) # imm = 0x21212121
+; X86-NEXT: movl $555819297, (%eax) # imm = 0x21212121
 ; X86-NEXT: retl
 ;
 ; X64-LABEL: shuf5:
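Illustrative sketch (hypothetical IR, not part of the patch; it mirrors the test2 case in arm64-nvcast.ll above): the new SimplifyDemandedBits path fires when every demanded element of the source vector has the same fully known bit pattern, so Known2.isConstant() holds and a variable-index extractelement should fold to a scalar constant.

; Hypothetical reduced example: all lanes share one bit pattern, so the
; extract's KnownBits are a constant for any runtime index %i, and the
; store is expected to become a constant store (bit pattern 0x3F800000 here).
define void @extract_from_uniform_constant(float* %p, i32 %i) {
  %e = extractelement <4 x float> <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>, i32 %i
  store float %e, float* %p, align 4
  ret void
}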