Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15499,16 +15499,38 @@
     // converts.
   }
 
-  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST) {
+  // TODO: These transforms should not require the 'hasOneUse' restriction, but
+  // there are regressions on multiple targets without it. We can end up with a
+  // mess of scalar and vector code if we reduce only part of the DAG to scalar.
+  if (ConstEltNo && InVec.getOpcode() == ISD::BITCAST && VT.isInteger() &&
+      InVec.hasOneUse()) {
     // The vector index of the LSBs of the source depend on the endian-ness.
     bool IsLE = DAG.getDataLayout().isLittleEndian();
-
+    unsigned ExtractIndex = ConstEltNo->getZExtValue();
     // extract_elt (v2i32 (bitcast i64:x)), BCTruncElt -> i32 (trunc i64:x)
     unsigned BCTruncElt = IsLE ? 0 : VT.getVectorNumElements() - 1;
     SDValue BCSrc = InVec.getOperand(0);
-    if (InVec.hasOneUse() && ConstEltNo->getZExtValue() == BCTruncElt &&
-        VT.isInteger() && BCSrc.getValueType().isScalarInteger())
+    if (ExtractIndex == BCTruncElt && BCSrc.getValueType().isScalarInteger())
       return DAG.getNode(ISD::TRUNCATE, SDLoc(N), NVT, BCSrc);
+
+    if (BCSrc.getValueType().isInteger() &&
+        BCSrc.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+      // ext_elt (bitcast (scalar_to_vec i64 X to v2i64) to v4i32), TruncElt -->
+      // trunc i64 X to i32
+      SDValue X = BCSrc.getOperand(0);
+      assert(X.getValueType().isScalarInteger() && NVT.isScalarInteger() &&
+             "Extract element and scalar to vector can't change element type "
+             "from FP to integer.");
+      unsigned XBitWidth = X.getValueSizeInBits();
+      unsigned VecEltBitWidth = VT.getScalarSizeInBits();
+      BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
+
+      // An extract element return value type can be wider than its vector
+      // operand element type. In that case, the high bits are undefined, so
+      // it's possible that we may need to extend rather than truncate.
+      if (ExtractIndex == BCTruncElt && XBitWidth > VecEltBitWidth)
+        return DAG.getAnyExtOrTrunc(X, SDLoc(N), NVT);
+    }
   }
 
   // extract_vector_elt (insert_vector_elt vec, val, idx), idx) -> val
Index: test/CodeGen/AArch64/extract-insert.ll
===================================================================
--- test/CodeGen/AArch64/extract-insert.ll
+++ test/CodeGen/AArch64/extract-insert.ll
@@ -12,8 +12,7 @@
 ;
 ; LE-LABEL: trunc_i64_to_i32_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    fmov w0, s0
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -24,9 +23,7 @@
 define i32 @trunc_i64_to_i32_be(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i32_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.4s, v0.4s
-; BE-NEXT:    mov w0, v0.s[1]
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i32_be:
@@ -50,8 +47,7 @@
 ;
 ; LE-LABEL: trunc_i64_to_i16_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov d0, x0
-; LE-NEXT:    umov w0, v0.h[0]
+; LE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; LE-NEXT:    ret
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <8 x i16>
@@ -62,9 +58,7 @@
 define i16 @trunc_i64_to_i16_be(i64 %x) {
 ; BE-LABEL: trunc_i64_to_i16_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov d0, x0
-; BE-NEXT:    rev64 v0.8h, v0.8h
-; BE-NEXT:    umov w0, v0.h[3]
+; BE-NEXT:    // kill: def $w0 killed $w0 killed $x0
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i64_to_i16_be:
@@ -88,8 +82,6 @@
 ;
 ; LE-LABEL: trunc_i32_to_i8_le:
 ; LE:       // %bb.0:
-; LE-NEXT:    fmov s0, w0
-; LE-NEXT:    umov w0, v0.b[0]
 ; LE-NEXT:    ret
   %ins = insertelement <4 x i32> undef, i32 %x, i32 0
   %bc = bitcast <4 x i32> %ins to <16 x i8>
@@ -100,9 +92,6 @@
 define i8 @trunc_i32_to_i8_be(i32 %x) {
 ; BE-LABEL: trunc_i32_to_i8_be:
 ; BE:       // %bb.0:
-; BE-NEXT:    fmov s0, w0
-; BE-NEXT:    rev32 v0.16b, v0.16b
-; BE-NEXT:    umov w0, v0.b[3]
 ; BE-NEXT:    ret
 ;
 ; LE-LABEL: trunc_i32_to_i8_be:
Index: test/CodeGen/X86/extract-insert.ll
===================================================================
--- test/CodeGen/X86/extract-insert.ll
+++ test/CodeGen/X86/extract-insert.ll
@@ -68,8 +68,8 @@
 ;
 ; X64-LABEL: trunc_i64_to_i32_le:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %xmm0
-; X64-NEXT:    movd %xmm0, %eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $eax killed $eax killed $rax
 ; X64-NEXT:    retq
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <4 x i32>
@@ -86,9 +86,8 @@
 ;
 ; X64-LABEL: trunc_i64_to_i16_le:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %xmm0
-; X64-NEXT:    movd %xmm0, %eax
-; X64-NEXT:    # kill: def $ax killed $ax killed $eax
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    # kill: def $ax killed $ax killed $rax
 ; X64-NEXT:    retq
   %ins = insertelement <2 x i64> undef, i64 %x, i32 0
   %bc = bitcast <2 x i64> %ins to <8 x i16>
Index: test/CodeGen/X86/mmx-coalescing.ll
===================================================================
--- test/CodeGen/X86/mmx-coalescing.ll
+++ test/CodeGen/X86/mmx-coalescing.ll
@@ -16,16 +16,17 @@
 ; CHECK-NEXT:  # %bb.2: # %if.B
 ; CHECK-NEXT:    pshufw $238, %mm0, %mm0 # mm0 = mm0[2,3,2,3]
 ; CHECK-NEXT:    movq %mm0, %rax
-; CHECK-NEXT:    jmp .LBB0_3
+; CHECK-NEXT:    testl %eax, %eax
+; CHECK-NEXT:    jne .LBB0_4
 ; CHECK-NEXT:  .LBB0_1: # %if.A
+; CHECK-NEXT:    # =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    movd %edx, %mm1
 ; CHECK-NEXT:    psllq %mm1, %mm0
 ; CHECK-NEXT:    movq %mm0, %rax
 ; CHECK-NEXT:    testq %rax, %rax
 ; CHECK-NEXT:    jne .LBB0_4
-; CHECK-NEXT:  .LBB0_3: # %if.C
-; CHECK-NEXT:    movq %rax, %xmm0
-; CHECK-NEXT:    movd %xmm0, %eax
+; CHECK-NEXT:  # %bb.3: # %if.C
+; CHECK-NEXT:    # in Loop: Header=BB0_1 Depth=1
 ; CHECK-NEXT:    testl %eax, %eax
 ; CHECK-NEXT:    je .LBB0_1
 ; CHECK-NEXT:  .LBB0_4: # %merge