Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18132,7 +18132,23 @@ return DAG.getBitcast(NVT, NewExtract); } } - // TODO - handle (DestNumElts % SrcNumElts) == 0 + if ((DestNumElts % SrcNumElts) == 0) { + unsigned DestSrcRatio = DestNumElts / SrcNumElts; + if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) { + unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio; + EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), + SrcVT.getScalarType(), NewExtNumElts); + if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) { + unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio; + SDLoc DL(N); + SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL); + SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT, + V.getOperand(0), NewIndex); + return DAG.getBitcast(NVT, NewExtract); + } + } + } } // Combine: Index: lib/Target/AArch64/AArch64ISelLowering.cpp =================================================================== --- lib/Target/AArch64/AArch64ISelLowering.cpp +++ lib/Target/AArch64/AArch64ISelLowering.cpp @@ -577,7 +577,6 @@ setTargetDAGCombine(ISD::ANY_EXTEND); setTargetDAGCombine(ISD::ZERO_EXTEND); setTargetDAGCombine(ISD::SIGN_EXTEND); - setTargetDAGCombine(ISD::BITCAST); setTargetDAGCombine(ISD::CONCAT_VECTORS); setTargetDAGCombine(ISD::STORE); if (Subtarget->supportsAddressTopByteIgnored()) @@ -9543,74 +9542,6 @@ return SDValue(); } -static SDValue performBitcastCombine(SDNode *N, - TargetLowering::DAGCombinerInfo &DCI, - SelectionDAG &DAG) { - // Wait 'til after everything is legalized to try this. That way we have - // legal vector types and such. - if (DCI.isBeforeLegalizeOps()) - return SDValue(); - - // Remove extraneous bitcasts around an extract_subvector. 
- // For example,
- // (v4i16 (bitconvert
- // (extract_subvector (v2i64 (bitconvert (v8i16 ...)), (i64 1)))))
- // becomes
- // (extract_subvector ((v8i16 ...), (i64 4)))
-
- // Only interested in 64-bit vectors as the ultimate result.
- EVT VT = N->getValueType(0);
- if (!VT.isVector())
- return SDValue();
- if (VT.getSimpleVT().getSizeInBits() != 64)
- return SDValue();
- // Is the operand an extract_subvector starting at the beginning or halfway
- // point of the vector? A low half may also come through as an
- // EXTRACT_SUBREG, so look for that, too.
- SDValue Op0 = N->getOperand(0);
- if (Op0->getOpcode() != ISD::EXTRACT_SUBVECTOR &&
- !(Op0->isMachineOpcode() &&
- Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG))
- return SDValue();
- uint64_t idx = cast<ConstantSDNode>(Op0->getOperand(1))->getZExtValue();
- if (Op0->getOpcode() == ISD::EXTRACT_SUBVECTOR) {
- if (Op0->getValueType(0).getVectorNumElements() != idx && idx != 0)
- return SDValue();
- } else if (Op0->getMachineOpcode() == AArch64::EXTRACT_SUBREG) {
- if (idx != AArch64::dsub)
- return SDValue();
- // The dsub reference is equivalent to a lane zero subvector reference.
- idx = 0;
- }
- // Look through the bitcast of the input to the extract.
- if (Op0->getOperand(0)->getOpcode() != ISD::BITCAST)
- return SDValue();
- SDValue Source = Op0->getOperand(0)->getOperand(0);
- // If the source type has twice the number of elements as our destination
- // type, we know this is an extract of the high or low half of the vector.
- EVT SVT = Source->getValueType(0);
- if (!SVT.isVector() ||
- SVT.getVectorNumElements() != VT.getVectorNumElements() * 2)
- return SDValue();
-
- LLVM_DEBUG(
- dbgs() << "aarch64-lower: bitcast extract_subvector simplification\n");
-
- // Create the simplified form to just extract the low or high half of the
- // vector directly rather than bothering with the bitcasts.
- SDLoc dl(N); - unsigned NumElements = VT.getVectorNumElements(); - if (idx) { - SDValue HalfIdx = DAG.getConstant(NumElements, dl, MVT::i64); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Source, HalfIdx); - } else { - SDValue SubReg = DAG.getTargetConstant(AArch64::dsub, dl, MVT::i32); - return SDValue(DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, dl, VT, - Source, SubReg), - 0); - } -} - static SDValue performConcatVectorsCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI, SelectionDAG &DAG) { @@ -11381,8 +11312,6 @@ case ISD::ZERO_EXTEND: case ISD::SIGN_EXTEND: return performExtendCombine(N, DCI, DAG); - case ISD::BITCAST: - return performBitcastCombine(N, DCI, DAG); case ISD::CONCAT_VECTORS: return performConcatVectorsCombine(N, DCI, DAG); case ISD::SELECT: Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -43630,34 +43630,6 @@ VT, SDLoc(N), InVec.getNode()->ops().slice(IdxVal, VT.getVectorNumElements())); - // Try to move vector bitcast after extract_subv by scaling extraction index: - // extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index') - // TODO: Move this to DAGCombiner::visitEXTRACT_SUBVECTOR - if (InVec.getOpcode() == ISD::BITCAST && - InVec.getOperand(0).getValueType().isVector()) { - SDValue SrcOp = InVec.getOperand(0); - EVT SrcVT = SrcOp.getValueType(); - unsigned SrcNumElts = SrcVT.getVectorNumElements(); - unsigned DestNumElts = InVec.getValueType().getVectorNumElements(); - if ((DestNumElts % SrcNumElts) == 0) { - unsigned DestSrcRatio = DestNumElts / SrcNumElts; - if ((VT.getVectorNumElements() % DestSrcRatio) == 0) { - unsigned NewExtNumElts = VT.getVectorNumElements() / DestSrcRatio; - EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), - SrcVT.getScalarType(), NewExtNumElts); - if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 && - 
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
- SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- SrcOp, NewIndex);
- return DAG.getBitcast(VT, NewExtract);
- }
- }
- }
- }
-
// If we're extracting from a broadcast then we're better off just
// broadcasting to the smaller type directly, assuming this is the only use.
// As its a broadcast we don't care about the extraction index.
Index: test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -487,18 +487,21 @@
define <2 x float> @test_vfma_laneq_f32(<2 x float> %a, <2 x float> %b, <4 x float> %v) {
; GENERIC-LABEL: test_vfma_laneq_f32:
; GENERIC: // %bb.0: // %entry
-; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[3]
+; GENERIC-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; GENERIC-NEXT: fmla v0.2s, v1.2s, v2.s[1]
; GENERIC-NEXT: ret
;
; EXYNOSM1-LABEL: test_vfma_laneq_f32:
; EXYNOSM1: // %bb.0: // %entry
-; EXYNOSM1-NEXT: dup v2.2s, v2.s[3]
+; EXYNOSM1-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; EXYNOSM1-NEXT: dup v2.2s, v2.s[1]
; EXYNOSM1-NEXT: fmla v0.2s, v1.2s, v2.2s
; EXYNOSM1-NEXT: ret
;
; EXYNOSM3-LABEL: test_vfma_laneq_f32:
; EXYNOSM3: // %bb.0: // %entry
-; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[3]
+; EXYNOSM3-NEXT: ext v2.16b, v2.16b, v2.16b, #8
+; EXYNOSM3-NEXT: fmla v0.2s, v1.2s, v2.s[1]
; EXYNOSM3-NEXT: ret
entry:
%lane = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -1837,18 +1840,21 @@
define <2 x float> @test_vmul_laneq_f32(<2 x float> %a, <4 x float> %v) {
; GENERIC-LABEL: test_vmul_laneq_f32:
; GENERIC: // %bb.0: // %entry
-; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[3]
+; GENERIC-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; GENERIC-NEXT: fmul v0.2s, v0.2s, v1.s[1]
; GENERIC-NEXT: ret
;
; EXYNOSM1-LABEL: test_vmul_laneq_f32:
; EXYNOSM1: // %bb.0: // %entry
-; EXYNOSM1-NEXT: dup v1.2s, v1.s[3]
+; EXYNOSM1-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; EXYNOSM1-NEXT: dup v1.2s, v1.s[1]
; EXYNOSM1-NEXT: fmul v0.2s, v0.2s, v1.2s
; EXYNOSM1-NEXT: ret
;
; EXYNOSM3-LABEL: test_vmul_laneq_f32:
; EXYNOSM3: // %bb.0: // %entry
-; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[3]
+; EXYNOSM3-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; EXYNOSM3-NEXT: fmul v0.2s, v0.2s, v1.s[1]
; EXYNOSM3-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
@@ -1992,18 +1998,21 @@
define <2 x float> @test_vmulx_laneq_f32(<2 x float> %a, <4 x float> %v) {
; GENERIC-LABEL: test_vmulx_laneq_f32:
; GENERIC: // %bb.0: // %entry
-; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
+; GENERIC-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; GENERIC-NEXT: fmulx v0.2s, v0.2s, v1.s[1]
; GENERIC-NEXT: ret
;
; EXYNOSM1-LABEL: test_vmulx_laneq_f32:
; EXYNOSM1: // %bb.0: // %entry
-; EXYNOSM1-NEXT: dup v1.2s, v1.s[3]
+; EXYNOSM1-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; EXYNOSM1-NEXT: dup v1.2s, v1.s[1]
; EXYNOSM1-NEXT: fmulx v0.2s, v0.2s, v1.2s
; EXYNOSM1-NEXT: ret
;
; EXYNOSM3-LABEL: test_vmulx_laneq_f32:
; EXYNOSM3: // %bb.0: // %entry
-; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[3]
+; EXYNOSM3-NEXT: ext v1.16b, v1.16b, v1.16b, #8
+; EXYNOSM3-NEXT: fmulx v0.2s, v0.2s, v1.s[1]
; EXYNOSM3-NEXT: ret
entry:
%shuffle = shufflevector <4 x float> %v, <4 x float> undef, <2 x i32> <i32 3, i32 3>
Index: test/CodeGen/AArch64/arm64-vcvt_f.ll
===================================================================
--- test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -24,14 +24,21 @@
; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f64_f32)
; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f64_f32)
define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
-; CHECK-LABEL: test_vcvt_high_f64_f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
-; CHECK-NEXT: ret
+; GENERIC-LABEL: test_vcvt_high_f64_f32:
+; GENERIC: // %bb.0:
+; GENERIC-NEXT: ext.16b v0, v0, v0, #8
+; GENERIC-NEXT: fcvtl v0.2d, v0.2s
+; GENERIC-NEXT: ret
+;
+; FAST-LABEL: test_vcvt_high_f64_f32:
+; FAST: // %bb.0:
+; FAST-NEXT: fcvtl2 v0.2d, v0.4s
+; FAST-NEXT: ret
;
; GISEL-LABEL: test_vcvt_high_f64_f32:
; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
+; GISEL-NEXT: ext.16b v0, v0, v0, #8
+; GISEL-NEXT: fcvtl v0.2d, v0.2s
; GISEL-NEXT: ret
%cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
%vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
Index: test/CodeGen/AArch64/merge-store.ll
===================================================================
--- test/CodeGen/AArch64/merge-store.ll
+++ test/CodeGen/AArch64/merge-store.ll
@@ -42,17 +42,10 @@
; the fastness of unaligned accesses was not specified correctly.
define void @merge_vec_extract_stores(<4 x float> %v1, <2 x float>* %ptr) {
-; SPLITTING-LABEL: merge_vec_extract_stores:
-; SPLITTING: // %bb.0:
-; SPLITTING-NEXT: ext v1.16b, v0.16b, v0.16b, #8
-; SPLITTING-NEXT: str d0, [x0, #24]
-; SPLITTING-NEXT: str d1, [x0, #32]
-; SPLITTING-NEXT: ret
-;
-; MISALIGNED-LABEL: merge_vec_extract_stores:
-; MISALIGNED: // %bb.0:
-; MISALIGNED-NEXT: stur q0, [x0, #24]
-; MISALIGNED-NEXT: ret
+; CHECK-LABEL: merge_vec_extract_stores:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stur q0, [x0, #24]
+; CHECK-NEXT: ret
%idx0 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 3
%idx1 = getelementptr inbounds <2 x float>, <2 x float>* %ptr, i64 4
@@ -62,9 +55,4 @@
store <2 x float> %shuffle0, <2 x float>* %idx0, align 8
store <2 x float> %shuffle1, <2 x float>* %idx1, align 8
ret void
-
-
-; FIXME: Ideally we would like to use a generic target for this test, but this relies
-; on suppressing store pairs.
- } Index: test/CodeGen/AArch64/vector-fcopysign.ll =================================================================== --- test/CodeGen/AArch64/vector-fcopysign.ll +++ test/CodeGen/AArch64/vector-fcopysign.ll @@ -149,12 +149,13 @@ ; SplitVecRes mismatched define <4 x double> @test_copysign_v4f64_v4f32(<4 x double> %a, <4 x float> %b) #0 { ; CHECK-LABEL: test_copysign_v4f64_v4f32: -; CHECK-NEXT: movi.2d v3, #0000000000000000 -; CHECK-NEXT: fcvtl2 v4.2d, v2.4s +; CHECK-NEXT: ext.16b v3, v2, v2, #8 +; CHECK-NEXT: movi.2d v4, #0000000000000000 ; CHECK-NEXT: fcvtl v2.2d, v2.2s -; CHECK-NEXT: fneg.2d v3, v3 -; CHECK-NEXT: bit.16b v1, v4, v3 -; CHECK-NEXT: bit.16b v0, v2, v3 +; CHECK-NEXT: fneg.2d v4, v4 +; CHECK-NEXT: fcvtl v3.2d, v3.2s +; CHECK-NEXT: bit.16b v0, v2, v4 +; CHECK-NEXT: bit.16b v1, v3, v4 ; CHECK-NEXT: ret %tmp0 = fpext <4 x float> %b to <4 x double> %r = call <4 x double> @llvm.copysign.v4f64(<4 x double> %a, <4 x double> %tmp0) Index: test/CodeGen/ARM/combine-vmovdrr.ll =================================================================== --- test/CodeGen/ARM/combine-vmovdrr.ll +++ test/CodeGen/ARM/combine-vmovdrr.ll @@ -9,8 +9,8 @@ ; they are defined on VPRs and used on VPRs. ; ; CHECK-LABEL: motivatingExample: -; CHECK: vldr [[ARG2_VAL:d[0-9]+]], [r1] -; CHECK-NEXT: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK: vld1.32 {[[ARG1_VALlo:d[0-9]+]], [[ARG1_VALhi:d[0-9]+]]}, [r0] +; CHECK-NEXT: vldr [[ARG2_VAL:d[0-9]+]], [r1] ; CHECK-NEXT: vtbl.8 [[RES:d[0-9]+]], {[[ARG1_VALlo]], [[ARG1_VALhi]]}, [[ARG2_VAL]] ; CHECK-NEXT: vstr [[RES]], [r1] ; CHECK-NEXT: bx lr