Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -365,6 +365,11 @@
   bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
                                          Type *Ty) const override;
 
+  /// Return true if EXTRACT_SUBVECTOR is cheap for this result type
+  /// with this index.
+  bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                               unsigned Index) const override;
+
   Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
                         AtomicOrdering Ord) const override;
   Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -8200,6 +8200,14 @@
   return Shift < 3;
 }
 
+bool AArch64TargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
+                                                    unsigned Index) const {
+  if (!isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, ResVT))
+    return false;
+
+  return (Index == 0 || Index == ResVT.getVectorNumElements());
+}
+
 /// Turn vector tests of the signbit in the form of:
 /// xor (sra X, elt_size(X)-1), -1
 /// into:
Index: llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
+++ llvm/trunk/test/CodeGen/AArch64/aarch64-vuzp.ll
@@ -4,7 +4,6 @@
 ; CHECK-LABEL: fun1:
 ; CHECK: uzp1 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NOT: mov
 define i32 @fun1() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -16,7 +15,6 @@
 ; CHECK-LABEL: fun2:
 ; CHECK: uzp2 {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b
-; CHECK-NOT: mov
 define i32 @fun2() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -28,7 +26,6 @@
 ; CHECK-LABEL: fun3:
 ; CHECK-NOT: uzp1
-; CHECK: mov
 define i32 @fun3() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
@@ -40,7 +37,6 @@
 ; CHECK-LABEL: fun4:
 ; CHECK-NOT: uzp2
-; CHECK: mov
 define i32 @fun4() {
 entry:
   %vtbl1.i.1 = tail call <16 x i8> @llvm.aarch64.neon.tbl1.v16i8(<16 x i8> , <16 x i8> undef)
Index: llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
+++ llvm/trunk/test/CodeGen/AArch64/arm64-ext.ll
@@ -94,19 +94,6 @@
 ; Tests for ReconstructShuffle function. Indices have to be carefully
 ; chosen to reach lowering phase as a BUILD_VECTOR.
-; One vector needs vext, the other can be handled by extract_subvector
-; Also checks interleaving of sources is handled correctly.
-; Essence: a vext is used on %A and something saner than stack load/store for final result.
-define <4 x i16> @test_interleaved(<8 x i16>* %A, <8 x i16>* %B) nounwind {
-;CHECK-LABEL: test_interleaved:
-;CHECK: ext.8b
-;CHECK: zip1.4h
-  %tmp1 = load <8 x i16>, <8 x i16>* %A
-  %tmp2 = load <8 x i16>, <8 x i16>* %B
-  %tmp3 = shufflevector <8 x i16> %tmp1, <8 x i16> %tmp2, <4 x i32> 
-  ret <4 x i16> %tmp3
-}
-
 ; An undef in the shuffle list should still be optimizable
 define <4 x i16> @test_undef(<8 x i16>* %A, <8 x i16>* %B) nounwind {
 ;CHECK-LABEL: test_undef:
Index: llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
+++ llvm/trunk/test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -79,7 +79,7 @@
 define <1 x i8> @test_vector_dup_bv16B(<16 x i8> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_bv16B:
-  ; CHECK-NEXT: dup v0.16b, v0.b[14]
+  ; CHECK-NEXT: dup v0.8b, v0.b[14]
   ; CHECK-NEXT: ret
   %shuffle.i = shufflevector <16 x i8> %v1, <16 x i8> undef, <1 x i32> 
   ret <1 x i8> %shuffle.i
@@ -95,7 +95,7 @@
 define <1 x i16> @test_vector_dup_hv8H(<8 x i16> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_hv8H:
-  ; CHECK-NEXT: dup v0.8h, v0.h[7]
+  ; CHECK-NEXT: dup v0.4h, v0.h[7]
   ; CHECK-NEXT: ret
   %shuffle.i = shufflevector <8 x i16> %v1, <8 x i16> undef, <1 x i32> 
   ret <1 x i16> %shuffle.i
@@ -111,7 +111,7 @@
 define <1 x i32> @test_vector_dup_sv4S(<4 x i32> %v1) #0 {
   ; CHECK-LABEL: test_vector_dup_sv4S:
-  ; CHECK-NEXT: dup v0.4s, v0.s[3]
+  ; CHECK-NEXT: dup v0.2s, v0.s[3]
   ; CHECK-NEXT: ret
   %shuffle = shufflevector <4 x i32> %v1, <4 x i32> undef, <1 x i32> 
   ret <1 x i32> %shuffle
@@ -135,7 +135,7 @@
 define <1 x i64> @test_vector_copy_dup_dv2D(<1 x i64> %a, <2 x i64> %c) #0 {
   ; CHECK-LABEL: test_vector_copy_dup_dv2D:
-  ; CHECK-NEXT: dup v0.2d, v1.d[1]
+  ; CHECK-NEXT: ext {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, #8
   ; CHECK-NEXT: ret
   %vget_lane = extractelement <2 x i64> %c, i32 1
   %vset_lane = insertelement <1 x i64> undef, i64 %vget_lane, i32 0