Index: lib/Target/AArch64/AArch64InstrNEON.td =================================================================== --- lib/Target/AArch64/AArch64InstrNEON.td +++ lib/Target/AArch64/AArch64InstrNEON.td @@ -11,6 +11,9 @@ // //===----------------------------------------------------------------------===// +def IsLE : Predicate<"Subtarget->isLittle()">; +def IsBE : Predicate<"!Subtarget->isLittle()">; + //===----------------------------------------------------------------------===// // NEON-specific DAG Nodes. //===----------------------------------------------------------------------===// @@ -6594,41 +6597,53 @@ // ..and 128-bit vector bitcasts... -def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (v2f64 VPR128:$src)>; + def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (v2i64 VPR128:$src)>; +} def : Pat<(v4f32 (bitconvert (v16i8 VPR128:$src))), (v4f32 VPR128:$src)>; def : Pat<(v4i32 (bitconvert (v16i8 VPR128:$src))), (v4i32 VPR128:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (v2f64 VPR128:$src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (v2i64 VPR128:$src)>; +} def : Pat<(v4i32 (bitconvert (v8i16 VPR128:$src))), (v4i32 VPR128:$src)>; def : Pat<(v4f32 (bitconvert (v8i16 VPR128:$src))), (v4f32 VPR128:$src)>; def : Pat<(v16i8 (bitconvert (v8i16 VPR128:$src))), (v16i8 VPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), 
(v2f64 VPR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (v2i64 VPR128:$src)>; +} def : Pat<(v4f32 (bitconvert (v4i32 VPR128:$src))), (v4f32 VPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 VPR128:$src))), (v8i16 VPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4i32 VPR128:$src))), (v16i8 VPR128:$src)>; -def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (v2f64 VPR128:$src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (v2i64 VPR128:$src)>; +} def : Pat<(v4i32 (bitconvert (v4f32 VPR128:$src))), (v4i32 VPR128:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 VPR128:$src))), (v8i16 VPR128:$src)>; def : Pat<(v16i8 (bitconvert (v4f32 VPR128:$src))), (v16i8 VPR128:$src)>; def : Pat<(v2f64 (bitconvert (v2i64 VPR128:$src))), (v2f64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (v4f32 VPR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (v4i32 VPR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (v8i16 VPR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (v16i8 VPR128:$src)>; +} def : Pat<(v2i64 (bitconvert (v2f64 VPR128:$src))), (v2i64 VPR128:$src)>; -def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; -def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; -def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; -def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; +let Predicates = [IsLE] in { + 
def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (v4f32 VPR128:$src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (v4i32 VPR128:$src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (v8i16 VPR128:$src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (v16i8 VPR128:$src)>; +} // ...and scalar bitcasts... def : Pat<(f16 (bitconvert (v1i16 FPR16:$src))), (f16 FPR16:$src)>; @@ -6638,10 +6653,12 @@ def : Pat<(i64 (bitconvert (v1i64 FPR64:$src))), (FMOVxd $src)>; def : Pat<(i64 (bitconvert (v1f64 FPR64:$src))), (FMOVxd $src)>; -def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>; -def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>; -def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>; -def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>; +let Predicates = [IsLE] in { + def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd $src)>; + def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd $src)>; + def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd $src)>; + def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd $src)>; +} def : Pat<(i32 (bitconvert (v1i32 FPR32:$src))), (FMOVws $src)>; @@ -6649,18 +6666,22 @@ def : Pat<(v4i16 (bitconvert (v1i64 VPR64:$src))), (v4i16 VPR64:$src)>; def : Pat<(v2i32 (bitconvert (v1i64 VPR64:$src))), (v2i32 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; -def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; +let Predicates = [IsLE] in { + def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (f64 VPR64:$src)>; + def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (f64 VPR64:$src)>; + def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (f64 VPR64:$src)>; + def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (f64 VPR64:$src)>; +} def : Pat<(f64 (bitconvert (v1i64 VPR64:$src))), (f64 
VPR64:$src)>; -def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; -def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), (f128 VPR128:$src)>; + def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), (f128 VPR128:$src)>; + def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), (f128 VPR128:$src)>; + def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), (f128 VPR128:$src)>; + def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), (f128 VPR128:$src)>; + def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), (f128 VPR128:$src)>; +} def : Pat<(v1i16 (bitconvert (f16 FPR16:$src))), (v1i16 FPR16:$src)>; def : Pat<(v1i32 (bitconvert (f32 FPR32:$src))), (v1i32 FPR32:$src)>; @@ -6669,25 +6690,31 @@ def : Pat<(v1i64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; def : Pat<(v1f64 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; -def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; -def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; -def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; -def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +let Predicates = [IsLE] in { + def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; + def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (FMOVdx $src)>; +} def : Pat<(v1i32 (bitconvert (i32 GPR32:$src))), (FMOVsw $src)>; -def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; -def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; -def : 
Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; -def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>; + def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (v4i16 FPR64:$src)>; + def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>; + def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>; +} def : Pat<(v1i64 (bitconvert (f64 FPR64:$src))), (v1i64 FPR64:$src)>; -def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; -def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; -def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; -def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; -def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; -def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +let Predicates = [IsLE] in { + def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>; + def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), (v8i16 FPR128:$src)>; + def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), (v4i32 FPR128:$src)>; + def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), (v2i64 FPR128:$src)>; + def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>; + def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>; +} // Scalar Three Same @@ -8254,6 +8281,76 @@ def REV16_8b : NeonI_REV<"rev16", "8b", 0b00, 0b0, 0b0, 0b00001, VPR64, v8i8, Neon_rev16>; +let Predicates = [IsBE] in { + // convert between vector and scalar + def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (REV64_8b $src)>; + def : Pat<(v4i16 (bitconvert (f64 FPR64:$src))), (REV64_4h $src)>; + def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (REV64_2s $src)>; + def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (REV64_2s $src)>; + + def : Pat<(f64 (bitconvert (v8i8 VPR64:$src))), (REV64_8b $src)>; 
+ def : Pat<(f64 (bitconvert (v4i16 VPR64:$src))), (REV64_4h $src)>; + def : Pat<(f64 (bitconvert (v2i32 VPR64:$src))), (REV64_2s $src)>; + def : Pat<(f64 (bitconvert (v2f32 VPR64:$src))), (REV64_2s $src)>; + + def : Pat<(v2i32 (bitconvert (i64 GPR64:$src))), (REV64_2s (FMOVdx $src))>; + def : Pat<(v2f32 (bitconvert (i64 GPR64:$src))), (REV64_2s (FMOVdx $src))>; + def : Pat<(v4i16 (bitconvert (i64 GPR64:$src))), (REV64_4h (FMOVdx $src))>; + def : Pat<(v8i8 (bitconvert (i64 GPR64:$src))), (REV64_8b (FMOVdx $src))>; + + def : Pat<(i64 (bitconvert (v2i32 FPR64:$src))), (FMOVxd (REV64_2s $src))>; + def : Pat<(i64 (bitconvert (v2f32 FPR64:$src))), (FMOVxd (REV64_2s $src))>; + def : Pat<(i64 (bitconvert (v4i16 FPR64:$src))), (FMOVxd (REV64_4h $src))>; + def : Pat<(i64 (bitconvert (v8i8 FPR64:$src))), (FMOVxd (REV64_8b $src))>; + + def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), + (REV64_16b (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(v8i16 (bitconvert (f128 FPR128:$src))), + (REV64_8h (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))), + (REV64_4s (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), + (REV64_4s (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))), + (EXTvvvi_16b $src, $src, 8)>; + def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), + (EXTvvvi_16b $src, $src, 8)>; + + def : Pat<(f128 (bitconvert (v16i8 VPR128:$src))), + (REV64_16b (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(f128 (bitconvert (v8i16 VPR128:$src))), + (REV64_8h (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(f128 (bitconvert (v4i32 VPR128:$src))), + (REV64_4s (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(f128 (bitconvert (v4f32 VPR128:$src))), + (REV64_4s (EXTvvvi_16b $src, $src, 8))>; + def : Pat<(f128 (bitconvert (v2i64 VPR128:$src))), + (EXTvvvi_16b $src, $src, 8)>; + def : Pat<(f128 (bitconvert (v2f64 VPR128:$src))), + (EXTvvvi_16b $src, $src, 8)>; + + // convert between vector and vector + def : 
Pat<(v4f32 (bitconvert (v2i64 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v4i32 (bitconvert (v2i64 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v8i16 (bitconvert (v2i64 VPR128:$src))), (REV64_8h $src)>; + def : Pat<(v16i8 (bitconvert (v2i64 VPR128:$src))), (REV64_16b $src)>; + + def : Pat<(v4f32 (bitconvert (v2f64 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v4i32 (bitconvert (v2f64 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v8i16 (bitconvert (v2f64 VPR128:$src))), (REV64_8h $src)>; + def : Pat<(v16i8 (bitconvert (v2f64 VPR128:$src))), (REV64_16b $src)>; + + def : Pat<(v2i64 (bitconvert (v16i8 VPR128:$src))), (REV64_16b $src)>; + def : Pat<(v2i64 (bitconvert (v8i16 VPR128:$src))), (REV64_8h $src)>; + def : Pat<(v2i64 (bitconvert (v4i32 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v2i64 (bitconvert (v4f32 VPR128:$src))), (REV64_4s $src)>; + + def : Pat<(v2f64 (bitconvert (v16i8 VPR128:$src))), (REV64_16b $src)>; + def : Pat<(v2f64 (bitconvert (v8i16 VPR128:$src))), (REV64_8h $src)>; + def : Pat<(v2f64 (bitconvert (v4i32 VPR128:$src))), (REV64_4s $src)>; + def : Pat<(v2f64 (bitconvert (v4f32 VPR128:$src))), (REV64_4s $src)>; +} + multiclass NeonI_PairwiseAdd<bit U, bits<5> opcode, SDPatternOperator Neon_Padd> { def 16b8h : NeonI_2VMisc<0b1, U, 0b00, opcode, Index: test/CodeGen/AArch64/neon-bitconv.ll =================================================================== --- test/CodeGen/AArch64/neon-bitconv.ll +++ test/CodeGen/AArch64/neon-bitconv.ll @@ -0,0 +1,367 @@ +; RUN: llc < %s -march=aarch64_be -mattr=+neon -o - | FileCheck %s + +@v2i64 = global <2 x i64> zeroinitializer +@v2i32 = global <2 x i32> zeroinitializer +@v4i32 = global <4 x i32> zeroinitializer +@v4i16 = global <4 x i16> zeroinitializer +@v8i16 = global <8 x i16> zeroinitializer +@v8i8 = global <8 x i8> zeroinitializer +@v16i8 = global <16 x i8> zeroinitializer + +@v2f32 = global <2 x float> zeroinitializer +@v2f64 = global <2 x double> zeroinitializer +@v4f32 = global <4 x float> zeroinitializer + +; 64 
bit conversions +define void @conv_i64_to_v8i8( i64 %val, <8 x i8>* %store ) { +; CHECK-LABEL: conv_i64_to_v8i8: +; CHECK: fmov +; CHECK-NEXT: rev64 v1.8b + %v = bitcast i64 %val to <8 x i8> + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + store <8 x i8> %a, <8 x i8>* %store + ret void +} + +define void @conv_v8i8_to_i64( <8 x i8>* %load, <8 x i8>* %store ) { +; CHECK-LABEL: conv_v8i8_to_i64: +; CHECK: rev64 v0.8b +; CHECK-NEXT: fmov + %v = load <8 x i8>* %load + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + %f = bitcast <8 x i8> %a to i64 + call void @conv_i64_to_v8i8( i64 %f, <8 x i8>* %store ) + ret void +} + +define void @conv_i64_to_v4i16( i64 %val, <4 x i16>* %store ) { +; CHECK-LABEL: conv_i64_to_v4i16: +; CHECK: fmov +; CHECK-NEXT: rev64 v1.4h + %v = bitcast i64 %val to <4 x i16> + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + store <4 x i16> %a, <4 x i16>* %store + ret void +} + +define void @conv_v4i16_to_i64( <4 x i16>* %load, <4 x i16>* %store ) { +; CHECK-LABEL: conv_v4i16_to_i64: +; CHECK: rev64 v0.4h +; CHECK-NEXT: fmov + %v = load <4 x i16>* %load + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + %f = bitcast <4 x i16> %a to i64 + call void @conv_i64_to_v4i16( i64 %f, <4 x i16>* %store ) + ret void +} + +define void @conv_i64_to_v2i32( i64 %val, <2 x i32>* %store ) { +; CHECK-LABEL: conv_i64_to_v2i32: +; CHECK: fmov +; CHECK-NEXT: rev64 v1.2s + %v = bitcast i64 %val to <2 x i32> + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + store <2 x i32> %a, <2 x i32>* %store + ret void +} + +define void @conv_v2i32_to_i64( <2 x i32>* %load, <2 x i32>* %store ) { +; CHECK-LABEL: conv_v2i32_to_i64: +; CHECK: rev64 v0.2s +; CHECK-NEXT: fmov + %v = load <2 x i32>* %load + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + %f = bitcast <2 x i32> %a to i64 + call void @conv_i64_to_v2i32( i64 %f, <2 x i32>* %store ) + ret void +} + +define void @conv_i64_to_v2f32( i64 %val, <2 x float>* %store ) { +; 
CHECK-LABEL: conv_i64_to_v2f32: +; CHECK: fmov +; CHECK-NEXT: rev64 v1.2s + %v = bitcast i64 %val to <2 x float> + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + store <2 x float> %a, <2 x float>* %store + ret void +} + +define void @conv_v2f32_to_i64( <2 x float>* %load, <2 x float>* %store ) { +; CHECK-LABEL: conv_v2f32_to_i64: +; CHECK: rev64 v0.2s +; CHECK-NEXT: fmov + %v = load <2 x float>* %load + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + %f = bitcast <2 x float> %a to i64 + call void @conv_i64_to_v2f32( i64 %f, <2 x float>* %store ) + ret void +} + +define void @conv_f64_to_v8i8( double %val, <8 x i8>* %store ) { +; CHECK-LABEL: conv_f64_to_v8i8: +; CHECK: rev64 v0.8b + %v = bitcast double %val to <8 x i8> + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + store <8 x i8> %a, <8 x i8>* %store + ret void +} + +define void @conv_v8i8_to_f64( <8 x i8>* %load, <8 x i8>* %store ) { +; CHECK-LABEL: conv_v8i8_to_f64: +; CHECK: rev64 v0.8b + %v = load <8 x i8>* %load + %w = load <8 x i8>* @v8i8 + %a = add <8 x i8> %v, %w + %f = bitcast <8 x i8> %a to double + call void @conv_f64_to_v8i8( double %f, <8 x i8>* %store ) + ret void +} + +define void @conv_f64_to_v4i16( double %val, <4 x i16>* %store ) { +; CHECK-LABEL: conv_f64_to_v4i16: +; CHECK: rev64 v0.4h + %v = bitcast double %val to <4 x i16> + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + store <4 x i16> %a, <4 x i16>* %store + ret void +} + +define void @conv_v4i16_to_f64( <4 x i16>* %load, <4 x i16>* %store ) { +; CHECK-LABEL: conv_v4i16_to_f64: +; CHECK: rev64 v0.4h + %v = load <4 x i16>* %load + %w = load <4 x i16>* @v4i16 + %a = add <4 x i16> %v, %w + %f = bitcast <4 x i16> %a to double + call void @conv_f64_to_v4i16( double %f, <4 x i16>* %store ) + ret void +} + +define void @conv_f64_to_v2i32( double %val, <2 x i32>* %store ) { +; CHECK-LABEL: conv_f64_to_v2i32: +; CHECK: rev64 v0.2s + %v = bitcast double %val to <2 x i32> + %w = load <2 x i32>* @v2i32 
+ %a = add <2 x i32> %v, %w + store <2 x i32> %a, <2 x i32>* %store + ret void +} + +define void @conv_v2i32_to_f64( <2 x i32>* %load, <2 x i32>* %store ) { +; CHECK-LABEL: conv_v2i32_to_f64: +; CHECK: rev64 v0.2s + %v = load <2 x i32>* %load + %w = load <2 x i32>* @v2i32 + %a = add <2 x i32> %v, %w + %f = bitcast <2 x i32> %a to double + call void @conv_f64_to_v2i32( double %f, <2 x i32>* %store ) + ret void +} + +define void @conv_f64_to_v2f32( double %val, <2 x float>* %store ) { +; CHECK-LABEL: conv_f64_to_v2f32: +; CHECK: rev64 v0.2s + %v = bitcast double %val to <2 x float> + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + store <2 x float> %a, <2 x float>* %store + ret void +} + +define void @conv_v2f32_to_f64( <2 x float>* %load, <2 x float>* %store ) { +; CHECK-LABEL: conv_v2f32_to_f64: +; CHECK: rev64 v0.2s + %v = load <2 x float>* %load + %w = load <2 x float>* @v2f32 + %a = fadd <2 x float> %v, %w + %f = bitcast <2 x float> %a to double + call void @conv_f64_to_v2f32( double %f, <2 x float>* %store ) + ret void +} + +; 128 bit conversions + +define void @conv_i128_to_v16i8( i128 %val, <16 x i8>* %store ) { +; CHECK-LABEL: conv_i128_to_v16i8: +; CHECK: rev64 v1.16b + %v = bitcast i128 %val to <16 x i8> + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + store <16 x i8> %a, <16 x i8>* %store + ret void +} + +define void @conv_v16i8_to_i128( <16 x i8>* %load, <16 x i8>* %store ) { +; CHECK-LABEL: conv_v16i8_to_i128: +; CHECK: rev64 v0.16b + %v = load <16 x i8>* %load + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + %f = bitcast <16 x i8> %a to i128 + call void @conv_i128_to_v16i8( i128 %f, <16 x i8>* %store ) + ret void +} + +define void @conv_i128_to_v8i16( i128 %val, <8 x i16>* %store ) { +; CHECK-LABEL: conv_i128_to_v8i16: +; CHECK: rev64 v1.8h + %v = bitcast i128 %val to <8 x i16> + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + store <8 x i16> %a, <8 x i16>* %store + ret void +} + +define void 
@conv_v8i16_to_i128( <8 x i16>* %load, <8 x i16>* %store ) { +; CHECK-LABEL: conv_v8i16_to_i128: +; CHECK: rev64 v0.8h + %v = load <8 x i16>* %load + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + %f = bitcast <8 x i16> %a to i128 + call void @conv_i128_to_v8i16( i128 %f, <8 x i16>* %store ) + ret void +} + +define void @conv_i128_to_v4i32( i128 %val, <4 x i32>* %store ) { +; CHECK-LABEL: conv_i128_to_v4i32: +; CHECK: rev64 v1.4s + %v = bitcast i128 %val to <4 x i32> + %w = load <4 x i32>* @v4i32 + %a = add <4 x i32> %v, %w + store <4 x i32> %a, <4 x i32>* %store + ret void +} + +define void @conv_v4i32_to_i128( <4 x i32>* %load, <4 x i32>* %store ) { +; CHECK-LABEL: conv_v4i32_to_i128: +; CHECK: rev64 v0.4s + %v = load <4 x i32>* %load + %w = load <4 x i32>* @v4i32 + %a = add <4 x i32> %v, %w + %f = bitcast <4 x i32> %a to i128 + call void @conv_i128_to_v4i32( i128 %f, <4 x i32>* %store ) + ret void +} + +define void @conv_i128_to_v4f32( i128 %val, <4 x float>* %store ) { +; CHECK-LABEL: conv_i128_to_v4f32: +; CHECK: rev64 v1.4s + %v = bitcast i128 %val to <4 x float> + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + store <4 x float> %a, <4 x float>* %store + ret void +} + +define void @conv_v4f32_to_i128( <4 x float>* %load, <4 x float>* %store ) { +; CHECK-LABEL: conv_v4f32_to_i128: +; CHECK: rev64 v0.4s + %v = load <4 x float>* %load + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + %f = bitcast <4 x float> %a to i128 + call void @conv_i128_to_v4f32( i128 %f, <4 x float>* %store ) + ret void +} + +define void @conv_f128_to_v2f64( fp128 %val, <2 x double>* %store ) { +; CHECK-LABEL: conv_f128_to_v2f64: +; CHECK: ext v0 + %v = bitcast fp128 %val to <2 x double> + %w = load <2 x double>* @v2f64 + %a = fadd <2 x double> %v, %w + store <2 x double> %a, <2 x double>* %store + ret void +} + +define void @conv_v2f64_to_f128( <2 x double>* %load, <2 x double>* %store ) { +; CHECK-LABEL: conv_v2f64_to_f128: +; CHECK: ext v0 + %v 
= load <2 x double>* %load + %w = load <2 x double>* @v2f64 + %a = fadd <2 x double> %v, %w + %f = bitcast <2 x double> %a to fp128 + call void @conv_f128_to_v2f64( fp128 %f, <2 x double>* %store ) + ret void +} + +define void @conv_f128_to_v16i8( fp128 %val, <16 x i8>* %store ) { +; CHECK-LABEL: conv_f128_to_v16i8: +; CHECK: ext v0.16b +; CHECK-NEXT: rev64 v0.16b + %v = bitcast fp128 %val to <16 x i8> + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + store <16 x i8> %a, <16 x i8>* %store + ret void +} + +define void @conv_v16i8_to_f128( <16 x i8>* %load, <16 x i8>* %store ) { +; CHECK-LABEL: conv_v16i8_to_f128: +; CHECK: ext v0.16b +; CHECK-NEXT: rev64 v0.16b + %v = load <16 x i8>* %load + %w = load <16 x i8>* @v16i8 + %a = add <16 x i8> %v, %w + %f = bitcast <16 x i8> %a to fp128 + call void @conv_f128_to_v16i8( fp128 %f, <16 x i8>* %store ) + ret void +} + +define void @conv_f128_to_v8i16( fp128 %val, <8 x i16>* %store ) { +; CHECK-LABEL: conv_f128_to_v8i16: +; CHECK: ext v0.16b +; CHECK-NEXT: rev64 v0.8h + %v = bitcast fp128 %val to <8 x i16> + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + store <8 x i16> %a, <8 x i16>* %store + ret void +} + +define void @conv_v8i16_to_f128( <8 x i16>* %load, <8 x i16>* %store ) { +; CHECK-LABEL: conv_v8i16_to_f128: +; CHECK: ext v0.16b +; CHECK-NEXT: rev64 v0.8h + %v = load <8 x i16>* %load + %w = load <8 x i16>* @v8i16 + %a = add <8 x i16> %v, %w + %f = bitcast <8 x i16> %a to fp128 + call void @conv_f128_to_v8i16( fp128 %f, <8 x i16>* %store ) + ret void +} + +define void @conv_f128_to_v4f32( fp128 %val, <4 x float>* %store ) { +; CHECK-LABEL: conv_f128_to_v4f32: +; CHECK: ext v0.16b +; CHECK-NEXT: rev64 v0.4s + %v = bitcast fp128 %val to <4 x float> + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + store <4 x float> %a, <4 x float>* %store + ret void +} + +define void @conv_v4f32_to_f128( <4 x float>* %load, <4 x float>* %store ) { +; CHECK-LABEL: conv_v4f32_to_f128: +; CHECK: ext 
v0.16b +; CHECK-NEXT: rev64 v0.4s + %v = load <4 x float>* %load + %w = load <4 x float>* @v4f32 + %a = fadd <4 x float> %v, %w + %f = bitcast <4 x float> %a to fp128 + call void @conv_f128_to_v4f32( fp128 %f, <4 x float>* %store ) + ret void +} + Index: test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll =================================================================== --- test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll +++ test/CodeGen/AArch64/unaligned-vector-ld1-st1.ll @@ -1,8 +1,8 @@ -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s -; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE +; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-no-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-BE +; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-LE ; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -aarch64-strict-align -mattr=+neon -o - | FileCheck %s --check-prefix=BE-STRICT-ALIGN ;; Check element-aligned 128-bit vector load/store - integer @@ -148,12 +148,18 @@ define <2 x float> @align2vf32 (<2 x float>* %head.byte, <2 x float>* %head.half, <2 
x float>* %head.word, <2 x float>* %head.dword, <2 x float>* %tail.byte, <2 x float>* %tail.half, <2 x float>* %tail.word, <2 x float>* %tail.dword) { ; CHECK-LABEL: align2vf32 -; CHECK: ld1 { v0.2s }, [x0] -; CHECK: ld1 { v1.2s }, [x1] -; CHECK: ld1 { v2.2s }, [x2] -; CHECK: st1 { v0.2s }, [x4] -; CHECK: st1 { v1.2s }, [x5] -; CHECK: st1 { v2.2s }, [x6] +; CHECK-LE: ld1 { v0.2s }, [x0] +; CHECK-LE: ld1 { v1.2s }, [x1] +; CHECK-LE: ld1 { v2.2s }, [x2] +; CHECK-LE: st1 { v0.2s }, [x4] +; CHECK-LE: st1 { v1.2s }, [x5] +; CHECK-LE: st1 { v2.2s }, [x6] +; CHECK-BE: ld1 { v0.2s }, [x1] +; CHECK-BE: ld1 { v1.2s }, [x0] +; CHECK-BE: ld1 { v2.2s }, [x2] +; CHECK-BE: st1 { v1.2s }, [x4] +; CHECK-BE: st1 { v0.2s }, [x5] +; CHECK-BE: st1 { v2.2s }, [x6] ; BE-STRICT-ALIGN-LABEL: align2vf32 ; BE-STRICT-ALIGN: ldrb ; BE-STRICT-ALIGN: ldrh