Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -21,6 +21,9 @@
 def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
                                  AssemblerPredicate<"FeatureCrypto","crypto">;
 
+def IsLE             : Predicate<"Subtarget->isLittle()">;
+def IsBE             : Predicate<"!Subtarget->isLittle()">;
+
 // Use fused MAC if more precision in FP computation is allowed.
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast)">;
@@ -2612,13 +2615,13 @@
 }
 
 let Predicates = [HasFPARMv8] in {
-def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
-def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
+  def LDRs_lit  : A64I_LDRlitSimple<0b00, 0b1, FPR32>;
+  def LDRd_lit  : A64I_LDRlitSimple<0b01, 0b1, FPR64>;
 }
 
 let mayLoad = 1 in {
   let Predicates = [HasFPARMv8] in {
-  def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
+    def LDRq_lit : A64I_LDRlitSimple<0b10, 0b1, FPR128>;
   }
 
   def LDRSWx_lit : A64I_LDRlit<0b10, 0b0,
@@ -5032,6 +5035,85 @@
   : ls_neutral_pats<LOAD, STORE, Base, Offset, address, sty>,
     ls_atomic_pats<LOAD, STORE, Base, Offset, address, sty, sty>;
 
+
+// Wrappers to instantiate all allowed same-size fp/vector loads
+
+// NEON-BE: also allow all NEON vector types here, since LD1/ST1 must be
+// disabled: in big-endian they are not ABI-conforming (wrong arg memory layout).
+// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+// section 4.1.2, 2nd paragraph: LDR/STR layout
+// "on a big-endian system element 0 will contain the highest-addressed
+// element of a short vector."
+// FIXME: eventually also enable for LE
+// (desired by ARM - smaller code due to more powerful addressing modes)
+
+// NEON 8 bit types
+multiclass ls_FPR8_pats<Instruction LOAD, Instruction STORE, 
+                        dag Base, dag Offset, dag address> {
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i8>;
+  }
+}
+
+// NEON 16 bit types
+multiclass ls_FPR16_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f16>;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i16>;
+  }
+}
+
+// NEON 32 bit types
+multiclass ls_FPR32_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f32>;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i32>;
+//    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1f32>; // FIXME: v1f32 does not exist, although v1f64 does - why?
+  }
+}
+
+// NEON 64 bit types
+multiclass ls_FPR64_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f64>;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v8i8>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4i16>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2i32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i64>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2f32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1f64>;
+  }
+}
+
+// NEON 128 bit types FPR128
+multiclass ls_FPR128_pats<Instruction LOAD, Instruction STORE, 
+                          dag Base, dag Offset, dag address> {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f128>;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v16i8>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v8i16>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4i32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2i64>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4f32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2f64>;
+  }
+}
+
 //===------------------------------
 // 2.2. Addressing-mode instantiations
 //===------------------------------
@@ -5075,37 +5157,40 @@
                                    !subst(ALIGN, min_align8, decls.pattern))),
                           i64>;
 
-  defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base,
+  defm : ls_FPR8_pats<    LSFP8_LDR, LSFP8_STR, Base,
                           !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, byte_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, byte_uimm12,
+                                   !subst(ALIGN, any_align, decls.pattern)))>;
+
+  defm : ls_FPR16_pats<   LSFP16_LDR, LSFP16_STR, Base,
+                          !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, hword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, hword_uimm12,
-                                   !subst(ALIGN, min_align2, decls.pattern))),
-                          f16>;
+                                   !subst(ALIGN, min_align2, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base,
+  defm : ls_FPR32_pats<   LSFP32_LDR, LSFP32_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, word_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, word_uimm12,
-                                   !subst(ALIGN, min_align4, decls.pattern))),
-                          f32>;
+                                   !subst(ALIGN, min_align4, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
+  defm : ls_FPR64_pats<   LSFP64_LDR, LSFP64_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, dword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, dword_uimm12,
-                                   !subst(ALIGN, min_align8, decls.pattern))),
-                          f64>;
+                                   !subst(ALIGN, min_align8, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
+  defm : ls_FPR128_pats<  LSFP128_LDR, LSFP128_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, qword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, qword_uimm12,
-                                   !subst(ALIGN, min_align16, decls.pattern))),
-                          f128>;
+                                   !subst(ALIGN, min_align16, decls.pattern)))>;
 
   defm : load_signed_pats<"B", "", Base,
                           !foreach(decls.pattern, Offset,
@@ -5175,11 +5260,10 @@
   defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address, i32>;
   defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address, i64>;
 
-  defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address, f16>;
-  defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address, f32>;
-  defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address, f64>;
-  defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
-                         f128>;
+  defm : ls_FPR16_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address>;
+  defm : ls_FPR32_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address>;
+  defm : ls_FPR64_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address>;
+  defm : ls_FPR128_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address>;
 
   def : Pat<(i64 (zextloadi32 address)),
             (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -104,10 +104,19 @@
   defm : ls_128_pats<address, Base, Offset, v2f64>;
 }
 
-defm : uimm12_neon_pats<(A64WrapperSmall
-                          tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
-                        (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
-
+// Loading vector literals from the constant pool with LDR is only valid for
+// little-endian. In BE, LDR needs correctly byte-swapped 128-bit literals, so
+// simple array initializers won't work right now.
+// Big-endian must - for now - do the element swaps using vector intrinsics.
+// That costs an additional "add offset12" instruction there.
+// According to ARM, both BE and LE should use intrinsics for initialization.
+// That's also the only portable code.
+// FIXME: BE could swap the vector literals before the emit pass.
+let Predicates = [IsLE] in { // TODO: this will eventually be removed
+  defm : uimm12_neon_pats<(A64WrapperSmall
+                           tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
+                           (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
+}
 //===----------------------------------------------------------------------===//
 // Multiclasses
 //===----------------------------------------------------------------------===//
@@ -3432,9 +3441,31 @@
 // the three 64-bit vectors list {BA, DC, FE}.
 // E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
 // 64-bit vectors list {DA, EB, FC}.
-// Store instructions store multiple structure to N registers like load.
+// Store instructions store multiple structure from N registers like load.
+//
+// Problem for Big Endian (BE):
+// LD1/ST1 do "array" loads/stores: elements are read from ascending addresses
+// into ascending indexes in the register; in big-endian, byte-swapping is done
+// per element (hence LD1 & Co are sometimes referred to as "array loads").
+//
+// LDR/STR transfer the whole register, doing byte-swapping on the whole
+// register in big-endian mode.
+//
+// Obviously the two layouts differ by reversing the elements, so they can't
+// be mixed without explicit element-swap operations in BE.
+//
+// The only overlap is reading single elements to registers:
+// LDR i128/f128 - doing byte-swapping for the whole register.
+// LD1/ST1 i128/f128 - also doing byte-swapping within the 128bit element.
+// Analogously for stores.
 
+// For this reason there are IsLE guards around the respective patterns, or -
+// when no patterns are defined, yet - around the instruction definition.
 
+// In a PBQP matcher, one would add a separate set of "reversed" nonterminals 
+// with the element swap operations as chain rules - and let the matcher find
+// the optimal coverage. FIXME: How to do that here ?
+
 class NeonI_LDVList<bit q, bits<4> opcode, bits<2> size,
                     RegisterOperand VecList, string asmop>
   : NeonI_LdStMult<q, 1, opcode, size,
@@ -3471,15 +3502,28 @@
 }
 
 // Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
-defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
+
+// LD1 is disallowed in BE when LDR and STR are used exclusively, as per the
+// ABI. Reason: LDR/STR use a different memory/register layout (no element
+// swaps). If different kinds of loads were used on the same memory address,
+// the results would be inconsistent.
+// The only allowed use of LD1 is in initializations that use explicit
+// intrinsics to do the element swaps.
+
+// Single element has no swapping problem in BE.
 def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
 
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
 
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+  defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
 
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+  defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
 
+  defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+}
+
 // Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
 defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
 def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
@@ -3526,73 +3570,79 @@
 }
 
 // Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+// ARM ABI: default memory layout in BE is LDR/STR 
+// Single element has no swapping problem in BE.
 def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
 
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
 
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+  defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
 
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+  defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
 
-// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+  defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
 
-defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+  // Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
+  defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+  def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
 
-defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+  defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+  def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
 
-def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+  defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+  def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
 
-def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+  def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+  def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
 
-def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
-def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
+  def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+  def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
 
-def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+  def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
+  def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
 
-def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+  def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+  def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
 
-def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
-def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
+  def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+  def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
 
-def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
+  def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
 
-def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
+            (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
+            (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
-          (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
-          (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
+            (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
+            (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
+            (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
+            (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
+            (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
+            (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
 
-def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
-          (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
-          (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
+            (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
+            (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
 
+  def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
+            (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
+            (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+}
+
 // Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store.
 // FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal,
 // these patterns are not needed any more.
@@ -3776,35 +3826,40 @@
                               ImmTy2, asmop>;
 }
 
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+// Single element loads are ok for BE.
 defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
                                  "ld1">;
 
-defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+  defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
 
-defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "ld3">;
+  defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
 
-defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+  defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                               "ld3">;
 
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "ld1">;
-defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "ld1">;
+  defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
 
-defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "ld1">;
-defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "ld1">;
+  // Post-index load multiple 1-element structures to N consecutive registers
+  // (N = 2,3,4)
+  defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                                 "ld1">;
+  defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                     uimm_exact16, "ld1">;
 
-defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                                "ld1">;
-defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "ld1">;
+  defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                                 "ld1">;
+  defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                     uimm_exact24, "ld1">;
 
+  defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                 "ld1">;
+  defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                     uimm_exact32, "ld1">;
+}
+
 multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
                             RegisterOperand VecList, Operand ImmTy,
                             string asmop> {
@@ -3861,33 +3916,36 @@
 }
 
 // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
-defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "st1">;
+// Storing multiple elements in BE mode suffers from element-reversal.
+let Predicates = [IsLE] in {
+  defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                                 "st1">;       
+  defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
 
-defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+  defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
 
-defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "st3">;
+  defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                               "st3">;
 
-defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+  defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
 
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "st1">;
-defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "st1">;
+  // Post-index load multiple 1-element structures from N consecutive registers
+  // (N = 2,3,4)
+  defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                                 "st1">;
+  defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                     uimm_exact16, "st1">;
 
-defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "st1">;
-defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "st1">;
+  defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                                 "st1">;
+  defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                     uimm_exact24, "st1">;
 
-defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                               "st1">;
-defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "st1">;
+  defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                 "st1">;
+  defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                     uimm_exact32, "st1">;
+}
 
 // End of post-index vector load/store multiple N-element structure
 // (class SIMD lselem-post)
@@ -3963,13 +4021,17 @@
 }
 
 // Load single 1-element structure to all lanes of 1 register
+// Single element loads are fine in BE
 defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
 
 // Load single N-element structure to all lanes of N consecutive
 // registers (N = 2,3,4)
-defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
-defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
-defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+// Multi-element loads suffer from element reversal in BE.
+let Predicates = [IsLE] in {
+  defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+  defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+  defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+}
 
 
 class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
@@ -3978,28 +4040,31 @@
           (VTy (INST GPR64xsp:$Rn))>;
 
 // Match all LD1R instructions
-def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+// This won't work as intended in BE mode, as STR q0 stored the elements swapped.
+let Predicates = [IsLE] in {
+  def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
+  def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
 
-def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
+  def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
 
-def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+  def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
 
-def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+  def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+  def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
 
-def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
-def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+  def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+  def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
 
-def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
-def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+  def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+  def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+}
 
-def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
-def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
-
 class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
                        Instruction INST>
   : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
         (VTy (INST GPR64xsp:$Rn))>;
 
+// Single element operations are swap-safe in BE.
 def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
 def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
 
@@ -4064,46 +4129,64 @@
 }
 
 // Load single 1-element structure to one lane of 1 register.
+// No dangerous element swaps in BE. :-)
 defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
 
 // Load single N-element structure to one lane of N consecutive registers
 // (N = 2,3,4)
-defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
-defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
-defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+//
+// This will not work as intended in BE mode, if the matcher generates it to
+// load a vector to a lane. (STR q0 stored the vector's elements swapped)
+// Must always use an intrinsic, so the user knows it's loading from an array 
+// layout.
+let Predicates = [IsLE] in {
+  defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+  defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+  defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+}
 
-multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
-                          Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
-                          Instruction INST> {
-  def : Pat<(VTy (vector_insert (VTy VPR64:$src),
-                     (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
-            (VTy (EXTRACT_SUBREG
-                     (INST GPR64xsp:$Rn,
-                           (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
-                           ImmOp:$lane),
-                     sub_64))>;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
+                            Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
+                            Instruction INST> {
+    def : Pat<(VTy (vector_insert (VTy VPR64:$src),
+                       (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))),
+              (VTy (EXTRACT_SUBREG
+                       (INST GPR64xsp:$Rn,
+                             (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64),
+                             ImmOp:$lane),
+                       sub_64))>;
 
-  def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
-                      (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
-            (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+    def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src),
+                        (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))),
+              (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
+  }
 }
 
 // Match all LD1LN instructions
-defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      extloadi8, LD1LN_B>;
+//
+// This will not work as intended in BE mode, if the matcher generates it to
+// load a vector to a lane. (STR q0 stored the elements swapped in BE)
+// Must always use an intrinsic, so the user knows it's loading from an array 
+// layout.
+let Predicates = [IsLE] in {
+  defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                        extloadi8, LD1LN_B>;
 
-defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      extloadi16, LD1LN_H>;
+  defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                        extloadi16, LD1LN_H>;
 
-defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
-defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
+  defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                        load, LD1LN_S>;
+  defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                        load, LD1LN_S>;
 
-defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
-defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
+  defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                        load, LD1LN_D>;
+  defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                        load, LD1LN_D>;
+}
 
 class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
                      Operand ImmOp, string asmop>
@@ -4149,13 +4232,17 @@
 }
 
 // Store single 1-element structure from one lane of 1 register.
+// A single element should be fine in BE - no swapping of elements.
 defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
 
 // Store single N-element structure from one lane of N consecutive registers
 // (N = 2,3,4)
-defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
-defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
-defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+  defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+  defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+}
 
 multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
                           Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
@@ -4172,22 +4259,25 @@
 }
 
 // Match all ST1LN instructions
-defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      truncstorei8, ST1LN_B>;
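+// Restricted to LE. NOTE(review): ST1LN stores a single lane, so the BE issue
+// is presumably the swapped lane order (as for LD1LN above) - confirm.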
+let Predicates = [IsLE] in {
+  defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                        truncstorei8, ST1LN_B>;
 
-defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      truncstorei16, ST1LN_H>;
+  defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                        truncstorei16, ST1LN_H>;
 
-defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
-defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
+  defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                        store, ST1LN_S>;
+  defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                        store, ST1LN_S>;
 
-defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-
+  defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                        store, ST1LN_D>;
+  defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                        store, ST1LN_D>;
+}
 // End of vector load/store single N-element structure (class SIMD lsone).
 
 
@@ -4256,17 +4346,21 @@
 }
 
 // Post-index load single 1-element structure to all lanes of 1 register
+// One-element duplication should be fine in BE - no swapping of elements.
 defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
                              uimm_exact2, uimm_exact4, uimm_exact8>;
 
 // Post-index load single N-element structure to all lanes of N consecutive
 // registers (N = 2,3,4)
-defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
-                             uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
-                             uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
-                             uimm_exact8, uimm_exact16, uimm_exact32>;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+                               uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+                               uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+                               uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
     Constraints = "$Rn = $wb, $Rt = $src",
@@ -4357,18 +4451,22 @@
 }
 
 // Post-index load single 1-element structure to one lane of 1 register.
+// One element from 1 lane is fine in BE - no swapping of elements.
 defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
                                 uimm_exact2, uimm_exact4, uimm_exact8>;
 
 // Post-index load single N-element structure to one lane of N consecutive
 // registers
 // (N = 2,3,4)
-defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+                                  uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+                                  uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+                                  uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 let mayStore = 1, neverHasSideEffects = 1,
     hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
@@ -4459,17 +4557,21 @@
 }
 
 // Post-index store single 1-element structure from one lane of 1 register.
+// One element from 1 lane should be fine in BE - no swapping of elements.
 defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
                                 uimm_exact2, uimm_exact4, uimm_exact8>;
 
 // Post-index store single N-element structure from one lane of N consecutive
 // registers (N = 2,3,4)
-defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+                                  uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+                                  uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+                                  uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 // End of post-index load/store single N-element instructions
 // (class SIMD lsone-post)
Index: test/CodeGen/AArch64/128bit_load_store.ll
===================================================================
--- test/CodeGen/AArch64/128bit_load_store.ll
+++ test/CodeGen/AArch64/128bit_load_store.ll
@@ -1,5 +1,118 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s
 
+define void @test_store_v1i8(<1 x i8>* %ptr, <1 x i8> %val) #0 {
+; CHECK: test_store_v1i8
+; CHECK: str	 {{b[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x i8> %val, <1 x i8>* %ptr, align 8
+  ret void
+}
+
+
+
+define void @test_store_f16(half* %ptr, half %val) #0 {
+; CHECK: test_store_f16
+; CHECK: str	 {{h[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store half %val, half* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v1i16(<1 x i16>* %ptr, <1 x i16> %val) #0 {
+; CHECK: test_store_v1i16
+; CHECK: str	 {{h[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x i16> %val, <1 x i16>* %ptr, align 8
+  ret void
+}
+
+
+
+define void @test_store_f32(float* %ptr, float %val) #0 {
+; CHECK: test_store_f32
+; CHECK: str	 {{s[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store float %val, float* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v1f32(<1 x float>* %ptr, <1 x float> %val) #0 {
+; CHECK: test_store_v1f32
+; CHECK: str	 {{s[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x float> %val, <1 x float>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v1i32(<1 x i32>* %ptr, <1 x i32> %val) #0 {
+; CHECK: test_store_v1i32
+; CHECK: str	 {{s[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x i32> %val, <1 x i32>* %ptr, align 8
+  ret void
+}
+
+
+define void @test_store_f64(double *%ptr, double %val) #0 {
+; CHECK: test_store_f64
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store double %val, double* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v1f64(<1 x double>* %ptr, <1 x double> %val) #0 {
+; CHECK: test_store_v1f64
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x double> %val, <1 x double>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v2f32(<2 x float>* %ptr, <2 x float> %val) #0 {
+; CHECK: test_store_v2f32
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <2 x float> %val, <2 x float>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v1i64(<1 x i64>* %ptr, <1 x i64> %val) #0 {
+; CHECK: test_store_v1i64
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <1 x i64> %val, <1 x i64>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v2i32(<2 x i32>* %ptr, <2 x i32> %val) #0 {
+; CHECK: test_store_v2i32
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <2 x i32> %val, <2 x i32>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v4i16(<4 x i16>* %ptr, <4 x i16> %val) #0 {
+; CHECK: test_store_v4i16
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <4 x i16> %val, <4 x i16>* %ptr, align 8
+  ret void
+}
+
+define void @test_store_v8i8(<8 x i8>* %ptr, <8 x i8> %val) #0 {
+; CHECK: test_store_v8i8
+; CHECK: str	 {{d[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <8 x i8> %val, <8 x i8>* %ptr, align 8
+  ret void
+}
+
+
+
+
 define void @test_store_f128(fp128* %ptr, fp128 %val) #0 {
 ; CHECK: test_store_f128
 ; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
@@ -8,6 +121,54 @@
   ret void
 }
 
+define void @test_store_v2f64(<2 x double>* %ptr, <2 x double> %val) #0 {
+; CHECK: test_store_v2f64
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <2 x double> %val, <2 x double>* %ptr, align 16
+  ret void
+}
+
+define void @test_store_v4f32(<4 x float>* %ptr, <4 x float> %val) #0 {
+; CHECK: test_store_v4f32
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <4 x float> %val, <4 x float>* %ptr, align 16
+  ret void
+}
+
+define void @test_store_v2i64(<2 x i64>* %ptr, <2 x i64> %val) #0 {
+; CHECK: test_store_v2i64
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <2 x i64> %val, <2 x i64>* %ptr, align 16
+  ret void
+}
+
+define void @test_store_v4i32(<4 x i32>* %ptr, <4 x i32> %val) #0 {
+; CHECK: test_store_v4i32
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <4 x i32> %val, <4 x i32>* %ptr, align 16
+  ret void
+}
+
+define void @test_store_v8i16(<8 x i16>* %ptr, <8 x i16> %val) #0 {
+; CHECK: test_store_v8i16
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <8 x i16> %val, <8 x i16>* %ptr, align 16
+  ret void
+}
+
+define void @test_store_v16i8(<16 x i8>* %ptr, <16 x i8> %val) #0 {
+; CHECK: test_store_v16i8
+; CHECK: str	 {{q[0-9]+}}, [{{x[0-9]+}}]
+entry:
+  store <16 x i8> %val, <16 x i8>* %ptr, align 16
+  ret void
+}
+
 define fp128 @test_load_f128(fp128* readonly %ptr) #2 {
 ; CHECK: test_load_f128
 ; CHECK: ldr	 {{q[0-9]+}}, [{{x[0-9]+}}]
Index: test/CodeGen/AArch64/neon-copy.ll
===================================================================
--- test/CodeGen/AArch64/neon-copy.ll
+++ test/CodeGen/AArch64/neon-copy.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck --check-prefix=CHECK --check-prefix=CHECK-LE %s
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck --check-prefix=CHECK --check-prefix=CHECK-BE %s
 
 
 define <16 x i8> @ins16bw(<16 x i8> %tmp1, i8 %tmp2) {
@@ -859,7 +860,8 @@
 
 define <4 x i16> @test_dup_v1i64_v4i16(<1 x i64> %a) {
 ; CHECK-LABEL: test_dup_v1i64_v4i16:
-; CHECK: dup v0.4h, v0.h[0]
+; CHECK-LE: dup v0.4h, v0.h[0]
+; CHECK-BE: dup v0.4h, v0.h[2]
 entry:
   %x = extractelement <1 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i16
@@ -883,7 +885,8 @@
 
 define <8 x i16> @test_dup_v2i64_v8i16(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v8i16:
-; CHECK: dup v0.8h, v0.h[4]
+; CHECK-LE: dup v0.8h, v0.h[4]
+; CHECK-BE: dup v0.8h, v0.h[6]
 entry:
   %x = extractelement <2 x i64> %a, i32 1
   %vget_lane = trunc i64 %x to i16
@@ -926,7 +929,8 @@
 
 define <4 x i16> @test_dup_v2i64_v4i16(<2 x i64> %a) {
 ; CHECK-LABEL: test_dup_v2i64_v4i16:
-; CHECK: dup v0.4h, v0.h[0]
+; CHECK-LE: dup v0.4h, v0.h[0]
+; CHECK-BE: dup v0.4h, v0.h[2]
 entry:
   %x = extractelement <2 x i64> %a, i32 0
   %vget_lane = trunc i64 %x to i16
Index: test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
+++ test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
@@ -4,4 +5,4 @@
 ; CHECK-LABEL: test_ldst1_v16i8:
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <16 x i8>* %ptr
@@ -11,8 +12,8 @@
 
 define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v8i16:
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <8 x i16>* %ptr
   store <8 x i16> %tmp, <8 x i16>* %ptr2
   ret void
@@ -20,8 +21,8 @@
 
 define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v4i32:
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <4 x i32>* %ptr
   store <4 x i32> %tmp, <4 x i32>* %ptr2
   ret void
@@ -29,8 +30,8 @@
 
 define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v2i64:
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <2 x i64>* %ptr
   store <2 x i64> %tmp, <2 x i64>* %ptr2
   ret void
@@ -38,8 +39,8 @@
 
 define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v8i8:
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <8 x i8>* %ptr
   store <8 x i8> %tmp, <8 x i8>* %ptr2
   ret void
@@ -47,8 +48,8 @@
 
 define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v4i16:
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <4 x i16>* %ptr
   store <4 x i16> %tmp, <4 x i16>* %ptr2
   ret void
@@ -56,8 +57,8 @@
 
 define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v2i32:
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <2 x i32>* %ptr
   store <2 x i32> %tmp, <2 x i32>* %ptr2
   ret void
@@ -65,8 +66,8 @@
 
 define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v1i64:
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %tmp = load <1 x i64>* %ptr
   store <1 x i64> %tmp, <1 x i64>* %ptr2
   ret void
Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll
===================================================================
--- test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; A vector TruncStore can not be selected.
@@ -5,7 +6,7 @@
 define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
 ; CHECK-LABEL: truncStore.v2i64:
 ; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %b = trunc <2 x i64> %a to <2 x i32>
   store <2 x i32> %b, <2 x i32>* %result
   ret void
@@ -14,7 +15,7 @@
 define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
 ; CHECK-LABEL: truncStore.v4i32:
 ; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %b = trunc <4 x i32> %a to <4 x i16>
   store <4 x i16> %b, <4 x i16>* %result
   ret void
@@ -23,7 +24,7 @@
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
 ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
   %b = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %b, <8 x i8>* %result
   ret void
@@ -54,4 +55,4 @@
   %vecext = extractelement <4 x i8> %a, i32 0
   %conv = zext i8 %vecext to i32
   ret i32 %conv
-}
\ No newline at end of file
+}