Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -21,6 +21,9 @@
 def HasCrypto        : Predicate<"Subtarget->hasCrypto()">,
                                  AssemblerPredicate<"FeatureCrypto","crypto">;
 
+def IsLE             : Predicate<"Subtarget->isLittle()">;
+def IsBE             : Predicate<"!Subtarget->isLittle()">;
+
 // Use fused MAC if more precision in FP computation is allowed.
 def UseFusedMAC      : Predicate<"(TM.Options.AllowFPOpFusion =="
                                  " FPOpFusion::Fast)">;
@@ -4849,6 +4852,83 @@
   : ls_neutral_pats<LOAD, STORE, Base, Offset, address, sty>,
     ls_atomic_pats<LOAD, STORE, Base, Offset, address, sty, sty>;
 
+
+// Wrappers to instantiate all allowed same-size FP/vector loads and stores.
+
+// NEON-BE: allow all NEON vector types as well, since LD1/ST1 must be disabled:
+// LD1 and ST1 are not ABI-conforming in big-endian (wrong arg memory layout).
+// TODO: eventually also enable for LE
+// (desired by ARM - smaller code due to more powerful addressing modes)
+
+// 8-bit FPR load/store patterns: NEON v1i8 only, and only for big-endian.
+multiclass ls_FPR8_pats<Instruction LOAD, Instruction STORE, 
+                        dag Base, dag Offset, dag address> {
+  let Predicates = [HasNEON, IsBE] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i8>;
+  }
+}
+
+// 16-bit FPR load/store patterns: scalar f16, plus NEON v1i16 on big-endian.
+multiclass ls_FPR16_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  // The scalar FP pattern is instantiated unconditionally.
+  // FIXME(review): should this be guarded by HasFPARMv8?
+  defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f16>;
+
+  // TODO: Eventually enable the vector pattern for little-endian as well.
+  let Predicates = [HasNEON, IsBE] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i16>;
+  }
+}
+
+// 32-bit FPR load/store patterns: scalar f32, plus NEON v1i32 on big-endian.
+multiclass ls_FPR32_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  // The scalar FP pattern is instantiated unconditionally.
+  defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f32>;
+
+  // TODO: Eventually enable the vector pattern for little-endian as well.
+  let Predicates = [HasNEON, IsBE] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i32>;
+    // No v1f32 pattern: that value type does not exist (although v1f64 does).
+  }
+}
+
+// 64-bit FPR load/store patterns: scalar f64, plus all 64-bit NEON vector
+multiclass ls_FPR64_pats<Instruction LOAD, Instruction STORE, 
+                         dag Base, dag Offset, dag address> {
+  // types on big-endian. The scalar FP pattern is instantiated unconditionally.
+  // FIXME(review): should this be guarded by HasFPARMv8?
+  defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f64>;
+
+  // TODO: Eventually enable the vector patterns for little-endian as well.
+  let Predicates = [HasNEON, IsBE] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v8i8>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4i16>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2i32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1i64>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2f32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v1f64>;
+  }
+}
+
+// 128-bit FPR load/store patterns: scalar f128, plus 128-bit NEON vectors on BE.
+multiclass ls_FPR128_pats<Instruction LOAD, Instruction STORE, 
+                          dag Base, dag Offset, dag address> {
+  // The scalar FP pattern is instantiated unconditionally.
+  defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, f128>;
+
+  // TODO: Eventually enable the vector patterns for little-endian as well.
+  let Predicates = [HasNEON, IsBE] in {
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v16i8>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v8i16>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4i32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2i64>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v4f32>;
+    defm : ls_neutral_pats<LOAD, STORE, Base, Offset, address, v2f64>;
+  }
+}
+
 //===------------------------------
 // 2.2. Addressing-mode instantiations
 //===------------------------------
@@ -4892,37 +4972,40 @@
                                    !subst(ALIGN, min_align8, decls.pattern))),
                           i64>;
 
-  defm : ls_neutral_pats<LSFP16_LDR, LSFP16_STR, Base,
+  defm : ls_FPR8_pats<    LSFP8_LDR, LSFP8_STR, Base,
                           !foreach(decls.pattern, Offset,
+                                   !subst(OFFSET, byte_uimm12, decls.pattern)),
+                          !foreach(decls.pattern, address,
+                                   !subst(OFFSET, byte_uimm12,
+                                   !subst(ALIGN, any_align, decls.pattern)))>;
+
+  defm : ls_FPR16_pats<   LSFP16_LDR, LSFP16_STR, Base,
+                          !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, hword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, hword_uimm12,
-                                   !subst(ALIGN, min_align2, decls.pattern))),
-                          f16>;
+                                   !subst(ALIGN, min_align2, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP32_LDR, LSFP32_STR, Base,
+  defm : ls_FPR32_pats<   LSFP32_LDR, LSFP32_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, word_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, word_uimm12,
-                                   !subst(ALIGN, min_align4, decls.pattern))),
-                          f32>;
+                                   !subst(ALIGN, min_align4, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP64_LDR, LSFP64_STR, Base,
+  defm : ls_FPR64_pats<   LSFP64_LDR, LSFP64_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, dword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, dword_uimm12,
-                                   !subst(ALIGN, min_align8, decls.pattern))),
-                          f64>;
+                                   !subst(ALIGN, min_align8, decls.pattern)))>;
 
-  defm : ls_neutral_pats<LSFP128_LDR, LSFP128_STR, Base,
+  defm : ls_FPR128_pats<  LSFP128_LDR, LSFP128_STR, Base,
                           !foreach(decls.pattern, Offset,
                                    !subst(OFFSET, qword_uimm12, decls.pattern)),
                           !foreach(decls.pattern, address,
                                    !subst(OFFSET, qword_uimm12,
-                                   !subst(ALIGN, min_align16, decls.pattern))),
-                          f128>;
+                                   !subst(ALIGN, min_align16, decls.pattern)))>;
 
   defm : load_signed_pats<"B", "", Base,
                           !foreach(decls.pattern, Offset,
@@ -4992,11 +5075,10 @@
   defm : ls_int_neutral_pats<LS32_LDUR, LS32_STUR, Base, Offset, address, i32>;
   defm : ls_int_neutral_pats<LS64_LDUR, LS64_STUR, Base, Offset, address, i64>;
 
-  defm : ls_neutral_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address, f16>;
-  defm : ls_neutral_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address, f32>;
-  defm : ls_neutral_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address, f64>;
-  defm : ls_neutral_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address,
-                         f128>;
+  defm : ls_FPR16_pats<LSFP16_LDUR, LSFP16_STUR, Base, Offset, address>;
+  defm : ls_FPR32_pats<LSFP32_LDUR, LSFP32_STUR, Base, Offset, address>;
+  defm : ls_FPR64_pats<LSFP64_LDUR, LSFP64_STUR, Base, Offset, address>;
+  defm : ls_FPR128_pats<LSFP128_LDUR, LSFP128_STUR, Base, Offset, address>;
 
   def : Pat<(i64 (zextloadi32 address)),
             (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -3379,15 +3379,27 @@
 }
 
 // Load multiple N-element structure to N consecutive registers (N = 1,2,3,4)
-defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
+
+// LD1 is disallowed in BE, where LDR and STR are used exclusively (this also
+// saves address adds thanks to their 12-bit offsets).
+// Reason: LDR/STR use a different memory/register layout (no element swaps).
+// If different kinds of load were used on the same memory address, the results
+// would be inconsistent. The only allowed use of LD1 is in initializations
+// that use explicit intrinsics to do the element swaps.
+
+// FIXME: LD1_1D should work in BE (single operand, no swaps) yet is LE-only.
+let Predicates = [IsLE] in {
 def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
 
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+  defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
 
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+  defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
 
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+  defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
 
+  defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+}
+
 // Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
 defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
 def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
@@ -3433,72 +3445,77 @@
 }
 
 // Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+// ARM ABI: the default memory layout in BE is that of LDR/STR
+// (LD1/ST1 swap elements and are incompatible with the ABI).
+let Predicates = [IsLE] in {
 def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
 
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+  defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
 
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+  defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
 
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+  defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
 
+  defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+
 // Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+  defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+  def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
 
-defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+  defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+  def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
 
-defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+  defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+  def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
 
-def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+  def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+  def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
 
-def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+  def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
+  def : Pat<(v4i32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
 
-def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
-def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
+  def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>;
+  def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>;
 
-def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
-def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+  def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
+  def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>;
 
-def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
-def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+  def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
+  def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>;
 
-def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
-def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
+  def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>;
+  def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>;
 
-def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
-          (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr),
+            (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr),
+            (ST1_2D GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
-          (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr),
+            (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr),
+            (ST1_4S GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
-          (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
-def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
-          (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr),
+            (ST1_8H GPR64xsp:$addr, VPR128:$value)>;
+  def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr),
+            (ST1_16B GPR64xsp:$addr, VPR128:$value)>;
 
-def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
-          (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr),
+            (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr),
+            (ST1_1D GPR64xsp:$addr, VPR64:$value)>;
 
-def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
-          (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr),
+            (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr),
+            (ST1_2S GPR64xsp:$addr, VPR64:$value)>;
 
-def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
-          (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
-def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
-          (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr),
+            (ST1_4H GPR64xsp:$addr, VPR64:$value)>;
+  def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr),
+            (ST1_8B GPR64xsp:$addr, VPR64:$value)>;
+}
 
 // Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store.
 // FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal,
@@ -3681,34 +3698,36 @@
                               ImmTy2, asmop>;
 }
 
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+let Predicates = [IsLE] in {
 defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "ld1">;
+                                "ld1">;
+  // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+  defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
 
-defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+  defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
 
-defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "ld3">;
+  defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                               "ld3">;
 
-defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+  defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
 
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "ld1">;
-defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "ld1">;
+  // Post-index load multiple 1-element structures from N consecutive registers
+  // (N = 2,3,4)
+  defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                                 "ld1">;
+  defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                     uimm_exact16, "ld1">;
 
-defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "ld1">;
-defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "ld1">;
+  defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                                 "ld1">;
+  defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                     uimm_exact24, "ld1">;
 
-defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                                "ld1">;
-defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "ld1">;
+  defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                 "ld1">;
+  defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                     uimm_exact32, "ld1">;
+}
 
 multiclass NeonI_STWB_VList<bit q, bits<4> opcode, bits<2> size,
                             RegisterOperand VecList, Operand ImmTy,
@@ -3764,33 +3783,35 @@
 }
 
 // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
+let Predicates = [IsLE] in {
 defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "st1">;
+                                 "st1">;       
+  defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
 
-defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+  defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
 
-defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-                             "st3">;
+  defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+                               "st3">;
 
-defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+  defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
 
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-                               "st1">;
-defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                                   uimm_exact16, "st1">;
+  // Post-index store multiple 1-element structures from N consecutive
+  // registers (N = 2,3,4)
+  defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                                 "st1">;
+  defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                                     uimm_exact16, "st1">;
 
-defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-                               "st1">;
-defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                                   uimm_exact24, "st1">;
+  defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                                 "st1">;
+  defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                                     uimm_exact24, "st1">;
 
-defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-                               "st1">;
-defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                                   uimm_exact32, "st1">;
+  defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                                 "st1">;
+  defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                                     uimm_exact32, "st1">;
+}
 
 // End of post-index vector load/store multiple N-element structure
 // (class SIMD lselem-post)
@@ -3864,14 +3885,16 @@
                           !cast<RegisterOperand>(List # "2D_operand"), asmop>;
 }
 
+let Predicates = [IsLE] in {
 // Load single 1-element structure to all lanes of 1 register
 defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
 
-// Load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
-defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
-defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+  // Load single N-element structure to all lanes of N consecutive
+  // registers (N = 2,3,4)
+  defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+  defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+  defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+}
 
 
 class LD1R_pattern <ValueType VTy, ValueType DTy, PatFrag LoadOp,
@@ -3880,31 +3903,37 @@
           (VTy (INST GPR64xsp:$Rn))>;
 
 // Match all LD1R instructions
+
+let Predicates = [IsLE] in {
+// element swap on bytes == byte swap
 def : LD1R_pattern<v8i8, i32, extloadi8, LD1R_8B>;
-
 def : LD1R_pattern<v16i8, i32, extloadi8, LD1R_16B>;
 
-def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
+  def : LD1R_pattern<v4i16, i32, extloadi16, LD1R_4H>;
 
-def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
+  def : LD1R_pattern<v8i16, i32, extloadi16, LD1R_8H>;
 
-def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
-def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
+  def : LD1R_pattern<v2i32, i32, load, LD1R_2S>;
+  def : LD1R_pattern<v2f32, f32, load, LD1R_2S>;
 
-def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
-def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
+  def : LD1R_pattern<v4i32, i32, load, LD1R_4S>;
+  def : LD1R_pattern<v4f32, f32, load, LD1R_4S>;
 
-def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
-def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+  def : LD1R_pattern<v2i64, i64, load, LD1R_2D>;
+  def : LD1R_pattern<v2f64, f64, load, LD1R_2D>;
+}
 
 class LD1R_pattern_v1 <ValueType VTy, ValueType DTy, PatFrag LoadOp,
                        Instruction INST>
   : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))),
         (VTy (INST GPR64xsp:$Rn))>;
 
-def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
-def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
+let Predicates = [IsLE] in {
+  def : LD1R_pattern_v1<v1i64, i64, load, LD1R_1D>;
+  def : LD1R_pattern_v1<v1f64, f64, load, LD1R_1D>;
+}
 
+
 multiclass VectorList_Bare_BHSD<string PREFIX, int Count,
                                 RegisterClass RegList> {
   defm B : VectorList_operands<PREFIX, "B", Count, RegList>;
@@ -3964,14 +3993,16 @@
   }
 }
 
-// Load single 1-element structure to one lane of 1 register.
-defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
+let Predicates = [IsLE] in {
+  // Load single 1-element structure to one lane of 1 register.
+  defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">;
 
-// Load single N-element structure to one lane of N consecutive registers
-// (N = 2,3,4)
-defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
-defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
-defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+  // Load single N-element structure to one lane of N consecutive registers
+  // (N = 2,3,4)
+  defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">;
+  defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">;
+  defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">;
+}
 
 multiclass LD1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
                           Operand ImmOp, Operand ImmOp2, PatFrag LoadOp,
@@ -3989,22 +4020,24 @@
             (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>;
 }
 
-// Match all LD1LN instructions
-defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      extloadi8, LD1LN_B>;
+let Predicates = [IsLE] in {
+  // Match all LD1LN instructions
+  defm : LD1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                        extloadi8, LD1LN_B>;
 
-defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      extloadi16, LD1LN_H>;
+  defm : LD1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                        extloadi16, LD1LN_H>;
 
-defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
-defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      load, LD1LN_S>;
+  defm : LD1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                        load, LD1LN_S>;
+  defm : LD1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                        load, LD1LN_S>;
 
-defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
-defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      load, LD1LN_D>;
+  defm : LD1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                        load, LD1LN_D>;
+  defm : LD1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                        load, LD1LN_D>;
+}
 
 class NeonI_STN_Lane<bit r, bits<2> op2_1, bit op0, RegisterOperand VList,
                      Operand ImmOp, string asmop>
@@ -4048,14 +4081,16 @@
   }
 }
 
+let Predicates = [IsLE] in {
 // Store single 1-element structure from one lane of 1 register.
 defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">;
 
-// Store single N-element structure from one lane of N consecutive registers
-// (N = 2,3,4)
-defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
-defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
-defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+  // Store single N-element structure from one lane of N consecutive registers
+  // (N = 2,3,4)
+  defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">;
+  defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">;
+  defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">;
+}
 
 multiclass ST1LN_patterns<ValueType VTy, ValueType VTy2, ValueType DTy,
                           Operand ImmOp, Operand ImmOp2, PatFrag StoreOp,
@@ -4071,23 +4106,24 @@
             (INST GPR64xsp:$Rn, VPR128:$Rt, ImmOp2:$lane)>;
 }
 
-// Match all ST1LN instructions
-defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
-                      truncstorei8, ST1LN_B>;
+let Predicates = [IsLE] in {
+  // Match all ST1LN instructions
+  defm : ST1LN_patterns<v8i8, v16i8, i32, neon_uimm3_bare, neon_uimm4_bare,
+                        truncstorei8, ST1LN_B>;
 
-defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
-                      truncstorei16, ST1LN_H>;
+  defm : ST1LN_patterns<v4i16, v8i16, i32, neon_uimm2_bare, neon_uimm3_bare,
+                        truncstorei16, ST1LN_H>;
 
-defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
-defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
-                      store, ST1LN_S>;
+  defm : ST1LN_patterns<v2i32, v4i32, i32, neon_uimm1_bare, neon_uimm2_bare,
+                        store, ST1LN_S>;
+  defm : ST1LN_patterns<v2f32, v4f32, f32, neon_uimm1_bare, neon_uimm2_bare,
+                        store, ST1LN_S>;
 
-defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
-                      store, ST1LN_D>;
-
+  defm : ST1LN_patterns<v1i64, v2i64, i64, neon_uimm0_bare, neon_uimm1_bare,
+                        store, ST1LN_D>;
+  defm : ST1LN_patterns<v1f64, v2f64, f64, neon_uimm0_bare, neon_uimm1_bare,
+                        store, ST1LN_D>;
+}
 // End of vector load/store single N-element structure (class SIMD lsone).
 
 
@@ -4153,18 +4189,20 @@
                               uimm_d, asmop>;
 }
 
+let Predicates = [IsLE] in {
 // Post-index load single 1-element structure to all lanes of 1 register
 defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1,
                              uimm_exact2, uimm_exact4, uimm_exact8>;
 
-// Post-index load single N-element structure to all lanes of N consecutive
-// registers (N = 2,3,4)
-defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
-                             uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
-                             uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
-                             uimm_exact8, uimm_exact16, uimm_exact32>;
+  // Post-index load single N-element structure to all lanes of N consecutive
+  // registers (N = 2,3,4)
+  defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2,
+                               uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3,
+                               uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4,
+                               uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1,
     Constraints = "$Rn = $wb, $Rt = $src",
@@ -4253,18 +4291,20 @@
 }
 
 // Post-index load single 1-element structure to one lane of 1 register.
+let Predicates = [IsLE] in {
 defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1,
                                 uimm_exact2, uimm_exact4, uimm_exact8>;
 
-// Post-index load single N-element structure to one lane of N consecutive
-// registers
-// (N = 2,3,4)
-defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
+  // Post-index load single N-element structure to one lane of N consecutive
+  // registers
+  // (N = 2,3,4)
+  defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2,
+                                  uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3,
+                                  uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4,
+                                  uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 let mayStore = 1, neverHasSideEffects = 1,
     hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb",
@@ -4353,17 +4393,19 @@
 }
 
 // Post-index store single 1-element structure from one lane of 1 register.
+let Predicates = [IsLE] in {
 defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1,
                                 uimm_exact2, uimm_exact4, uimm_exact8>;
 
 // Post-index store single N-element structure from one lane of N consecutive
 // registers (N = 2,3,4)
-defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
-                                uimm_exact4, uimm_exact8, uimm_exact16>;
-defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
-                                uimm_exact6, uimm_exact12, uimm_exact24>;
-defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
-                                uimm_exact8, uimm_exact16, uimm_exact32>;
+  defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2,
+                                  uimm_exact4, uimm_exact8, uimm_exact16>;
+  defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3,
+                                  uimm_exact6, uimm_exact12, uimm_exact24>;
+  defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4,
+                                  uimm_exact8, uimm_exact16, uimm_exact32>;
+}
 
 // End of post-index load/store single N-element instructions
 // (class SIMD lsone-post)