Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -21,6 +21,9 @@
 def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
                 AssemblerPredicate<"FeatureCrypto","crypto">;
+def IsLE : Predicate<"Subtarget->isLittle()">;
+def IsBE : Predicate<"!Subtarget->isLittle()">;
+
 // Use fused MAC if more precision in FP computation is allowed.
 def UseFusedMAC : Predicate<"(TM.Options.AllowFPOpFusion =="
                             " FPOpFusion::Fast)">;
@@ -4849,6 +4852,85 @@
   : ls_neutral_pats, ls_atomic_pats;
+
+// Wrappers to instantiate all allowed same-size fp/vector loads and stores
+
+// NEON-BE: allow all NEON vectors as well, since ld1/st1 must be disabled.
+// LD1 & ST1 are not ABI-conforming in big endian: wrong argument memory layout.
+// http://infocenter.arm.com/help/topic/com.arm.doc.ihi0055b/IHI0055B_aapcs64.pdf
+// section 4.1.2, 2nd paragraph: LDR/STR layout
+// "on a big-endian system element 0 will contain the highest-addressed
+// element of a short vector."
+// FIXME: eventually also enable for LE
+// (desired by ARM - smaller code due to more powerful addressing modes)
+
+// NEON 8-bit types
+multiclass ls_FPR8_pats {
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats;
+  }
+}
+
+// NEON 16-bit types
+multiclass ls_FPR16_pats {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats;
+  }
+}
+
+// NEON 32-bit types
+multiclass ls_FPR32_pats {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats;
+//  defm : ls_neutral_pats; does not exist - v1f64 does. Why?
+  }
+}
+
+// NEON 64-bit types
+multiclass ls_FPR64_pats {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+  }
+}
+
+// NEON 128-bit types FPR128
+multiclass ls_FPR128_pats {
+  let Predicates = [HasFPARMv8] in {
+    defm : ls_neutral_pats;
+  }
+
+  let Predicates = [HasNEON] in {
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+    defm : ls_neutral_pats;
+  }
+}
+
 //===------------------------------
 // 2.2. Addressing-mode instantiations
 //===------------------------------
@@ -4892,37 +4974,40 @@
                                 !subst(ALIGN, min_align8, decls.pattern))), i64>;
-  defm : ls_neutral_pats;
+
+  defm : ls_FPR16_pats< LSFP16_LDR, LSFP16_STR, Base,
+                        !foreach(decls.pattern, Offset,
+                                 !subst(OFFSET, hword_uimm12, decls.pattern)),
                         !foreach(decls.pattern, address,
                                  !subst(OFFSET, hword_uimm12,
-                                        !subst(ALIGN, min_align2, decls.pattern))),
-                        f16>;
+                                        !subst(ALIGN, min_align2, decls.pattern)))>;
-  defm : ls_neutral_pats;
+                                        !subst(ALIGN, min_align4, decls.pattern)))>;
-  defm : ls_neutral_pats;
+                                        !subst(ALIGN, min_align8, decls.pattern)))>;
-  defm : ls_neutral_pats;
+                                        !subst(ALIGN, min_align16, decls.pattern)))>;
   defm : load_signed_pats<"B", "", Base,
                           !foreach(decls.pattern, Offset,
@@ -4992,11 +5077,10 @@
   defm : ls_int_neutral_pats;
   defm : ls_int_neutral_pats;
-  defm : ls_neutral_pats;
-  defm : ls_neutral_pats;
-  defm : ls_neutral_pats;
-  defm : ls_neutral_pats;
+  defm : ls_FPR16_pats;
+  defm : ls_FPR32_pats;
+  defm : ls_FPR64_pats;
+  defm : ls_FPR128_pats;
   def : Pat<(i64 (zextloadi32 address)),
             (SUBREG_TO_REG (i64 0), (LS32_LDUR Base, Offset), sub_32)>;
Index: lib/Target/AArch64/AArch64InstrNEON.td
===================================================================
--- lib/Target/AArch64/AArch64InstrNEON.td
+++ lib/Target/AArch64/AArch64InstrNEON.td
@@ -104,6 +104,14 @@
   defm : ls_128_pats;
 }
+// LDR is only valid for little endian.
+// In BE, LDR needs correctly byte-swapped 128-bit literals, so simple array
+// initializers won't work right now.
+// Big-endian must - for now - do the element swaps using vector intrinsics.
+// That costs an additional "add offset12" instruction there.
+// According to ARM, BE & LE should use intrinsics for initialization.
+// That's also the only portable code.
+// FIXME: BE could use vector-literal swapping before the emit pass.
 defm : uimm12_neon_pats<(A64WrapperSmall tconstpool:$Hi, tconstpool:$Lo12, ALIGN),
                         (ADRPxi tconstpool:$Hi), (i64 tconstpool:$Lo12)>;
@@ -3341,9 +3349,31 @@
 // the three 64-bit vectors list {BA, DC, FE}.
 // E.g. LD3_2S will load 32-bit elements {A, B, C, D, E, F} into the three
 // 64-bit vectors list {DA, EB, FC}.
-// Store instructions store multiple structure to N registers like load.
+// Store instructions store multiple structures from N registers, like loads.
+//
+// Problem for Big Endian (BE):
+// LD1/ST1 do "array" loads/stores - reading elements from ascending addresses
+// into ascending indexes in the register; in big-endian mode, byte-swapping is
+// done per element. (Hence LD1 & Co are sometimes referred to as "array loads".)
+//
+// LDR/STR read the whole register, doing byte-swapping on the whole register
+// in big-endian mode.
+//
+// Obviously the two layouts differ by reversing the elements, so they can't be
+// mixed without explicit element-swap operations in BE.
+//
+// The only overlap is reading single elements to registers:
+// LDR i128/f128 - doing byte-swapping for the whole register.
+// LD1/ST1 i128/f128 - also doing byte-swapping within the 128-bit element.
+// Analogously for stores.
+// For this reason there are IsLE guards around the respective patterns, or -
+// when no patterns are defined yet - around the instruction definition.
+// In a PBQP matcher, one would add a separate set of "reversed" nonterminals
+// with the element swap operations as chain rules - and let the matcher find
+// the optimal coverage. FIXME: How to do that here?
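To make the layout conflict above concrete, here is a small editorial C sketch (not part of the patch; the helper names `be32`, `ld1_4s`, and `ldr_q_be` are invented for illustration). It models the two load semantics on a big-endian core: LD1 fills ascending lanes from ascending addresses with per-element byte swaps, while LDR byte-reverses the whole 128-bit register, so lane 0 ends up holding the highest-addressed element, exactly as the AAPCS64 quote says:

```c
#include <stdint.h>
#include <stdio.h>

/* Read 4 bytes at p as a big-endian 32-bit value. */
static uint32_t be32(const uint8_t *p) {
    return ((uint32_t)p[0] << 24) | ((uint32_t)p[1] << 16) |
           ((uint32_t)p[2] << 8)  |  (uint32_t)p[3];
}

/* LD1 {v.4s} on BE: "array load" - ascending addresses fill ascending
 * lanes; byte-swapping happens per 32-bit element.                    */
static void ld1_4s(uint32_t lane[4], const uint8_t mem[16]) {
    for (int i = 0; i < 4; i++)
        lane[i] = be32(mem + 4 * i);
}

/* LDR q on BE: the whole 128-bit register is byte-reversed; lane 0 is
 * the least significant word, i.e. the highest-addressed element.     */
static void ldr_q_be(uint32_t lane[4], const uint8_t mem[16]) {
    for (int i = 0; i < 4; i++)
        lane[i] = be32(mem + 4 * (3 - i));
}

int main(void) {
    uint8_t mem[16];                   /* memory image 00 01 02 ... 0f */
    for (int i = 0; i < 16; i++)
        mem[i] = (uint8_t)i;

    uint32_t a[4], b[4];
    ld1_4s(a, mem);
    ldr_q_be(b, mem);
    for (int i = 0; i < 4; i++)
        printf("lane %d: ld1 %08x  ldr %08x\n", i, a[i], b[i]);
    return 0;
}
```

Lane 0 comes out as 00010203 under the LD1 model but 0c0d0e0f under the LDR model: the same element values in opposite lane order. Mixing the two forms on the same memory would therefore reshuffle a vector silently, which is why the patterns below stay LDR/STR-only in BE and LD1/ST1 is reserved for explicit intrinsics that account for the array layout.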
+
 class NeonI_LDVList opcode, bits<2> size, RegisterOperand VecList, string asmop> : NeonI_LdStMult;
+
+// LD1 is disallowed in BE, where LDR and STR are used exclusively as per the ABI.
+// Reason: LDR/STR use a different memory/register layout (no element swaps).
+// If different types of loads were used from the same memory address, the
+// results would be inconsistent.
+// The only allowed use of LD1 is in initializations, using explicit intrinsics
+// to do the element swaps.
+
+// A single element has no swapping problem in BE.
 def LD1_1D : NeonI_LDVList<0, 0b0111, 0b11, VOne1D_operand, "ld1">;
-defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm LD1 : LDVList_BHSD<0b0111, "VOne", "ld1">;
-defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+  defm LD2 : LDVList_BHSD<0b1000, "VPair", "ld2">;
-defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+  defm LD3 : LDVList_BHSD<0b0100, "VTriple", "ld3">;
+  defm LD4 : LDVList_BHSD<0b0000, "VQuad", "ld4">;
+}
+
 // Load multiple 1-element structure to N consecutive registers (N = 2,3,4)
 defm LD1x2 : LDVList_BHSD<0b1010, "VPair", "ld1">;
 def LD1x2_1D : NeonI_LDVList<0, 0b1010, 0b11, VPair1D_operand, "ld1">;
@@ -3433,73 +3476,79 @@
 }
 // Store multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
+// ARM ABI: the default memory layout in BE is the LDR/STR layout.
+// A single element has no swapping problem in BE.
 def ST1_1D : NeonI_STVList<0, 0b0111, 0b11, VOne1D_operand, "st1">;
-defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  defm ST1 : STVList_BHSD<0b0111, "VOne", "st1">;
-defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
+  defm ST2 : STVList_BHSD<0b1000, "VPair", "st2">;
-defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
+  defm ST3 : STVList_BHSD<0b0100, "VTriple", "st3">;
-// Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
-defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
-def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
+  defm ST4 : STVList_BHSD<0b0000, "VQuad", "st4">;
-defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
-def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
+  // Store multiple 1-element structures from N consecutive registers (N = 2,3,4)
+  defm ST1x2 : STVList_BHSD<0b1010, "VPair", "st1">;
+  def ST1x2_1D : NeonI_STVList<0, 0b1010, 0b11, VPair1D_operand, "st1">;
-defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
-def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
+  defm ST1x3 : STVList_BHSD<0b0110, "VTriple", "st1">;
+  def ST1x3_1D : NeonI_STVList<0, 0b0110, 0b11, VTriple1D_operand, "st1">;
-def : Pat<(v2f64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
-def : Pat<(v2i64 (load GPR64xsp:$addr)), (LD1_2D GPR64xsp:$addr)>;
+  defm ST1x4 : STVList_BHSD<0b0010, "VQuad", "st1">;
+  def ST1x4_1D : NeonI_STVList<0, 0b0010, 0b11, VQuad1D_operand, "st1">;
-def : Pat<(v4f32 (load GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>;
-def : Pat<(v4i32 (load
GPR64xsp:$addr)), (LD1_4S GPR64xsp:$addr)>; -def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v8i16 (load GPR64xsp:$addr)), (LD1_8H GPR64xsp:$addr)>; + def : Pat<(v16i8 (load GPR64xsp:$addr)), (LD1_16B GPR64xsp:$addr)>; -def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v1f64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; + def : Pat<(v1i64 (load GPR64xsp:$addr)), (LD1_1D GPR64xsp:$addr)>; -def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; -def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; + def : Pat<(v2f32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; + def : Pat<(v2i32 (load GPR64xsp:$addr)), (LD1_2S GPR64xsp:$addr)>; -def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), - (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(v4i16 (load GPR64xsp:$addr)), (LD1_4H GPR64xsp:$addr)>; + def : Pat<(v8i8 (load GPR64xsp:$addr)), (LD1_8B GPR64xsp:$addr)>; -def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), - (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2i64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v2f64 VPR128:$value), GPR64xsp:$addr), + (ST1_2D GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), - (ST1_8H GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), - (ST1_16B GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4i32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v4f32 VPR128:$value), GPR64xsp:$addr), + (ST1_4S GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), - (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v8i16 VPR128:$value), GPR64xsp:$addr), + (ST1_8H GPR64xsp:$addr, VPR128:$value)>; + def : Pat<(store (v16i8 VPR128:$value), GPR64xsp:$addr), + (ST1_16B GPR64xsp:$addr, VPR128:$value)>; -def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), - (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1i64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v1f64 VPR64:$value), GPR64xsp:$addr), + (ST1_1D GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), - (ST1_4H GPR64xsp:$addr, VPR64:$value)>; -def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), - (ST1_8B GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2i32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v2f32 VPR64:$value), GPR64xsp:$addr), + (ST1_2S GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v4i16 VPR64:$value), GPR64xsp:$addr), + (ST1_4H GPR64xsp:$addr, VPR64:$value)>; + def : Pat<(store (v8i8 VPR64:$value), GPR64xsp:$addr), + (ST1_8B GPR64xsp:$addr, VPR64:$value)>; +} + // Match load/store of v1i8/v1i16/v1i32 type to FPR8/FPR16/FPR32 load/store. 
// FIXME: for now we have v1i8, v1i16, v1i32 legal types, if they are illegal,
// these patterns are not needed any more.
@@ -3681,35 +3730,40 @@
                        ImmTy2, asmop>;
 }
-// Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
+// Single element loads are OK for BE.
 defm LD1WB_1D : NeonI_LDWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                                 "ld1">;
+                                 "ld1">;
-defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
+// Multiple elements would be reversed in BE.
+let Predicates = [IsLE] in {
+  // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
+  defm LD1WB : LDWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "ld1">;
-defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-             "ld3">;
+  defm LD2WB : LDWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "ld2">;
-defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
+  defm LD3WB : LDWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+               "ld3">;
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-               "ld1">;
-defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                  uimm_exact16, "ld1">;
+  defm LD4WB : LDWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "ld4">;
-defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-               "ld1">;
-defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                  uimm_exact24, "ld1">;
+  // Post-index load multiple 1-element structures to N consecutive registers
+  // (N = 2,3,4)
+  defm LD1x2WB : LDWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                 "ld1">;
+  defm LD1x2WB_1D : NeonI_LDWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                    uimm_exact16, "ld1">;
-defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-               "ld1">;
-defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                  uimm_exact32, "ld1">;
+  defm LD1x3WB : LDWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                 "ld1">;
+  defm LD1x3WB_1D : NeonI_LDWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                    uimm_exact24, "ld1">;
+  defm LD1x4WB : LDWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                 "ld1">;
+  defm LD1x4WB_1D : NeonI_LDWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                    uimm_exact32, "ld1">;
+}
+
 multiclass NeonI_STWB_VList opcode, bits<2> size,
                             RegisterOperand VecList, Operand ImmTy,
                             string asmop> {
@@ -3764,33 +3818,36 @@
 }
 // Post-index load multiple N-element structures from N registers (N = 1,2,3,4)
-defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
-defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
-                "st1">;
+// Storing multiple elements in BE mode suffers from element reversal.
+let Predicates = [IsLE] in {
+  defm ST1WB_1D : NeonI_STWB_VList<0, 0b0111, 0b11, VOne1D_operand, uimm_exact8,
+                  "st1">;
+  defm ST1WB : STWB_VList_BHSD<0b0111, "VOne", uimm_exact8, uimm_exact16, "st1">;
-defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
+  defm ST2WB : STWB_VList_BHSD<0b1000, "VPair", uimm_exact16, uimm_exact32, "st2">;
-defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
-             "st3">;
+  defm ST3WB : STWB_VList_BHSD<0b0100, "VTriple", uimm_exact24, uimm_exact48,
+               "st3">;
-defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
+  defm ST4WB : STWB_VList_BHSD<0b0000, "VQuad", uimm_exact32, uimm_exact64, "st4">;
-// Post-index load multiple 1-element structures from N consecutive registers
-// (N = 2,3,4)
-defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
-               "st1">;
-defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
-                  uimm_exact16, "st1">;
+  // Post-index store multiple 1-element structures from N consecutive registers
+  // (N = 2,3,4)
+  defm ST1x2WB : STWB_VList_BHSD<0b1010, "VPair", uimm_exact16, uimm_exact32,
+                 "st1">;
+  defm ST1x2WB_1D : NeonI_STWB_VList<0, 0b1010, 0b11, VPair1D_operand,
+                    uimm_exact16, "st1">;
-defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
-               "st1">;
-defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
-                  uimm_exact24, "st1">;
+  defm ST1x3WB : STWB_VList_BHSD<0b0110, "VTriple", uimm_exact24, uimm_exact48,
+                 "st1">;
+  defm ST1x3WB_1D : NeonI_STWB_VList<0, 0b0110, 0b11, VTriple1D_operand,
+                    uimm_exact24, "st1">;
-defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
-               "st1">;
-defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
-                  uimm_exact32, "st1">;
+  defm ST1x4WB : STWB_VList_BHSD<0b0010, "VQuad", uimm_exact32, uimm_exact64,
+                 "st1">;
+  defm ST1x4WB_1D : NeonI_STWB_VList<0, 0b0010, 0b11, VQuad1D_operand,
+                    uimm_exact32, "st1">;
+}
 // End of post-index vector load/store multiple N-element structure
 // (class SIMD lselem-post)
@@ -3865,13 +3922,17 @@
 }
 // Load single 1-element structure to all lanes of 1 register
+// Single element loads are fine in BE.
 defm LD1R : LDN_Dup_BHSD<0b0, 0b110, "VOne", "ld1r">;
 // Load single N-element structure to all lanes of N consecutive
 // registers (N = 2,3,4)
-defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
-defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
-defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+// Multi-element loads suffer from element reversal in BE.
+let Predicates = [IsLE] in {
+  defm LD2R : LDN_Dup_BHSD<0b1, 0b110, "VPair", "ld2r">;
+  defm LD3R : LDN_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r">;
+  defm LD4R : LDN_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r">;
+}
 class LD1R_pattern ;
 // Match all LD1R instructions
-def : LD1R_pattern;
+// This won't work as intended in BE mode, as STR q0 stores the elements swapped.
+let Predicates = [IsLE] in { + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; -def : LD1R_pattern; -def : LD1R_pattern; + def : LD1R_pattern; + def : LD1R_pattern; +} -def : LD1R_pattern; -def : LD1R_pattern; - class LD1R_pattern_v1 : Pat<(VTy (scalar_to_vector (DTy (LoadOp GPR64xsp:$Rn)))), (VTy (INST GPR64xsp:$Rn))>; +// Single element operations are swap-safe in BE. def : LD1R_pattern_v1; def : LD1R_pattern_v1; + multiclass VectorList_Bare_BHSD { defm B : VectorList_operands; @@ -3965,46 +4030,64 @@ } // Load single 1-element structure to one lane of 1 register. +// No dangerous element swaps in BE. :-) defm LD1LN : LDN_Lane_BHSD<0b0, 0b0, "VOne", "ld1">; // Load single N-element structure to one lane of N consecutive registers // (N = 2,3,4) -defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; -defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; -defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; +// +// This will not work as intended in BE mode, if the matcher generates it to +// load a vector to a lane. (STR q0 stored the vector's elements swapped) +// Must always use an intrinsic, so the user knows it's loading from an array +// layout. +let Predicates = [IsLE] in { + defm LD2LN : LDN_Lane_BHSD<0b1, 0b0, "VPair", "ld2">; + defm LD3LN : LDN_Lane_BHSD<0b0, 0b1, "VTriple", "ld3">; + defm LD4LN : LDN_Lane_BHSD<0b1, 0b1, "VQuad", "ld4">; +} -multiclass LD1LN_patterns { - def : Pat<(VTy (vector_insert (VTy VPR64:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), - (VTy (EXTRACT_SUBREG - (INST GPR64xsp:$Rn, - (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), - ImmOp:$lane), - sub_64))>; - - def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), - (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), - (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + multiclass LD1LN_patterns { + def : Pat<(VTy (vector_insert (VTy VPR64:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp:$lane))), + (VTy (EXTRACT_SUBREG + (INST GPR64xsp:$Rn, + (SUBREG_TO_REG (i64 0), VPR64:$src, sub_64), + ImmOp:$lane), + sub_64))>; + + def : Pat<(VTy2 (vector_insert (VTy2 VPR128:$src), + (DTy (LoadOp GPR64xsp:$Rn)), (ImmOp2:$lane))), + (VTy2 (INST GPR64xsp:$Rn, VPR128:$src, ImmOp2:$lane))>; + } } // Match all LD1LN instructions -defm : LD1LN_patterns; +// +// This will not work as intended in BE mode, if the matcher generates it to +// load a vector to a lane. (STR q0 stored the elements swapped in BE) +// Must always use an intrinsic, so the user knows it's loading from an array +// layout. +let Predicates = [IsLE] in { + defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; -defm : LD1LN_patterns; -defm : LD1LN_patterns; + defm : LD1LN_patterns; + defm : LD1LN_patterns; +} class NeonI_STN_Lane op2_1, bit op0, RegisterOperand VList, Operand ImmOp, string asmop> @@ -4049,13 +4132,17 @@ } // Store single 1-element structure from one lane of 1 register. +// single element should be fine in BE - no swapping of elements. 
defm ST1LN : STN_Lane_BHSD<0b0, 0b0, "VOne", "st1">; // Store single N-element structure from one lane of N consecutive registers // (N = 2,3,4) -defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; -defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; -defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm ST2LN : STN_Lane_BHSD<0b1, 0b0, "VPair", "st2">; + defm ST3LN : STN_Lane_BHSD<0b0, 0b1, "VTriple", "st3">; + defm ST4LN : STN_Lane_BHSD<0b1, 0b1, "VQuad", "st4">; +} multiclass ST1LN_patterns; +// +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; + defm : ST1LN_patterns; + defm : ST1LN_patterns; -defm : ST1LN_patterns; -defm : ST1LN_patterns; - + defm : ST1LN_patterns; + defm : ST1LN_patterns; +} // End of vector load/store single N-element structure (class SIMD lsone). @@ -4154,17 +4244,21 @@ } // Post-index load single 1-element structure to all lanes of 1 register +// one element duplication should be fine in BE - no swapping of elements. defm LD1R_WB : LDWB_Dup_BHSD<0b0, 0b110, "VOne", "ld1r", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index load single N-element structure to all lanes of N consecutive // registers (N = 2,3,4) -defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm LD2R_WB : LDWB_Dup_BHSD<0b1, 0b110, "VPair", "ld2r", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3R_WB : LDWB_Dup_BHSD<0b0, 0b111, "VTriple", "ld3r", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4R_WB : LDWB_Dup_BHSD<0b1, 0b111, "VQuad", "ld4r", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayLoad = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb, $Rt = $src", @@ -4253,18 +4347,22 @@ } // Post-index load single 1-element structure to one lane of 1 register. +// One element from 1 lane is fine in BE - no swapping of elements. defm LD1LN_WB : LD_Lane_WB_BHSD<0b0, 0b0, "VOne", "ld1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index load single N-element structure to one lane of N consecutive // registers // (N = 2,3,4) -defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. 
+let Predicates = [IsLE] in { + defm LD2LN_WB : LD_Lane_WB_BHSD<0b1, 0b0, "VPair", "ld2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm LD3LN_WB : LD_Lane_WB_BHSD<0b0, 0b1, "VTriple", "ld3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm LD4LN_WB : LD_Lane_WB_BHSD<0b1, 0b1, "VQuad", "ld4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} let mayStore = 1, neverHasSideEffects = 1, hasExtraDefRegAllocReq = 1, Constraints = "$Rn = $wb", @@ -4353,17 +4451,21 @@ } // Post-index store single 1-element structure from one lane of 1 register. +// one element from 1 lane should be fine in BE - no swapping of elements. defm ST1LN_WB : ST_Lane_WB_BHSD<0b0, 0b0, "VOne", "st1", uimm_exact1, uimm_exact2, uimm_exact4, uimm_exact8>; // Post-index store single N-element structure from one lane of N consecutive // registers (N = 2,3,4) -defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, - uimm_exact4, uimm_exact8, uimm_exact16>; -defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, - uimm_exact6, uimm_exact12, uimm_exact24>; -defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, - uimm_exact8, uimm_exact16, uimm_exact32>; +// Multiple elements would be reversed in BE. +let Predicates = [IsLE] in { + defm ST2LN_WB : ST_Lane_WB_BHSD<0b1, 0b0, "VPair", "st2", uimm_exact2, + uimm_exact4, uimm_exact8, uimm_exact16>; + defm ST3LN_WB : ST_Lane_WB_BHSD<0b0, 0b1, "VTriple", "st3", uimm_exact3, + uimm_exact6, uimm_exact12, uimm_exact24>; + defm ST4LN_WB : ST_Lane_WB_BHSD<0b1, 0b1, "VQuad", "st4", uimm_exact4, + uimm_exact8, uimm_exact16, uimm_exact32>; +} // End of post-index load/store single N-element instructions // (class SIMD lsone-post) Index: test/CodeGen/AArch64/128bit_load_store.ll =================================================================== --- test/CodeGen/AArch64/128bit_load_store.ll +++ test/CodeGen/AArch64/128bit_load_store.ll @@ -1,5 +1,118 @@ +; R UN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=neon | FileCheck %s ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=neon | FileCheck %s +define void @test_store_v1i8(<1 x i8>* %ptr, <1 x i8> %val) #0 { +; CHECK: test_store_v1i8 +; CHECK: str {{b[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i8> %val, <1 x i8>* %ptr, align 8 + ret void +} + + + +define void @test_store_f16(half* %ptr, half %val) #0 { +; CHECK: test_store_f16 +; CHECK: str {{h[0-9]+}}, [{{x[0-9]+}}] +entry: + store half %val, half* %ptr, align 8 + ret void +} + +define void @test_store_v1i16(<1 x i16>* %ptr, <1 x i16> %val) #0 { +; CHECK: test_store_v1i16 +; CHECK: str {{h[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i16> %val, <1 x i16>* %ptr, align 8 + ret void +} + + + +define void @test_store_f32(float* %ptr, float %val) #0 { +; CHECK: test_store_f32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store float %val, float* %ptr, align 8 + ret void +} + +define void @test_store_v1f32(<1 x float>* %ptr, <1 x float> %val) #0 { +; CHECK: test_store_v1f32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x float> %val, <1 x float>* %ptr, align 8 + ret void +} + +define void @test_store_v1i32(<1 x i32>* %ptr, <1 x i32> %val) #0 { +; CHECK: test_store_v1i32 +; CHECK: str {{s[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i32> %val, <1 x i32>* %ptr, align 8 + ret void +} + + +define void @test_store_f64(double *%ptr, double %val) #0 { +; CHECK: test_store_f64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store double %val, 
double* %ptr, align 8 + ret void +} + +define void @test_store_v1f64(<1 x double>* %ptr, <1 x double> %val) #0 { +; CHECK: test_store_v1f64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x double> %val, <1 x double>* %ptr, align 8 + ret void +} + +define void @test_store_v2f32(<2 x float>* %ptr, <2 x float> %val) #0 { +; CHECK: test_store_v2f32 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x float> %val, <2 x float>* %ptr, align 8 + ret void +} + +define void @test_store_v1i64(<1 x i64>* %ptr, <1 x i64> %val) #0 { +; CHECK: test_store_v1i64 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <1 x i64> %val, <1 x i64>* %ptr, align 8 + ret void +} + +define void @test_store_v2i32(<2 x i32>* %ptr, <2 x i32> %val) #0 { +; CHECK: test_store_v2i32 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x i32> %val, <2 x i32>* %ptr, align 8 + ret void +} + +define void @test_store_v4i16(<4 x i16>* %ptr, <4 x i16> %val) #0 { +; CHECK: test_store_v4i16 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x i16> %val, <4 x i16>* %ptr, align 8 + ret void +} + +define void @test_store_v8i8(<8 x i8>* %ptr, <8 x i8> %val) #0 { +; CHECK: test_store_v8i8 +; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}] +entry: + store <8 x i8> %val, <8 x i8>* %ptr, align 8 + ret void +} + + + + define void @test_store_f128(fp128* %ptr, fp128 %val) #0 { ; CHECK: test_store_f128 ; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] @@ -8,6 +121,54 @@ ret void } +define void @test_store_v2f64(<2 x double>* %ptr, <2 x double> %val) #0 { +; CHECK: test_store_v2f64 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x double> %val, <2 x double>* %ptr, align 16 + ret void +} + +define void @test_store_v4f32(<4 x float>* %ptr, <4 x float> %val) #0 { +; CHECK: test_store_v4f32 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x float> %val, <4 x float>* %ptr, align 16 + ret void +} + +define void @test_store_v2i64(<2 x i64>* %ptr, <2 x i64> %val) #0 { +; CHECK: test_store_v2i64 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <2 x i64> %val, <2 x i64>* %ptr, align 16 + ret void +} + +define void @test_store_v4i32(<4 x i32>* %ptr, <4 x i32> %val) #0 { +; CHECK: test_store_v4i32 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <4 x i32> %val, <4 x i32>* %ptr, align 16 + ret void +} + +define void @test_store_v8i16(<8 x i16>* %ptr, <8 x i16> %val) #0 { +; CHECK: test_store_v8i16 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <8 x i16> %val, <8 x i16>* %ptr, align 16 + ret void +} + +define void @test_store_v16i8(<16 x i8>* %ptr, <16 x i8> %val) #0 { +; CHECK: test_store_v16i8 +; CHECK: str {{q[0-9]+}}, [{{x[0-9]+}}] +entry: + store <16 x i8> %val, <16 x i8>* %ptr, align 16 + ret void +} + define fp128 @test_load_f128(fp128* readonly %ptr) #2 { ; CHECK: test_load_f128 ; CHECK: ldr {{q[0-9]+}}, [{{x[0-9]+}}] Index: test/CodeGen/AArch64/addsub-shifted.ll =================================================================== --- test/CodeGen/AArch64/addsub-shifted.ll +++ test/CodeGen/AArch64/addsub-shifted.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var32 = global i32 0 Index: test/CodeGen/AArch64/addsub.ll =================================================================== --- test/CodeGen/AArch64/addsub.ll +++ test/CodeGen/AArch64/addsub.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s 
-mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Note that this should be refactored (for efficiency if nothing else) Index: test/CodeGen/AArch64/addsub_ext.ll =================================================================== --- test/CodeGen/AArch64/addsub_ext.ll +++ test/CodeGen/AArch64/addsub_ext.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var8 = global i8 0 Index: test/CodeGen/AArch64/alloca.ll =================================================================== --- test/CodeGen/AArch64/alloca.ll +++ test/CodeGen/AArch64/alloca.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-NOFP %s Index: test/CodeGen/AArch64/analyze-branch.ll =================================================================== --- test/CodeGen/AArch64/analyze-branch.ll +++ test/CodeGen/AArch64/analyze-branch.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; This test checks that LLVM can do basic stripping and reapplying of branches Index: test/CodeGen/AArch64/assertion-rc-mismatch.ll =================================================================== --- test/CodeGen/AArch64/assertion-rc-mismatch.ll +++ test/CodeGen/AArch64/assertion-rc-mismatch.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; Test case related to . 
Index: test/CodeGen/AArch64/atomic-ops-not-barriers.ll =================================================================== --- test/CodeGen/AArch64/atomic-ops-not-barriers.ll +++ test/CodeGen/AArch64/atomic-ops-not-barriers.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s define i32 @foo(i32* %var, i1 %cond) { Index: test/CodeGen/AArch64/atomic-ops.ll =================================================================== --- test/CodeGen/AArch64/atomic-ops.ll +++ test/CodeGen/AArch64/atomic-ops.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-REG %s Index: test/CodeGen/AArch64/basic-pic.ll =================================================================== --- test/CodeGen/AArch64/basic-pic.ll +++ test/CodeGen/AArch64/basic-pic.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -relocation-model=pic %s -o - | FileCheck %s @var = global i32 0 Index: test/CodeGen/AArch64/bitfield-insert-0.ll =================================================================== --- test/CodeGen/AArch64/bitfield-insert-0.ll +++ test/CodeGen/AArch64/bitfield-insert-0.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -disassemble - | FileCheck %s ; The encoding of lsb -> immr in the CGed bitfield instructions was wrong at one Index: test/CodeGen/AArch64/bitfield-insert.ll =================================================================== --- test/CodeGen/AArch64/bitfield-insert.ll +++ test/CodeGen/AArch64/bitfield-insert.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s ; First, a simple example from Clang. 
The registers could plausibly be Index: test/CodeGen/AArch64/bitfield.ll =================================================================== --- test/CodeGen/AArch64/bitfield.ll +++ test/CodeGen/AArch64/bitfield.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s Index: test/CodeGen/AArch64/blockaddress.ll =================================================================== --- test/CodeGen/AArch64/blockaddress.ll +++ test/CodeGen/AArch64/blockaddress.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s +; RUN: llc -code-model=large -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -code-model=large -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck --check-prefix=CHECK-LARGE %s Index: test/CodeGen/AArch64/bool-loads.ll =================================================================== --- test/CodeGen/AArch64/bool-loads.ll +++ test/CodeGen/AArch64/bool-loads.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s @var = global i1 0 Index: test/CodeGen/AArch64/breg.ll =================================================================== --- test/CodeGen/AArch64/breg.ll +++ test/CodeGen/AArch64/breg.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @stored_label = global i8* null Index: test/CodeGen/AArch64/callee-save.ll =================================================================== --- test/CodeGen/AArch64/callee-save.ll +++ test/CodeGen/AArch64/callee-save.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var = global float 0.0 Index: test/CodeGen/AArch64/code-model-large-abs.ll =================================================================== --- test/CodeGen/AArch64/code-model-large-abs.ll +++ test/CodeGen/AArch64/code-model-large-abs.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -code-model=large < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large < %s | FileCheck %s @var8 = global i8 0 Index: test/CodeGen/AArch64/compare-branch.ll =================================================================== --- test/CodeGen/AArch64/compare-branch.ll +++ test/CodeGen/AArch64/compare-branch.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var32 = global i32 0 Index: test/CodeGen/AArch64/complex-copy-noneon.ll =================================================================== --- test/CodeGen/AArch64/complex-copy-noneon.ll +++ test/CodeGen/AArch64/complex-copy-noneon.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=-neon < %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=-neon < %s ; The DAG combiner decided to use a vector load/store for this struct copy Index: test/CodeGen/AArch64/concatvector-v8i8-bug.ll 
=================================================================== --- test/CodeGen/AArch64/concatvector-v8i8-bug.ll +++ test/CodeGen/AArch64/concatvector-v8i8-bug.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon ; Bug: i8 type in FRP8 register but not registering with register class causes segmentation fault. ; Fix: Removed i8 type from FPR8 register class. Index: test/CodeGen/AArch64/cond-sel.ll =================================================================== --- test/CodeGen/AArch64/cond-sel.ll +++ test/CodeGen/AArch64/cond-sel.ll @@ -1,3 +1,5 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s Index: test/CodeGen/AArch64/cpus.ll =================================================================== --- test/CodeGen/AArch64/cpus.ll +++ test/CodeGen/AArch64/cpus.ll @@ -1,3 +1,7 @@ +; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown -mcpu=cortex-a53 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown -mcpu=cortex-a57 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64_be-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; This tests that llc accepts all valid AArch64 CPUs ; RUN: llc < %s -mtriple=aarch64-unknown-unknown -mcpu=generic 2>&1 | FileCheck %s Index: test/CodeGen/AArch64/directcond.ll =================================================================== --- test/CodeGen/AArch64/directcond.ll +++ test/CodeGen/AArch64/directcond.ll @@ -1,3 +1,5 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s Index: test/CodeGen/AArch64/dp-3source.ll =================================================================== --- test/CodeGen/AArch64/dp-3source.ll +++ test/CodeGen/AArch64/dp-3source.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i32 @test_madd32(i32 %val0, i32 %val1, i32 %val2) { Index: test/CodeGen/AArch64/dp1.ll =================================================================== --- test/CodeGen/AArch64/dp1.ll +++ test/CodeGen/AArch64/dp1.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var32 = global i32 0 Index: test/CodeGen/AArch64/dp2.ll =================================================================== --- test/CodeGen/AArch64/dp2.ll +++ test/CodeGen/AArch64/dp2.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | 
FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @var32_0 = global i32 0 Index: test/CodeGen/AArch64/extern-weak.ll =================================================================== --- test/CodeGen/AArch64/extern-weak.ll +++ test/CodeGen/AArch64/extern-weak.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -o - < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -o - < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -code-model=large -o - < %s | FileCheck --check-prefix=CHECK-LARGE %s Index: test/CodeGen/AArch64/extract.ll =================================================================== --- test/CodeGen/AArch64/extract.ll +++ test/CodeGen/AArch64/extract.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i64 @ror_i64(i64 %in) { Index: test/CodeGen/AArch64/fastcc-reserved.ll =================================================================== --- test/CodeGen/AArch64/fastcc-reserved.ll +++ test/CodeGen/AArch64/fastcc-reserved.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -tailcallopt | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s ; This test is designed to be run in the situation where the Index: test/CodeGen/AArch64/fastcc.ll =================================================================== --- test/CodeGen/AArch64/fastcc.ll +++ test/CodeGen/AArch64/fastcc.ll @@ -1,3 +1,5 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s -check-prefix CHECK-TAIL ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s Index: test/CodeGen/AArch64/fcmp.ll =================================================================== --- test/CodeGen/AArch64/fcmp.ll +++ test/CodeGen/AArch64/fcmp.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s declare void @bar(i32) Index: test/CodeGen/AArch64/fcvt-fixed.ll =================================================================== --- test/CodeGen/AArch64/fcvt-fixed.ll +++ test/CodeGen/AArch64/fcvt-fixed.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -O0 | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s @var32 = global i32 0 Index: test/CodeGen/AArch64/fcvt-int.ll =================================================================== --- test/CodeGen/AArch64/fcvt-int.ll +++ test/CodeGen/AArch64/fcvt-int.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i32 @test_floattoi32(float %in) { Index: test/CodeGen/AArch64/flags-multiuse.ll =================================================================== --- 
test/CodeGen/AArch64/flags-multiuse.ll +++ test/CodeGen/AArch64/flags-multiuse.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; LLVM should be able to cope with multiple uses of the same flag-setting Index: test/CodeGen/AArch64/floatdp_1source.ll =================================================================== --- test/CodeGen/AArch64/floatdp_1source.ll +++ test/CodeGen/AArch64/floatdp_1source.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @varhalf = global half 0.0 Index: test/CodeGen/AArch64/floatdp_2source.ll =================================================================== --- test/CodeGen/AArch64/floatdp_2source.ll +++ test/CodeGen/AArch64/floatdp_2source.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @varfloat = global float 0.0 Index: test/CodeGen/AArch64/fp-cond-sel.ll =================================================================== --- test/CodeGen/AArch64/fp-cond-sel.ll +++ test/CodeGen/AArch64/fp-cond-sel.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @varfloat = global float 0.0 Index: test/CodeGen/AArch64/fp-dp3.ll =================================================================== --- test/CodeGen/AArch64/fp-dp3.ll +++ test/CodeGen/AArch64/fp-dp3.ll @@ -1,3 +1,5 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -fp-contract=fast | FileCheck %s +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -fp-contract=fast | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s -check-prefix=CHECK-NOFAST Index: test/CodeGen/AArch64/fp128-folding.ll =================================================================== --- test/CodeGen/AArch64/fp128-folding.ll +++ test/CodeGen/AArch64/fp128-folding.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s declare void @bar(i8*, i8*, i32*) Index: test/CodeGen/AArch64/fp128.ll =================================================================== --- test/CodeGen/AArch64/fp128.ll +++ test/CodeGen/AArch64/fp128.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s @lhs = global fp128 zeroinitializer Index: test/CodeGen/AArch64/fpimm.ll =================================================================== --- test/CodeGen/AArch64/fpimm.ll +++ test/CodeGen/AArch64/fpimm.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s @varf32 = global float 0.0 Index: test/CodeGen/AArch64/frameaddr.ll =================================================================== --- 
test/CodeGen/AArch64/frameaddr.ll +++ test/CodeGen/AArch64/frameaddr.ll @@ -1,3 +1,4 @@ +; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s define i8* @t() nounwind { Index: test/CodeGen/AArch64/func-argpassing.ll =================================================================== --- test/CodeGen/AArch64/func-argpassing.ll +++ test/CodeGen/AArch64/func-argpassing.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s Index: test/CodeGen/AArch64/func-calls.ll =================================================================== --- test/CodeGen/AArch64/func-calls.ll +++ test/CodeGen/AArch64/func-calls.ll @@ -1,3 +1,4 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s Index: test/CodeGen/AArch64/global-alignment.ll =================================================================== --- test/CodeGen/AArch64/global-alignment.ll +++ test/CodeGen/AArch64/global-alignment.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s @var32 = global [3 x i32] zeroinitializer Index: test/CodeGen/AArch64/got-abuse.ll =================================================================== --- test/CodeGen/AArch64/got-abuse.ll +++ test/CodeGen/AArch64/got-abuse.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -filetype=obj < %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s Index: test/CodeGen/AArch64/i128-align.ll =================================================================== --- test/CodeGen/AArch64/i128-align.ll +++ test/CodeGen/AArch64/i128-align.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s %struct = type { i32, i128, i8 } Index: test/CodeGen/AArch64/illegal-float-ops.ll =================================================================== --- test/CodeGen/AArch64/illegal-float-ops.ll +++ test/CodeGen/AArch64/illegal-float-ops.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s @varfloat = global float 0.0 Index: test/CodeGen/AArch64/init-array.ll =================================================================== --- test/CodeGen/AArch64/init-array.ll +++ test/CodeGen/AArch64/init-array.ll @@ -1,3 +1,5 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s +; RUN: llc -mtriple=aarch64_be-none-none-eabi 
-verify-machineinstrs -use-init-array < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -use-init-array < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-none-eabi -verify-machineinstrs -use-init-array < %s | FileCheck %s Index: test/CodeGen/AArch64/inline-asm-constraints-badI.ll =================================================================== --- test/CodeGen/AArch64/inline-asm-constraints-badI.ll +++ test/CodeGen/AArch64/inline-asm-constraints-badI.ll @@ -1,3 +1,4 @@ +; RUN: not llc -mtriple=aarch64_be-none-linux-gnu < %s ; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s define void @foo() { Index: test/CodeGen/AArch64/inline-asm-constraints-badK.ll =================================================================== --- test/CodeGen/AArch64/inline-asm-constraints-badK.ll +++ test/CodeGen/AArch64/inline-asm-constraints-badK.ll @@ -1,3 +1,4 @@ +; RUN: not llc -mtriple=aarch64_be-none-linux-gnu < %s ; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s define void @foo() { Index: test/CodeGen/AArch64/inline-asm-constraints-badK2.ll =================================================================== --- test/CodeGen/AArch64/inline-asm-constraints-badK2.ll +++ test/CodeGen/AArch64/inline-asm-constraints-badK2.ll @@ -1,3 +1,4 @@ +; RUN: not llc -mtriple=aarch64_be-none-linux-gnu < %s ; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s define void @foo() { Index: test/CodeGen/AArch64/inline-asm-constraints-badL.ll =================================================================== --- test/CodeGen/AArch64/inline-asm-constraints-badL.ll +++ test/CodeGen/AArch64/inline-asm-constraints-badL.ll @@ -1,3 +1,4 @@ +; RUN: not llc -mtriple=aarch64_be-none-linux-gnu < %s ; RUN: not llc -mtriple=aarch64-none-linux-gnu < %s define void @foo() { Index: test/CodeGen/AArch64/inline-asm-modifiers.ll =================================================================== --- test/CodeGen/AArch64/inline-asm-modifiers.ll +++ test/CodeGen/AArch64/inline-asm-modifiers.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -no-integrated-as < %s | FileCheck %s @var_simple = hidden global i32 0 Index: test/CodeGen/AArch64/jump-table.ll =================================================================== --- test/CodeGen/AArch64/jump-table.ll +++ test/CodeGen/AArch64/jump-table.ll @@ -1,3 +1,5 @@ +; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s +; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s ; RUN: llc -code-model=large -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck --check-prefix=CHECK-LARGE %s Index: test/CodeGen/AArch64/large-consts.ll =================================================================== --- test/CodeGen/AArch64/large-consts.ll +++ test/CodeGen/AArch64/large-consts.ll @@ -1,3 +1,4 @@ +; RUN: llc -mtriple=aarch64_be-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnu -o - %s -code-model=large -show-mc-encoding | FileCheck %s ; Make sure the shift amount is encoded into the instructions by LLVM because Index: test/CodeGen/AArch64/large-frame.ll =================================================================== --- 
+++ test/CodeGen/AArch64/large-frame.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 
 declare void @use_addr(i8*)
Index: test/CodeGen/AArch64/ldst-regoffset.ll
===================================================================
--- test/CodeGen/AArch64/ldst-regoffset.ll
+++ test/CodeGen/AArch64/ldst-regoffset.ll
@@ -1,3 +1,5 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
Index: test/CodeGen/AArch64/ldst-unscaledimm.ll
===================================================================
--- test/CodeGen/AArch64/ldst-unscaledimm.ll
+++ test/CodeGen/AArch64/ldst-unscaledimm.ll
@@ -1,3 +1,5 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
Index: test/CodeGen/AArch64/ldst-unsignedimm.ll
===================================================================
--- test/CodeGen/AArch64/ldst-unsignedimm.ll
+++ test/CodeGen/AArch64/ldst-unsignedimm.ll
@@ -1,3 +1,5 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
 
Index: test/CodeGen/AArch64/literal_pools.ll
===================================================================
--- test/CodeGen/AArch64/literal_pools.ll
+++ test/CodeGen/AArch64/literal_pools.ll
@@ -1,3 +1,7 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -code-model=large -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP-LARGE %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -code-model=large | FileCheck --check-prefix=CHECK-LARGE %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
Index: test/CodeGen/AArch64/local_vars.ll
===================================================================
--- test/CodeGen/AArch64/local_vars.ll
+++ test/CodeGen/AArch64/local_vars.ll
@@ -1,3 +1,5 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -O0 | FileCheck %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 -disable-fp-elim | FileCheck -check-prefix CHECK-WITHFP %s
 
Index: test/CodeGen/AArch64/logical-imm.ll
===================================================================
--- test/CodeGen/AArch64/logical-imm.ll
+++ test/CodeGen/AArch64/logical-imm.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 @var32 = global i32 0
Index: test/CodeGen/AArch64/logical_shifted_reg.ll
===================================================================
--- test/CodeGen/AArch64/logical_shifted_reg.ll
+++ test/CodeGen/AArch64/logical_shifted_reg.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -O0 | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0 | FileCheck %s
 
 @var1_32 = global i32 0
Index: test/CodeGen/AArch64/mature-mc-support.ll
===================================================================
--- test/CodeGen/AArch64/mature-mc-support.ll
+++ test/CodeGen/AArch64/mature-mc-support.ll
@@ -1,3 +1,5 @@
+; RUN: not llc -mtriple=aarch64_be-pc-linux < %s > /dev/null 2> %t1
+; RUN: not llc -mtriple=aarch64_be-pc-linux -filetype=obj < %s > /dev/null 2> %t2
 ; Test that inline assembly is parsed by the MC layer when MC support is mature
 ; (even when the output is assembly).
 
Index: test/CodeGen/AArch64/movw-consts.ll
===================================================================
--- test/CodeGen/AArch64/movw-consts.ll
+++ test/CodeGen/AArch64/movw-consts.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs -O0 < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 define i64 @test0() {
Index: test/CodeGen/AArch64/movw-shift-encoding.ll
===================================================================
--- test/CodeGen/AArch64/movw-shift-encoding.ll
+++ test/CodeGen/AArch64/movw-shift-encoding.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s
 ; RUN: llc -mtriple=aarch64-linux-gnu < %s -show-mc-encoding -code-model=large | FileCheck %s
 
 @var = global i32 0
Index: test/CodeGen/AArch64/neon-2velem-high.ll
===================================================================
--- test/CodeGen/AArch64/neon-2velem-high.ll
+++ test/CodeGen/AArch64/neon-2velem-high.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 declare <2 x float> @llvm.fma.v2f32(<2 x float>, <2 x float>, <2 x float>)
Index: test/CodeGen/AArch64/neon-2velem.ll
===================================================================
--- test/CodeGen/AArch64/neon-2velem.ll
+++ test/CodeGen/AArch64/neon-2velem.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 declare <2 x double> @llvm.aarch64.neon.vmulx.v2f64(<2 x double>, <2 x double>)
Index: test/CodeGen/AArch64/neon-3vdiff.ll
===================================================================
--- test/CodeGen/AArch64/neon-3vdiff.ll
+++ test/CodeGen/AArch64/neon-3vdiff.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare <8 x i16> @llvm.arm.neon.vmullp.v8i16(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-aba-abd.ll
===================================================================
--- test/CodeGen/AArch64/neon-aba-abd.ll
+++ test/CodeGen/AArch64/neon-aba-abd.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vabdu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-across.ll
===================================================================
--- test/CodeGen/AArch64/neon-across.ll
+++ test/CodeGen/AArch64/neon-across.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare float @llvm.aarch64.neon.vminnmv(<4 x float>)
Index: test/CodeGen/AArch64/neon-add-pairwise.ll
===================================================================
--- test/CodeGen/AArch64/neon-add-pairwise.ll
+++ test/CodeGen/AArch64/neon-add-pairwise.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vpadd.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-add-sub.ll
===================================================================
--- test/CodeGen/AArch64/neon-add-sub.ll
+++ test/CodeGen/AArch64/neon-add-sub.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @add8xi8(<8 x i8> %A, <8 x i8> %B) {
Index: test/CodeGen/AArch64/neon-bitcast.ll
===================================================================
--- test/CodeGen/AArch64/neon-bitcast.ll
+++ test/CodeGen/AArch64/neon-bitcast.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon -verify-machineinstrs < %s | FileCheck %s
 
 ; From <8 x i8>
Index: test/CodeGen/AArch64/neon-bitwise-instructions.ll
===================================================================
--- test/CodeGen/AArch64/neon-bitwise-instructions.ll
+++ test/CodeGen/AArch64/neon-bitwise-instructions.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @and8xi8(<8 x i8> %a, <8 x i8> %b) {
Index: test/CodeGen/AArch64/neon-bsl.ll
===================================================================
--- test/CodeGen/AArch64/neon-bsl.ll
+++ test/CodeGen/AArch64/neon-bsl.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare <2 x double> @llvm.arm.neon.vbsl.v2f64(<2 x double>, <2 x double>, <2 x double>)
Index: test/CodeGen/AArch64/neon-compare-instructions.ll
===================================================================
--- test/CodeGen/AArch64/neon-compare-instructions.ll
+++ test/CodeGen/AArch64/neon-compare-instructions.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define <8 x i8> @cmeq8xi8(<8 x i8> %A, <8 x i8> %B) {
Index: test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
===================================================================
--- test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
+++ test/CodeGen/AArch64/neon-copyPhysReg-tuple.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <4 x i32> @copyTuple.QPair(i8* %a, i8* %b) {
Index: test/CodeGen/AArch64/neon-crypto.ll
===================================================================
--- test/CodeGen/AArch64/neon-crypto.ll
+++ test/CodeGen/AArch64/neon-crypto.ll
@@ -1,3 +1,5 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
+; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -mattr=+crypto | FileCheck %s
 ; RUN: not llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon 2>&1 | FileCheck --check-prefix=CHECK-NO-CRYPTO %s
 
Index: test/CodeGen/AArch64/neon-diagnostics.ll
===================================================================
--- test/CodeGen/AArch64/neon-diagnostics.ll
+++ test/CodeGen/AArch64/neon-diagnostics.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <2 x float> @test_vfma_lane_f32(<2 x float> %a, <2 x float> %b, <2 x float> %v) {
Index: test/CodeGen/AArch64/neon-extract.ll
===================================================================
--- test/CodeGen/AArch64/neon-extract.ll
+++ test/CodeGen/AArch64/neon-extract.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @test_vext_s8(<8 x i8> %a, <8 x i8> %b) {
Index: test/CodeGen/AArch64/neon-facge-facgt.ll
===================================================================
--- test/CodeGen/AArch64/neon-facge-facgt.ll
+++ test/CodeGen/AArch64/neon-facge-facgt.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <2 x i32> @llvm.arm.neon.vacge.v2i32.v2f32(<2 x float>, <2 x float>)
Index: test/CodeGen/AArch64/neon-fma.ll
===================================================================
--- test/CodeGen/AArch64/neon-fma.ll
+++ test/CodeGen/AArch64/neon-fma.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <2 x float> @fmla2xfloat(<2 x float> %A, <2 x float> %B, <2 x float> %C) {
Index: test/CodeGen/AArch64/neon-fpround_f128.ll
===================================================================
--- test/CodeGen/AArch64/neon-fpround_f128.ll
+++ test/CodeGen/AArch64/neon-fpround_f128.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <1 x double> @test_fpround_v1f128(<1 x fp128>* %a) {
Index: test/CodeGen/AArch64/neon-frsqrt-frecp.ll
===================================================================
--- test/CodeGen/AArch64/neon-frsqrt-frecp.ll
+++ test/CodeGen/AArch64/neon-frsqrt-frecp.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; Set of tests for when the intrinsic is used.
Index: test/CodeGen/AArch64/neon-halving-add-sub.ll
===================================================================
--- test/CodeGen/AArch64/neon-halving-add-sub.ll
+++ test/CodeGen/AArch64/neon-halving-add-sub.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vhaddu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-load-store-v1i32.ll
===================================================================
--- test/CodeGen/AArch64/neon-load-store-v1i32.ll
+++ test/CodeGen/AArch64/neon-load-store-v1i32.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; Test load/store of v1i8, v1i16, v1i32 types can be selected correctly
Index: test/CodeGen/AArch64/neon-max-min-pairwise.ll
===================================================================
--- test/CodeGen/AArch64/neon-max-min-pairwise.ll
+++ test/CodeGen/AArch64/neon-max-min-pairwise.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vpmaxs.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-max-min.ll
===================================================================
--- test/CodeGen/AArch64/neon-max-min.ll
+++ test/CodeGen/AArch64/neon-max-min.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vmaxs.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-misc.ll
===================================================================
--- test/CodeGen/AArch64/neon-misc.ll
+++ test/CodeGen/AArch64/neon-misc.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
Index: test/CodeGen/AArch64/neon-mla-mls.ll
===================================================================
--- test/CodeGen/AArch64/neon-mla-mls.ll
+++ test/CodeGen/AArch64/neon-mla-mls.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
Index: test/CodeGen/AArch64/neon-mov.ll
===================================================================
--- test/CodeGen/AArch64/neon-mov.ll
+++ test/CodeGen/AArch64/neon-mov.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @movi8b() {
Index: test/CodeGen/AArch64/neon-mul-div.ll
===================================================================
--- test/CodeGen/AArch64/neon-mul-div.ll
+++ test/CodeGen/AArch64/neon-mul-div.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
Index: test/CodeGen/AArch64/neon-or-combine.ll
===================================================================
--- test/CodeGen/AArch64/neon-or-combine.ll
+++ test/CodeGen/AArch64/neon-or-combine.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; Check that the DAGCombiner does not crash with an assertion failure
Index: test/CodeGen/AArch64/neon-perm.ll
===================================================================
--- test/CodeGen/AArch64/neon-perm.ll
+++ test/CodeGen/AArch64/neon-perm.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
Index: test/CodeGen/AArch64/neon-rounding-halving-add.ll
===================================================================
--- test/CodeGen/AArch64/neon-rounding-halving-add.ll
+++ test/CodeGen/AArch64/neon-rounding-halving-add.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vrhaddu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-rounding-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-rounding-shift.ll
+++ test/CodeGen/AArch64/neon-rounding-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vrshiftu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-saturating-add-sub.ll
===================================================================
--- test/CodeGen/AArch64/neon-saturating-add-sub.ll
+++ test/CodeGen/AArch64/neon-saturating-add-sub.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vqaddu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
+++ test/CodeGen/AArch64/neon-saturating-rounding-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vqrshiftu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-saturating-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-saturating-shift.ll
+++ test/CodeGen/AArch64/neon-saturating-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vqshiftu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-scalar-abs.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-abs.ll
+++ test/CodeGen/AArch64/neon-scalar-abs.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define i64 @test_vabsd_s64(i64 %a) {
Index: test/CodeGen/AArch64/neon-scalar-add-sub.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-add-sub.ll
+++ test/CodeGen/AArch64/neon-scalar-add-sub.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <1 x i64> @add1xi64(<1 x i64> %A, <1 x i64> %B) {
Index: test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
+++ test/CodeGen/AArch64/neon-scalar-by-elem-fma.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 declare float @llvm.fma.f32(float, float, float)
Index: test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
+++ test/CodeGen/AArch64/neon-scalar-by-elem-mul.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define float @test_fmul_lane_ss2S(float %a, <2 x float> %v) {
Index: test/CodeGen/AArch64/neon-scalar-compare.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-compare.ll
+++ test/CodeGen/AArch64/neon-scalar-compare.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 ;; Scalar Integer Compare
Index: test/CodeGen/AArch64/neon-scalar-copy.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-copy.ll
+++ test/CodeGen/AArch64/neon-scalar-copy.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define float @test_dup_sv2S(<2 x float> %v) {
Index: test/CodeGen/AArch64/neon-scalar-cvt.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-cvt.ll
+++ test/CodeGen/AArch64/neon-scalar-cvt.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define float @test_vcvts_f32_s32(i32 %a) {
Index: test/CodeGen/AArch64/neon-scalar-ext.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-ext.ll
+++ test/CodeGen/AArch64/neon-scalar-ext.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define <1 x i64> @test_zext_v1i32_v1i64(<2 x i32> %v) nounwind readnone {
Index: test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
+++ test/CodeGen/AArch64/neon-scalar-extract-narrow.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define i8 @test_vqmovunh_s16(i16 %a) {
Index: test/CodeGen/AArch64/neon-scalar-fabd.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-fabd.ll
+++ test/CodeGen/AArch64/neon-scalar-fabd.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define float @test_vabds_f32(float %a, float %b) {
Index: test/CodeGen/AArch64/neon-scalar-fcvt.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-fcvt.ll
+++ test/CodeGen/AArch64/neon-scalar-fcvt.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 ;; Scalar Floating-point Convert
Index: test/CodeGen/AArch64/neon-scalar-fp-compare.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-fp-compare.ll
+++ test/CodeGen/AArch64/neon-scalar-fp-compare.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 ;; Scalar Floating-point Compare
Index: test/CodeGen/AArch64/neon-scalar-mul.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-mul.ll
+++ test/CodeGen/AArch64/neon-scalar-mul.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define i16 @test_vqdmulhh_s16(i16 %a, i16 %b) {
Index: test/CodeGen/AArch64/neon-scalar-neg.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-neg.ll
+++ test/CodeGen/AArch64/neon-scalar-neg.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define i64 @test_vnegd_s64(i64 %a) {
Index: test/CodeGen/AArch64/neon-scalar-recip.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-recip.ll
+++ test/CodeGen/AArch64/neon-scalar-recip.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 define float @test_vrecpss_f32(float %a, float %b) {
Index: test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
+++ test/CodeGen/AArch64/neon-scalar-reduce-pairwise.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <1 x i64> @llvm.aarch64.neon.vpadd(<2 x i64>)
Index: test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
+++ test/CodeGen/AArch64/neon-scalar-rounding-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
Index: test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
+++ test/CodeGen/AArch64/neon-scalar-saturating-add-sub.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <1 x i8> @llvm.arm.neon.vqaddu.v1i8(<1 x i8>, <1 x i8>)
Index: test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
+++ test/CodeGen/AArch64/neon-scalar-saturating-rounding-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <1 x i64> @llvm.arm.neon.vqrshiftu.v1i64(<1 x i64>, <1 x i64>)
Index: test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
+++ test/CodeGen/AArch64/neon-scalar-saturating-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 declare <1 x i64> @llvm.arm.neon.vqshiftu.v1i64(<1 x i64>, <1 x i64>)
Index: test/CodeGen/AArch64/neon-scalar-shift-imm.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-shift-imm.ll
+++ test/CodeGen/AArch64/neon-scalar-shift-imm.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define i64 @test_vshrd_n_s64(i64 %a) {
Index: test/CodeGen/AArch64/neon-scalar-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-scalar-shift.ll
+++ test/CodeGen/AArch64/neon-scalar-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare <1 x i64> @llvm.arm.neon.vshiftu.v1i64(<1 x i64>, <1 x i64>)
Index: test/CodeGen/AArch64/neon-select_cc.ll
===================================================================
--- test/CodeGen/AArch64/neon-select_cc.ll
+++ test/CodeGen/AArch64/neon-select_cc.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 define <8x i8> @test_select_cc_v8i8_i8(i8 %a, i8 %b, <8x i8> %c, <8x i8> %d ) {
Index: test/CodeGen/AArch64/neon-shift-left-long.ll
===================================================================
--- test/CodeGen/AArch64/neon-shift-left-long.ll
+++ test/CodeGen/AArch64/neon-shift-left-long.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i16> @test_sshll_v8i8(<8 x i8> %a) {
Index: test/CodeGen/AArch64/neon-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-shift.ll
+++ test/CodeGen/AArch64/neon-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare <8 x i8> @llvm.arm.neon.vshiftu.v8i8(<8 x i8>, <8 x i8>)
Index: test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
===================================================================
--- test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
+++ test/CodeGen/AArch64/neon-shl-ashr-lshr.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @shl.v8i8(<8 x i8> %a, <8 x i8> %b) {
Index: test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
+++ test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define void @test_ldst1_v16i8(<16 x i8>* %ptr, <16 x i8>* %ptr2) {
@@ -2,4 +3,4 @@
 ; CHECK-LABEL: test_ldst1_v16i8:
-; CHECK: ld1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.16b}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <16 x i8>* %ptr
@@ -11,8 +12,8 @@
 define void @test_ldst1_v8i16(<8 x i16>* %ptr, <8 x i16>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v8i16:
-; CHECK: ld1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8h}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <8 x i16>* %ptr
 store <8 x i16> %tmp, <8 x i16>* %ptr2
 ret void
@@ -20,8 +21,8 @@
 define void @test_ldst1_v4i32(<4 x i32>* %ptr, <4 x i32>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v4i32:
-; CHECK: ld1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4s}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <4 x i32>* %ptr
 store <4 x i32> %tmp, <4 x i32>* %ptr2
 ret void
@@ -29,8 +30,8 @@
 define void @test_ldst1_v2i64(<2 x i64>* %ptr, <2 x i64>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v2i64:
-; CHECK: ld1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2d}, [x{{[0-9]+|sp}}]
+; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str q{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <2 x i64>* %ptr
 store <2 x i64> %tmp, <2 x i64>* %ptr2
 ret void
@@ -38,8 +39,8 @@
 define void @test_ldst1_v8i8(<8 x i8>* %ptr, <8 x i8>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v8i8:
-; CHECK: ld1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <8 x i8>* %ptr
 store <8 x i8> %tmp, <8 x i8>* %ptr2
 ret void
@@ -47,8 +48,8 @@
 define void @test_ldst1_v4i16(<4 x i16>* %ptr, <4 x i16>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v4i16:
-; CHECK: ld1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <4 x i16>* %ptr
 store <4 x i16> %tmp, <4 x i16>* %ptr2
 ret void
@@ -56,8 +57,8 @@
 define void @test_ldst1_v2i32(<2 x i32>* %ptr, <2 x i32>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v2i32:
-; CHECK: ld1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <2 x i32>* %ptr
 store <2 x i32> %tmp, <2 x i32>* %ptr2
 ret void
@@ -65,8 +66,8 @@
 define void @test_ldst1_v1i64(<1 x i64>* %ptr, <1 x i64>* %ptr2) {
 ; CHECK-LABEL: test_ldst1_v1i64:
-; CHECK: ld1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
-; CHECK: st1 {v{{[0-9]+}}.1d}, [x{{[0-9]+|sp}}]
+; CHECK: ldr d{{[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %tmp = load <1 x i64>* %ptr
 store <1 x i64> %tmp, <1 x i64>* %ptr2
 ret void
Index: test/CodeGen/AArch64/neon-simd-ldst.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-ldst.ll
+++ test/CodeGen/AArch64/neon-simd-ldst.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -O2 -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define void @test_ldstq_4v(i8* noalias %io, i32 %count) {
Index: test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
+++ test/CodeGen/AArch64/neon-simd-post-ldst-multi-elem.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ;Check for a post-increment updating load.
Index: test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
+++ test/CodeGen/AArch64/neon-simd-post-ldst-one.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define { [2 x <16 x i8>] } @test_vld2q_dup_fx_update(i8* %a, i8** %ptr) {
Index: test/CodeGen/AArch64/neon-simd-shift.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-shift.ll
+++ test/CodeGen/AArch64/neon-simd-shift.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @test_vshr_n_s8(<8 x i8> %a) {
Index: test/CodeGen/AArch64/neon-simd-tbl.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-tbl.ll
+++ test/CodeGen/AArch64/neon-simd-tbl.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 declare <16 x i8> @llvm.aarch64.neon.vtbx4.v16i8(<16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>)
Index: test/CodeGen/AArch64/neon-simd-vget.ll
===================================================================
--- test/CodeGen/AArch64/neon-simd-vget.ll
+++ test/CodeGen/AArch64/neon-simd-vget.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 define <8 x i8> @test_vget_high_s8(<16 x i8> %a) {
Index: test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
===================================================================
--- test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
+++ test/CodeGen/AArch64/neon-spill-fpr8-fpr16.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon < %s | FileCheck %s
 
 ; This file tests the spill of FPR8/FPR16. The volatile loads/stores force the
Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll
===================================================================
--- test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; A vector TruncStore can not be selected.
@@ -5,7 +6,7 @@
 define void @truncStore.v2i64(<2 x i64> %a, <2 x i32>* %result) {
 ; CHECK-LABEL: truncStore.v2i64:
 ; CHECK: xtn v{{[0-9]+}}.2s, v{{[0-9]+}}.2d
-; CHECK: st1 {v{{[0-9]+}}.2s}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %b = trunc <2 x i64> %a to <2 x i32>
 store <2 x i32> %b, <2 x i32>* %result
 ret void
@@ -14,7 +15,7 @@
 define void @truncStore.v4i32(<4 x i32> %a, <4 x i16>* %result) {
 ; CHECK-LABEL: truncStore.v4i32:
 ; CHECK: xtn v{{[0-9]+}}.4h, v{{[0-9]+}}.4s
-; CHECK: st1 {v{{[0-9]+}}.4h}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %b = trunc <4 x i32> %a to <4 x i16>
 store <4 x i16> %b, <4 x i16>* %result
 ret void
@@ -23,7 +24,7 @@
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
 ; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: st1 {v{{[0-9]+}}.8b}, [x{{[0-9]+|sp}}]
+; CHECK: str d{{[0-9]+}}, [x{{[0-9]+|sp}}]
 %b = trunc <8 x i16> %a to <8 x i8>
 store <8 x i8> %b, <8 x i8>* %result
 ret void
@@ -54,4 +55,4 @@
 %vecext = extractelement <4 x i8> %a, i32 0
 %conv = zext i8 %vecext to i32
 ret i32 %conv
-}
\ No newline at end of file
+}
Index: test/CodeGen/AArch64/neon-v1i1-setcc.ll
===================================================================
--- test/CodeGen/AArch64/neon-v1i1-setcc.ll
+++ test/CodeGen/AArch64/neon-v1i1-setcc.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast | FileCheck %s
 
 ; This file test the DAG node like "v1i1 SETCC v1i64, v1i64". As the v1i1 type
Index: test/CodeGen/AArch64/neon-vector-list-spill.ll
===================================================================
--- test/CodeGen/AArch64/neon-vector-list-spill.ll
+++ test/CodeGen/AArch64/neon-vector-list-spill.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon -fp-contract=fast
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon -fp-contract=fast
 
 ; FIXME: We should not generate ld/st for such register spill/fill, because the
Index: test/CodeGen/AArch64/regress-bitcast-formals.ll
===================================================================
--- test/CodeGen/AArch64/regress-bitcast-formals.ll
+++ test/CodeGen/AArch64/regress-bitcast-formals.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 ; CallingConv.td requires a bitcast for vector arguments. Make sure we're
Index: test/CodeGen/AArch64/regress-f128csel-flags.ll
===================================================================
--- test/CodeGen/AArch64/regress-f128csel-flags.ll
+++ test/CodeGen/AArch64/regress-f128csel-flags.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 ; We used to not mark NZCV as being used in the continuation basic-block
Index: test/CodeGen/AArch64/regress-fp128-livein.ll
===================================================================
--- test/CodeGen/AArch64/regress-fp128-livein.ll
+++ test/CodeGen/AArch64/regress-fp128-livein.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s
 
 ; Regression test for NZCV reg live-in not being added to fp128csel IfTrue BB,
Index: test/CodeGen/AArch64/regress-tail-livereg.ll
===================================================================
--- test/CodeGen/AArch64/regress-tail-livereg.ll
+++ test/CodeGen/AArch64/regress-tail-livereg.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 
 @var = global void()* zeroinitializer
Index: test/CodeGen/AArch64/regress-tblgen-chains.ll
===================================================================
--- test/CodeGen/AArch64/regress-tblgen-chains.ll
+++ test/CodeGen/AArch64/regress-tblgen-chains.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 
 ; When generating DAG selection tables, TableGen used to only flag an
Index: test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
===================================================================
--- test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
+++ test/CodeGen/AArch64/regress-w29-reserved-with-fp.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -disable-fp-elim < %s | FileCheck %s
 
 @var = global i32 0
Index: test/CodeGen/AArch64/regress-wzr-allocatable.ll
===================================================================
--- test/CodeGen/AArch64/regress-wzr-allocatable.ll
+++ test/CodeGen/AArch64/regress-wzr-allocatable.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -O0
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -O0
 
 ; When WZR wasn't marked as reserved, this function tried to allocate
Index: test/CodeGen/AArch64/returnaddr.ll
===================================================================
--- test/CodeGen/AArch64/returnaddr.ll
+++ test/CodeGen/AArch64/returnaddr.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 define i8* @rt0(i32 %x) nounwind readnone {
Index: test/CodeGen/AArch64/setcc-takes-i32.ll
===================================================================
--- test/CodeGen/AArch64/setcc-takes-i32.ll
+++ test/CodeGen/AArch64/setcc-takes-i32.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 
 ; Most important point here is that the promotion of the i1 works
Index: test/CodeGen/AArch64/sext_inreg.ll
===================================================================
--- test/CodeGen/AArch64/sext_inreg.ll
+++ test/CodeGen/AArch64/sext_inreg.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
 ; For formal arguments, we have the following vector type promotion,
Index: test/CodeGen/AArch64/sibling-call.ll
===================================================================
--- test/CodeGen/AArch64/sibling-call.ll
+++ test/CodeGen/AArch64/sibling-call.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 declare void @callee_stack0()
Index: test/CodeGen/AArch64/sincos-expansion.ll
===================================================================
--- test/CodeGen/AArch64/sincos-expansion.ll
+++ test/CodeGen/AArch64/sincos-expansion.ll
@@ -1,3 +1,4 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs < %s | FileCheck %s
 
 define float @test_sincos_f32(float %f) {
Index: test/CodeGen/AArch64/sincospow-vector-expansion.ll
===================================================================
--- test/CodeGen/AArch64/sincospow-vector-expansion.ll
+++ test/CodeGen/AArch64/sincospow-vector-expansion.ll
@@ -1,3 +1,4 @@
+; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=+neon | FileCheck %s
 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
 
Index: test/CodeGen/AArch64/tail-call.ll
===================================================================
--- test/CodeGen/AArch64/tail-call.ll
+++ test/CodeGen/AArch64/tail-call.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -tailcallopt | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -tailcallopt | FileCheck %s
 
 declare fastcc void @callee_stack0()
Index: test/CodeGen/AArch64/tls-dynamic-together.ll
===================================================================
--- test/CodeGen/AArch64/tls-dynamic-together.ll
+++ test/CodeGen/AArch64/tls-dynamic-together.ll
@@ -1,3 +1,4 @@
+; RUN: llc -O0 -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -O0 -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
 
 ; If the .tlsdesccall and blr parts are emitted completely separately (even with
Index: test/CodeGen/AArch64/tls-dynamics.ll
===================================================================
--- test/CodeGen/AArch64/tls-dynamics.ll
+++ test/CodeGen/AArch64/tls-dynamics.ll
@@ -1,3 +1,5 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -verify-machineinstrs < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -relocation-model=pic -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
 
Index: test/CodeGen/AArch64/tls-execs.ll
===================================================================
--- test/CodeGen/AArch64/tls-execs.ll
+++ test/CodeGen/AArch64/tls-execs.ll
@@ -1,3 +1,5 @@
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -verify-machineinstrs -show-mc-encoding < %s | FileCheck %s
 ; RUN: llc -mtriple=aarch64-none-linux-gnu -filetype=obj < %s | llvm-objdump -r - | FileCheck --check-prefix=CHECK-RELOC %s
 
Index: test/CodeGen/AArch64/tst-br.ll
===================================================================
--- test/CodeGen/AArch64/tst-br.ll
+++ test/CodeGen/AArch64/tst-br.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 ; We've got the usual issues with LLVM reordering blocks here. The
Index: test/CodeGen/AArch64/variadic.ll
===================================================================
--- test/CodeGen/AArch64/variadic.ll
+++ test/CodeGen/AArch64/variadic.ll
@@ -1,3 +1,5 @@
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=aarch64_be-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
 ; RUN: llc -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 < %s | FileCheck --check-prefix=CHECK-NOFP %s
 
Index: test/CodeGen/AArch64/zero-reg.ll
===================================================================
--- test/CodeGen/AArch64/zero-reg.ll
+++ test/CodeGen/AArch64/zero-reg.ll
@@ -1,3 +1,4 @@
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck %s
 ; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s
 
 @var32 = global i32 0