Index: include/llvm/ADT/Triple.h =================================================================== --- include/llvm/ADT/Triple.h +++ include/llvm/ADT/Triple.h @@ -86,6 +86,7 @@ enum SubArchType { NoSubArch, + ARMSubArch_v8_1a, ARMSubArch_v8, ARMSubArch_v7, ARMSubArch_v7em, Index: include/llvm/Support/ARMBuildAttributes.h =================================================================== --- include/llvm/Support/ARMBuildAttributes.h +++ include/llvm/Support/ARMBuildAttributes.h @@ -106,7 +106,7 @@ v6_M = 11, // e.g. Cortex M1 v6S_M = 12, // v6_M with the System extensions v7E_M = 13, // v7_M with DSP extensions - v8 = 14 // v8, AArch32 + v8 = 14, // v8,v8.1a AArch32 }; enum CPUArchProfile { // (=7), uleb128 @@ -145,6 +145,7 @@ AllowNeon = 1, // SIMDv1 was permitted AllowNeon2 = 2, // SIMDv2 was permitted (Half-precision FP, MAC operations) AllowNeonARMv8 = 3, // ARM v8-A SIMD was permitted + AllowNeonARMv8_1a = 4,// ARM v8.1-A SIMD was permitted (RDMA) // Tag_ABI_PCS_R9_use, (=14), uleb128 R9IsGPR = 0, // R9 used as v6 (just another callee-saved register) Index: lib/Support/Triple.cpp =================================================================== --- lib/Support/Triple.cpp +++ lib/Support/Triple.cpp @@ -281,6 +281,7 @@ .Cases("v7", "v7a", "v7em", "v7l", arch) .Cases("v7m", "v7r", "v7s", arch) .Cases("v8", "v8a", arch) + .Cases("v8.1", "v8.1a", arch) .Default(Triple::UnknownArch); } @@ -403,6 +404,7 @@ SubArchName = SubArchName.substr(0, SubArchName.size() - 2); return StringSwitch(SubArchName) + .EndsWith("v8.1a", Triple::ARMSubArch_v8_1a) .EndsWith("v8", Triple::ARMSubArch_v8) .EndsWith("v8a", Triple::ARMSubArch_v8) .EndsWith("v7", Triple::ARMSubArch_v7) @@ -1109,6 +1111,7 @@ .Cases("v7m", "v7-m", "cortex-m3") .Cases("v7em", "v7e-m", "cortex-m4") .Cases("v8", "v8a", "v8-a", "cortex-a53") + .Cases("v8.1", "v8.1a", "v8.1-a", "generic-armv8.1-a") .Default(nullptr); else result = llvm::StringSwitch(MArch) Index: lib/Target/AArch64/AArch64.td 
=================================================================== --- lib/Target/AArch64/AArch64.td +++ lib/Target/AArch64/AArch64.td @@ -32,6 +32,9 @@ def FeatureCRC : SubtargetFeature<"crc", "HasCRC", "true", "Enable ARMv8 CRC-32 checksum instructions">; +def FeatureV8_1a : SubtargetFeature<"v8.1a", "HasV8_1a", "true", + "Enable ARMv8.1a extensions", [FeatureCRC]>; + /// Cyclone has register move instructions which are "free". def FeatureZCRegMove : SubtargetFeature<"zcm", "HasZeroCycleRegMove", "true", "Has zero-cycle register moves">; @@ -89,6 +92,10 @@ FeatureNEON, FeatureCRC]>; +def : ProcessorModel<"generic-armv8.1-a", NoSchedModel, [FeatureV8_1a, + FeatureNEON, + FeatureCrypto]>; + def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>; def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>; // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. Index: lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- lib/Target/AArch64/AArch64InstrFormats.td +++ lib/Target/AArch64/AArch64InstrFormats.td @@ -3282,6 +3282,10 @@ : BaseLoadStoreExclusive { bits<5> Rt; bits<5> Rn; + let Inst{20-16} = 0b11111; + let Unpredictable{20-16} = 0b11111; + let Inst{14-10} = 0b11111; + let Unpredictable{14-10} = 0b11111; let Inst{9-5} = Rn; let Inst{4-0} = Rt; @@ -5298,6 +5302,27 @@ let Inst{4-0} = Rd; } +let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in +class BaseSIMDThreeScalarTied size, bit R, bits<5> opcode, + dag oops, dag iops, string asm, + list pattern> + : I, + Sched<[WriteV]> { + bits<5> Rd; + bits<5> Rn; + bits<5> Rm; + let Inst{31-30} = 0b01; + let Inst{29} = U; + let Inst{28-24} = 0b11110; + let Inst{23-22} = size; + let Inst{21} = R; + let Inst{20-16} = Rm; + let Inst{15-11} = opcode; + let Inst{10} = 1; + let Inst{9-5} = Rn; + let Inst{4-0} = Rd; +} + multiclass SIMDThreeScalarD opc, string asm, SDPatternOperator OpNode> { def v1i64 : BaseSIMDThreeScalar; } +multiclass 
SIMDThreeScalarHSTied opc, string asm, + SDPatternOperator OpNode = null_frag> { + def v1i32: BaseSIMDThreeScalarTied; + def v1i16: BaseSIMDThreeScalarTied; +} + multiclass SIMDThreeScalarSD opc, string asm, SDPatternOperator OpNode = null_frag> { let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in { @@ -8517,6 +8552,197 @@ } // end of 'let Predicates = [HasNEON]' //---------------------------------------------------------------------------- +// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract +//---------------------------------------------------------------------------- + +let Predicates = [HasNEON, HasV8_1a] in { + +class BaseSIMDThreeSameVectorTiedR0 size, bits<5> opcode, + RegisterOperand regtype, string asm, + string kind, list pattern> + : BaseSIMDThreeSameVectorTied { + let Inst{21}=0; +} +multiclass SIMDThreeSameVectorSQRDMLxHTiedHS opc, string asm, + SDPatternOperator Accum> { + def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h", + [(set (v4i16 V64:$dst), + (Accum (v4i16 V64:$Rd), + (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn), + (v4i16 V64:$Rm)))))]>; + def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h", + [(set (v8i16 V128:$dst), + (Accum (v8i16 V128:$Rd), + (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn), + (v8i16 V128:$Rm)))))]>; + def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s", + [(set (v2i32 V64:$dst), + (Accum (v2i32 V64:$Rd), + (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn), + (v2i32 V64:$Rm)))))]>; + def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn), + (v4i32 V128:$Rm)))))]>; +} + +multiclass SIMDIndexedSQRDMLxHSDTied opc, string asm, + SDPatternOperator Accum> { + def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc, + V64, V64, V128_lo, VectorIndexH, + asm, ".4h", ".4h", ".4h", ".h", + [(set (v4i16 V64:$dst), + (Accum 
(v4i16 V64:$Rd), + (v4i16 (int_aarch64_neon_sqrdmulh + (v4i16 V64:$Rn), + (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + // FIXME: uncomment the following, after backend will support i16 neon type + //def : Pat<(i16 (Accum (i16 FPR16Op:$Rd), + // (i16 (vector_extract (v4i16 + // (int_aarch64_neon_sqdmull (v4i16 V64:$Rn), + // (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + // VectorIndexH:$idx)))), + // (i64 0))))), + // (EXTRACT_SUBREG + // (!cast(NAME # v4i16_indexed) + // (SUBREG_TO_REG (i32 0), FPR16Op:$Rd, ssub), V64:$Rn, + // V128_lo:$Rm, VectorIndexH:$idx), + // ssub)>; + + def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc, + V128, V128, V128_lo, VectorIndexH, + asm, ".8h", ".8h", ".8h", ".h", + [(set (v8i16 V128:$dst), + (Accum (v8i16 V128:$Rd), + (v8i16 (int_aarch64_neon_sqrdmulh + (v8i16 V128:$Rn), + (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm), + VectorIndexH:$idx))))))]> { + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + // FIXME: It should be a "def" here, similar to one above, + // after backend will support i16 neon type + + def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc, + V64, V64, V128, VectorIndexS, + asm, ".2s", ".2s", ".2s", ".s", + [(set (v2i32 V64:$dst), + (Accum (v2i32 V64:$Rd), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. 
+ // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we + // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..))) + def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (insert_subvector + (undef), + (v2i32 (int_aarch64_neon_sqrdmulh + (v2i32 V64:$Rn), + (v2i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i32 0))), + (i64 0))))), + (EXTRACT_SUBREG + (!cast(NAME # v2i32_indexed) + (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V64:$Rn, + V128:$Rm, + VectorIndexS:$idx), + ssub)>; + + def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc, + V128, V128, V128, VectorIndexS, + asm, ".4s", ".4s", ".4s", ".s", + [(set (v4i32 V128:$dst), + (Accum (v4i32 V128:$Rd), + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } + + // FIXME: it would be nice to use the scalar (v1i32) instruction here, but + // an intermediate EXTRACT_SUBREG would be untyped. 
+ def : Pat<(i32 (Accum (i32 FPR32Op:$Rd), + (i32 (vector_extract + (v4i32 (int_aarch64_neon_sqrdmulh + (v4i32 V128:$Rn), + (v4i32 (AArch64duplane32 + (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (i64 0))))), + (EXTRACT_SUBREG + (v4i32 (!cast(NAME # v4i32_indexed) + (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), + FPR32Op:$Rd, + ssub)), + V128:$Rn, + V128:$Rm, + VectorIndexS:$idx)), + ssub)>; + + def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc, + FPR16Op, FPR16Op, V128_lo, + VectorIndexH, asm, ".h", "", "", ".h", + [ + // FIXME: uncomment the following, after backend will support i16 neon type + // (set (i16 FPR16Op:$dst), + // (Accum (i16 FPR16Op:$Rd), + // (i16 (int_aarch64_neon_sqrdmulh + // (i16 FPR16Op:$Rn), + // (i16 (vector_extract (v8i16 V128_lo:$Rm), + // VectorIndexH:$idx)))))) + ]> { + + bits<3> idx; + let Inst{11} = idx{2}; + let Inst{21} = idx{1}; + let Inst{20} = idx{0}; + } + + def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc, + FPR32Op, FPR32Op, V128, VectorIndexS, + asm, ".s", "", "", ".s", + [(set (i32 FPR32Op:$dst), + (Accum (i32 FPR32Op:$Rd), + (i32 (int_aarch64_neon_sqrdmulh + (i32 FPR32Op:$Rn), + (i32 (vector_extract (v4i32 V128:$Rm), + VectorIndexS:$idx))))))]> { + bits<2> idx; + let Inst{11} = idx{1}; + let Inst{21} = idx{0}; + } +} +} // let Predicates = [HasNeon, HasV8_1a] + +//---------------------------------------------------------------------------- // Crypto extensions //---------------------------------------------------------------------------- @@ -8627,3 +8853,141 @@ def : TokenAlias<".S", ".s">; def : TokenAlias<".D", ".d">; def : TokenAlias<".Q", ".q">; + +//---------------------------------------------------------------------------- +// v8.1 atomic instructions extension: +// * CAS +// * CASP +// * LD +// * SWP + +// Instruction encodings: +// +// 31 30|29 24|23|22|21|20 16|15|14 10|9 5|4 0 +// CAS SZ |001000|1 |A |1 |Rs |R |11111 |Rn |Rt +// CASP 0|SZ|001000|0 |A |1 |Rs |R |11111 |Rn |Rt +// LD SZ |111000|A |R 
|1 |Rs |0 |OPC|00|Rn |Rt +// SWP SZ |111000|A |R |1 |Rs |1 |OPC|00|Rn |Rt + +// Instruction syntax: +// +// CAS{}[] , , [] +// CAS{} , , [] +// CASP{} , , , , [] +// CASP{} , , , , [] +// LD{}[] , , [] +// LD{} , , [] +// SWP{}[] , , [] +// SWP{} , , [] + + +let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +class BaseCASEncoding pattern> + : I { + bits<2> Sz; + bit NP; + bit A; + bit R; + bits<5> Rs; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b001000; + let Inst{23} = NP; + let Inst{22} = A; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = R; + let Inst{14-10} = 0b11111; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class BaseCAS + : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), + "cas" # order # size, "\t$Rs, $Rt, [$Rn]", + "$out = $Rt",[]> { + let NP = 1; +} + +let Predicates = [HasV8_1a], mayLoad = 1, mayStore = 1, hasSideEffects = 1 in +class BaseLD + : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "ld" # op # order # size, "\t$Rs, $Rt, [$Rn]","",[]> { + bits<2> Sz; + bit A; + bit R; + bits<5> Rs; + bits<3> opc; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b111000; + let Inst{23} = A; + let Inst{22} = R; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = 0b0; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +multiclass LDOp { + let opc = 0b000 in def _add : BaseLD<"add" , order, size, RC>; + let opc = 0b001 in def _clr : BaseLD<"clr" , order, size, RC>; + let opc = 0b010 in def _eor : BaseLD<"eor" , order, size, RC>; + let opc = 0b011 in def _set : BaseLD<"set" , order, size, RC>; + let opc = 0b100 in def _smax : BaseLD<"smax", order, size, RC>; + let opc = 0b101 in def _smin : BaseLD<"smin", order, size, RC>; + let opc = 0b110 in def _umax : BaseLD<"umax", order, size, RC>; + let opc = 0b111 in def _umin : BaseLD<"umin", order, size, RC>; +} + +// Aliases for LD +let Predicates 
= [HasV8_1a] in +class BaseAliasLD : + InstAlias; + +multiclass BaseAliasLDOp { + def : BaseAliasLD<"st" # asm # "lb", GPR32, WZR, !cast("LD" # _B_release_ # asm)>; + def : BaseAliasLD<"st" # asm # "lh", GPR32, WZR, !cast("LD" # _H_release_ # asm)>; + def : BaseAliasLD<"st" # asm # "l", GPR32, WZR, !cast("LD" # _S_release_ # asm)>; + def : BaseAliasLD<"st" # asm # "l", GPR64, XZR, !cast("LD" # _D_release_ # asm)>; + def : BaseAliasLD<"st" # asm # "b", GPR32, WZR, !cast("LD" # _B_no_order_ # asm)>; + def : BaseAliasLD<"st" # asm # "h", GPR32, WZR, !cast("LD" # _H_no_order_ # asm)>; + def : BaseAliasLD<"st" # asm # "", GPR32, WZR, !cast("LD" # _S_no_order_ # asm)>; + def : BaseAliasLD<"st" # asm # "", GPR64, XZR, !cast("LD" # _D_no_order_ # asm)>; +} + +let Predicates = [HasV8_1a] in +class BaseSWP + : I<(outs RC:$Rt),(ins RC:$Rs, GPR64sp:$Rn), "swp" # order # size, + "\t$Rs, $Rt, [$Rn]","",[]> { + bits<2> Sz; + bit A; + bit R; + bits<5> Rs; + bits<3> opc = 0b000; + bits<5> Rn; + bits<5> Rt; + let Inst{31-30} = Sz; + let Inst{29-24} = 0b111000; + let Inst{23} = A; + let Inst{22} = R; + let Inst{21} = 0b1; + let Inst{20-16} = Rs; + let Inst{15} = 0b1; + let Inst{14-12} = opc; + let Inst{11-10} = 0b00; + let Inst{9-5} = Rn; + let Inst{4-0} = Rt; +} + +class BaseCASP + : BaseCASEncoding<(outs RC:$out),(ins RC:$Rs, RC:$Rt, GPR64sp:$Rn), + "casp" # order # size, "\t$Rs, $Rt, [$Rn]", + "$out = $Rs",[]> { + let NP = 0; +} Index: lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- lib/Target/AArch64/AArch64InstrInfo.td +++ lib/Target/AArch64/AArch64InstrInfo.td @@ -22,6 +22,8 @@ AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasV8_1a : Predicate<"Subtarget->hasV8_1a()">, + AssemblerPredicate<"FeatureV8_1a", "v8.1a">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def 
IsCyclone : Predicate<"Subtarget->isCyclone()">; @@ -725,6 +727,80 @@ def CRC32CWrr : BaseCRC32<0, 0b10, 1, GPR32, int_aarch64_crc32cw, "crc32cw">; def CRC32CXrr : BaseCRC32<1, 0b11, 1, GPR64, int_aarch64_crc32cx, "crc32cx">; +// v8.1 atomic CAS +multiclass CASOrder { + let A = 0b1, R = 0b0 in def _acquire : BaseCAS<"a",size,RC>; + let A = 0b0, R = 0b1 in def _release : BaseCAS<"l",size,RC>; + let A = 0b1, R = 0b1 in def _acquire_and_release : BaseCAS<"al",size,RC>; + let A = 0b0, R = 0b0 in def _no_order : BaseCAS<"",size,RC>; +} + +multiclass CASOrderSize { + let Sz = 0b00 in defm _B : CASOrder<"b",GPR32>; + let Sz = 0b01 in defm _H : CASOrder<"h",GPR32>; + let Sz = 0b10 in defm _S : CASOrder<"",GPR32>; + let Sz = 0b11 in defm _D : CASOrder<"",GPR64>; +} + +defm CAS : CASOrderSize; + +// v8.1 atomic SWP +multiclass SWPOrder { + let A = 0b1, R = 0b0 in def _acquire : BaseSWP<"a",size,RC>; + let A = 0b0, R = 0b1 in def _release : BaseSWP<"l",size,RC>; + let A = 0b1, R = 0b1 in def _acquire_and_release : BaseSWP<"al",size,RC>; + let A = 0b0, R = 0b0 in def _no_order : BaseSWP<"",size,RC>; +} + +multiclass SWPOrderSize { + let Sz = 0b00 in defm _B : SWPOrder<"b",GPR32>; + let Sz = 0b01 in defm _H : SWPOrder<"h",GPR32>; + let Sz = 0b10 in defm _S : SWPOrder<"",GPR32>; + let Sz = 0b11 in defm _D : SWPOrder<"",GPR64>; +} + +defm SWP : SWPOrderSize; + +// v8.1 atomic LD +multiclass LDOpOrder { + let A = 0b1, R = 0b0 in defm _acquire : LDOp<"a", size, RC>; + let A = 0b0, R = 0b1 in defm _release : LDOp<"l", size, RC>; + let A = 0b1, R = 0b1 in defm _acquire_and_release : LDOp<"al", size, RC>; + let A = 0b0, R = 0b0 in defm _no_order : LDOp<"", size, RC>; +} + +multiclass LDOpOrderSize { + let Sz = 0b00 in defm _B : LDOpOrder<"b", GPR32>; + let Sz = 0b01 in defm _H : LDOpOrder<"h", GPR32>; + let Sz = 0b10 in defm _S : LDOpOrder<"", GPR32>; + let Sz = 0b11 in defm _D : LDOpOrder<"", GPR64>; +} + +defm LD : LDOpOrderSize; + +defm : BaseAliasLDOp<"add">; +defm : 
BaseAliasLDOp<"clr">; +defm : BaseAliasLDOp<"eor">; +defm : BaseAliasLDOp<"set">; +defm : BaseAliasLDOp<"smax">; +defm : BaseAliasLDOp<"smin">; +defm : BaseAliasLDOp<"umax">; +defm : BaseAliasLDOp<"umin">; + +// v8.1 atomic CASP +multiclass CASPOrder { + let A = 0b1, R = 0b0 in def _acquire : BaseCASP<"a",size,RC>; + let A = 0b0, R = 0b1 in def _release : BaseCASP<"l",size,RC>; + let A = 0b1, R = 0b1 in def _acquire_and_release : BaseCASP<"al",size,RC>; + let A = 0b0, R = 0b0 in def _no_order : BaseCASP<"",size,RC>; +} + +multiclass CASPOrderSize { + let Sz = 0b00 in defm _S : CASPOrder<"",ConsecutivePairClassOperand32>; + let Sz = 0b01 in defm _D : CASPOrder<"",ConsecutivePairClassOperand64>; +} + +defm CASP : CASPOrderSize; //===----------------------------------------------------------------------===// // Logical instructions. @@ -2312,6 +2388,20 @@ def STXPW : StoreExclusivePair<0b10, 0, 0, 1, 0, GPR32, "stxp">; def STXPX : StoreExclusivePair<0b11, 0, 0, 1, 0, GPR64, "stxp">; +let Predicates = [HasV8_1a] in { + // v8 limited-order region extension load-acquire instructions + def LDLARW : LoadAcquire <0b10, 1, 1, 0, 0, GPR32, "ldlar">; + def LDLARX : LoadAcquire <0b11, 1, 1, 0, 0, GPR64, "ldlar">; + def LDLARB : LoadAcquire <0b00, 1, 1, 0, 0, GPR32, "ldlarb">; + def LDLARH : LoadAcquire <0b01, 1, 1, 0, 0, GPR32, "ldlarh">; + + // v8 limited-order region extension store-release instructions + def STLLRW : StoreRelease <0b10, 1, 0, 0, 0, GPR32, "stllr">; + def STLLRX : StoreRelease <0b11, 1, 0, 0, 0, GPR64, "stllr">; + def STLLRB : StoreRelease <0b00, 1, 0, 0, 0, GPR32, "stllrb">; + def STLLRH : StoreRelease <0b01, 1, 0, 0, 0, GPR32, "stllrh">; +} + //===----------------------------------------------------------------------===// // Scaled floating point to integer conversion instructions. 
//===----------------------------------------------------------------------===// @@ -2769,6 +2859,10 @@ defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>; defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>; +defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh", + int_aarch64_neon_sqsub>; defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>; defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic", @@ -2985,6 +3079,32 @@ defm UQSUB : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>; defm URSHL : SIMDThreeScalarD< 1, 0b01010, "urshl", int_aarch64_neon_urshl>; defm USHL : SIMDThreeScalarD< 1, 0b01000, "ushl", int_aarch64_neon_ushl>; +let Predicates = [HasV8_1a] in { + defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">; + defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">; + // FIXME: uncomment the following, after backend will support i16 neon type + //def : Pat<(i16 (int_aarch64_neon_sqadd + // (i16 FPR16:$Rd), + // (i16 (int_aarch64_neon_sqrdmulh (i16 FPR16:$Rn), + // (i16 FPR16:$Rm))))), + // (SQRDMLAHv1i16 FPR16:$Rd, FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(i32 (int_aarch64_neon_sqadd + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; + // FIXME: uncomment the following, after backend will support i16 neon type + //def : Pat<(i16 (int_aarch64_neon_sqsub + // (i16 FPR16:$Rd), + // (i16 (int_aarch64_neon_sqrdmulh (i16 FPR16:$Rn), + // (i16 FPR16:$Rm))))), + // (SQRDMLSHv1i16 FPR16:$Rd, FPR16:$Rn, FPR16:$Rm)>; + def : Pat<(i32 (int_aarch64_neon_sqsub + (i32 FPR32:$Rd), + (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn), + (i32 FPR32:$Rm))))), + (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>; +} 
def : InstAlias<"cmls $dst, $src1, $src2", (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>; @@ -4315,6 +4435,10 @@ int_aarch64_neon_sqadd>; defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl", int_aarch64_neon_sqsub>; +defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah", + int_aarch64_neon_sqadd>; +defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh", + int_aarch64_neon_sqsub>; defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>; defm UMLAL : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal", TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>; Index: lib/Target/AArch64/AArch64RegisterInfo.td =================================================================== --- lib/Target/AArch64/AArch64RegisterInfo.td +++ lib/Target/AArch64/AArch64RegisterInfo.td @@ -592,3 +592,37 @@ def FPR32Op : RegisterOperand; def FPR64Op : RegisterOperand; def FPR128Op : RegisterOperand; + + +//===----------------------------------------------------------------------===// +// ARMv8.1a atomic CASP register operands + + +let Namespace = "AArch64" in { + def sube32 : SubRegIndex<32>; + def subo32 : SubRegIndex<32>; + def sube64 : SubRegIndex<64>; + def subo64 : SubRegIndex<64>; +} + +def Pair32 : RegisterTuples<[sube32, subo32], [(rotl GPR32, 0), (rotl GPR32, 1)]>; +def Pair64 : RegisterTuples<[sube64, subo64], [(rotl GPR64, 0), (rotl GPR64, 1)]>; + +def Pair32Class : RegisterClass<"AArch64", [untyped], 32, (add Pair32)> {let Size = 64;} +def Pair64Class : RegisterClass<"AArch64", [untyped], 64, (add Pair64)> {let Size = 128;} + + +let RenderMethod = "addRegOperands", ParserMethod="tryParsePair" in { + def Pair32AsmOperandClass : AsmOperandClass { let Name = "Pair32"; } + def Pair64AsmOperandClass : AsmOperandClass { let Name = "Pair64"; } +} + +def ConsecutivePairClassOperand32 : RegisterOperand"> { + let ParserMatchClass = Pair32AsmOperandClass; +} +def ConsecutivePairClassOperand64 : RegisterOperand"> { 
+ let ParserMatchClass = Pair64AsmOperandClass; +} + + +//===----- END: v8.1a atomic CASP register operands -----------------------===// Index: lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- lib/Target/AArch64/AArch64Subtarget.h +++ lib/Target/AArch64/AArch64Subtarget.h @@ -41,6 +41,7 @@ bool HasNEON; bool HasCrypto; bool HasCRC; + bool HasV8_1a; // HasZeroCycleRegMove - Has zero-cycle register mov instructions. bool HasZeroCycleRegMove; @@ -100,6 +101,7 @@ bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasV8_1a() const { return HasV8_1a; } bool isLittleEndian() const { return IsLittle; } Index: lib/Target/AArch64/AArch64Subtarget.cpp =================================================================== --- lib/Target/AArch64/AArch64Subtarget.cpp +++ lib/Target/AArch64/AArch64Subtarget.cpp @@ -48,7 +48,7 @@ const TargetMachine &TM, bool LittleEndian) : AArch64GenSubtargetInfo(TT, CPU, FS), ARMProcFamily(Others), HasFPARMv8(false), HasNEON(false), HasCrypto(false), HasCRC(false), - HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), + HasV8_1a(false), HasZeroCycleRegMove(false), HasZeroCycleZeroing(false), IsLittle(LittleEndian), CPUString(CPU), TargetTriple(TT), FrameLowering(), InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(TM.getDataLayout()), TLInfo(TM, *this) {} Index: lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp =================================================================== --- lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -106,6 +106,7 @@ OperandMatchResultTy tryParseAddSubImm(OperandVector &Operands); OperandMatchResultTy tryParseGPR64sp0Operand(OperandVector &Operands); bool tryParseVectorRegister(OperandVector &Operands); + OperandMatchResultTy tryParsePair(OperandVector &Operands); public: enum AArch64MatchResultTy { @@ -204,6 
+205,8 @@ struct BarrierOp { unsigned Val; // Not the enum since not all values have names. + const char *Data; + unsigned Length; }; struct SysRegOp { @@ -220,6 +223,8 @@ struct PrefetchOp { unsigned Val; + const char *Data; + unsigned Length; }; struct ShiftExtendOp { @@ -347,6 +352,11 @@ return Barrier.Val; } + StringRef getBarrierName() const { + assert(Kind == k_Barrier && "Invalid access!"); + return StringRef(Barrier.Data, Barrier.Length); + } + unsigned getReg() const override { assert(Kind == k_Register && "Invalid access!"); return Reg.RegNum; @@ -382,6 +392,11 @@ return Prefetch.Val; } + StringRef getPrefetchName() const { + assert(Kind == k_Prefetch && "Invalid access!"); + return StringRef(Prefetch.Data, Prefetch.Length); + } + AArch64_AM::ShiftExtendType getShiftExtendType() const { assert(Kind == k_ShiftExtend && "Invalid access!"); return ShiftExtend.Type; @@ -860,7 +875,14 @@ return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::GPR64RegClassID].contains(Reg.RegNum); } - + bool isPair32() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::Pair32ClassRegClassID].contains(Reg.RegNum); + } + bool isPair64() const { + return Kind == k_Register && !Reg.isVector && + AArch64MCRegisterClasses[AArch64::Pair64ClassRegClassID].contains(Reg.RegNum); + } bool isGPR64sp0() const { return Kind == k_Register && !Reg.isVector && AArch64MCRegisterClasses[AArch64::GPR64spRegClassID].contains(Reg.RegNum); @@ -1595,10 +1617,14 @@ return Op; } - static std::unique_ptr CreateBarrier(unsigned Val, SMLoc S, + static std::unique_ptr CreateBarrier(unsigned Val, + StringRef Str, + SMLoc S, MCContext &Ctx) { auto Op = make_unique(k_Barrier, Ctx); Op->Barrier.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -1629,10 +1655,14 @@ return Op; } - static std::unique_ptr CreatePrefetch(unsigned Val, SMLoc S, + static std::unique_ptr 
CreatePrefetch(unsigned Val, + StringRef Str, + SMLoc S, MCContext &Ctx) { auto Op = make_unique(k_Prefetch, Ctx); Op->Prefetch.Val = Val; + Op->Barrier.Data = Str.data(); + Op->Barrier.Length = Str.size(); Op->StartLoc = S; Op->EndLoc = S; return Op; @@ -1660,9 +1690,8 @@ << AArch64_AM::getFPImmFloat(getFPImm()) << ") >"; break; case k_Barrier: { - bool Valid; - StringRef Name = AArch64DB::DBarrierMapper().toString(getBarrier(), Valid); - if (Valid) + StringRef Name = getBarrierName(); + if (!Name.empty()) OS << ""; else OS << ""; @@ -1705,9 +1734,8 @@ OS << "c" << getSysCR(); break; case k_Prefetch: { - bool Valid; - StringRef Name = AArch64PRFM::PRFMMapper().toString(getPrefetch(), Valid); - if (Valid) + StringRef Name = getPrefetchName(); + if (!Name.empty()) OS << ""; else OS << ""; @@ -1950,7 +1978,11 @@ return MatchOperand_ParseFail; } - Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + bool Valid; + auto Mapper = AArch64PRFM::PRFMMapper(STI.getFeatureBits()); + StringRef Name = Mapper.toString(MCE->getValue(), Valid); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Name, + S, getContext())); return MatchOperand_Success; } @@ -1960,14 +1992,16 @@ } bool Valid; - unsigned prfop = AArch64PRFM::PRFMMapper().fromString(Tok.getString(), Valid); + auto Mapper = AArch64PRFM::PRFMMapper(STI.getFeatureBits()); + unsigned prfop = Mapper.fromString(Tok.getString(), Valid); if (!Valid) { TokError("pre-fetch hint expected"); return MatchOperand_ParseFail; } Parser.Lex(); // Eat identifier token. 
- Operands.push_back(AArch64Operand::CreatePrefetch(prfop, S, getContext())); + Operands.push_back(AArch64Operand::CreatePrefetch(prfop, Tok.getString(), + S, getContext())); return MatchOperand_Success; } @@ -2569,8 +2603,11 @@ Error(ExprLoc, "barrier operand out of range"); return MatchOperand_ParseFail; } - Operands.push_back( - AArch64Operand::CreateBarrier(MCE->getValue(), ExprLoc, getContext())); + bool Valid; + auto Mapper = AArch64DB::DBarrierMapper(STI.getFeatureBits()); + StringRef Name = Mapper.toString(MCE->getValue(), Valid); + Operands.push_back( AArch64Operand::CreateBarrier(MCE->getValue(), Name, + ExprLoc, getContext())); return MatchOperand_Success; } @@ -2580,7 +2617,8 @@ } bool Valid; - unsigned Opt = AArch64DB::DBarrierMapper().fromString(Tok.getString(), Valid); + auto Mapper = AArch64DB::DBarrierMapper(STI.getFeatureBits()); + unsigned Opt = Mapper.fromString(Tok.getString(), Valid); if (!Valid) { TokError("invalid barrier option name"); return MatchOperand_ParseFail; @@ -2592,8 +2630,8 @@ return MatchOperand_ParseFail; } - Operands.push_back( - AArch64Operand::CreateBarrier(Opt, getLoc(), getContext())); + Operands.push_back( AArch64Operand::CreateBarrier(Opt, Tok.getString(), + getLoc(), getContext())); Parser.Lex(); // Consume the option return MatchOperand_Success; @@ -2618,8 +2656,8 @@ assert(IsKnown == (MSRReg != -1U) && "register should be -1 if and only if it's unknown"); - uint32_t PStateField = - AArch64PState::PStateMapper().fromString(Tok.getString(), IsKnown); + auto PStateMapper = AArch64PState::PStateMapper(STI.getFeatureBits()); + uint32_t PStateField = PStateMapper.fromString(Tok.getString(), IsKnown); assert(IsKnown == (PStateField != -1U) && "register should be -1 if and only if it's unknown"); @@ -4262,3 +4300,75 @@ return Match_Success; return Match_InvalidOperand; } + + +AArch64AsmParser::OperandMatchResultTy +AArch64AsmParser::tryParsePair(OperandVector &Operands) { + + SMLoc S = getLoc(); + + if 
(getParser().getTok().isNot(AsmToken::Identifier)) { + Error(S, "Expected register"); + return MatchOperand_ParseFail; + } + + int FirstReg = tryParseRegister(); + if (FirstReg ==-1) { + return MatchOperand_ParseFail; + } + + const MCRegisterClass &WRegClass = + AArch64MCRegisterClasses[AArch64::GPR32RegClassID]; + const MCRegisterClass &XRegClass = + AArch64MCRegisterClasses[AArch64::GPR64RegClassID]; + + bool isXReg = XRegClass.contains(FirstReg), + isWReg = WRegClass.contains(FirstReg); + if (!isXReg && !isWReg) { + Error(S, "Expected register"); + return MatchOperand_ParseFail; + } + + const MCRegisterInfo *RI = getContext().getRegisterInfo(); + unsigned FirstEncoding = RI->getEncodingValue(FirstReg); + + if (FirstEncoding & 0x1) { + Error(S, "Expected even register"); + return MatchOperand_ParseFail; + } + + SMLoc M = getLoc(); + if (getParser().getTok().isNot(AsmToken::Comma)) { + Error(M, "Expected Comma"); + return MatchOperand_ParseFail; + } + // Eat the comma + getParser().Lex(); + + SMLoc E = getLoc(); + int SecondReg = tryParseRegister(); + if (SecondReg ==-1) { + return MatchOperand_ParseFail; + } + + if (RI->getEncodingValue(SecondReg) != FirstEncoding + 1 || + (isXReg && !XRegClass.contains(SecondReg)) || + (isWReg && !WRegClass.contains(SecondReg))) { + Error(E,"Expected consecutive registers"); + return MatchOperand_ParseFail; + } + + unsigned Pair = 0; + if (isXReg) { + Pair = AArch64MCRegisterClasses[AArch64::Pair64ClassRegClassID].getRegister( + FirstEncoding); + } else { + Pair = AArch64MCRegisterClasses[AArch64::Pair32ClassRegClassID].getRegister( + FirstEncoding); + } + + Operands.push_back(AArch64Operand::CreateReg(Pair, false, S, getLoc(), + getContext())); + + return MatchOperand_Success; +} Index: lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp =================================================================== --- lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp +++ lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp 
@@ -169,6 +169,10 @@ uint64_t Addr, const void *Decoder); static DecodeStatus DecodeVecShiftL8Imm(llvm::MCInst &Inst, unsigned Imm, uint64_t Addr, const void *Decoder); +static DecodeStatus DecodePair32ClassRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder); +static DecodeStatus DecodePair64ClassRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, const void *Decoder); static bool Check(DecodeStatus &Out, DecodeStatus In) { switch (In) { @@ -1102,6 +1106,12 @@ case AArch64::STLRW: case AArch64::STLRB: case AArch64::STLRH: + case AArch64::STLLRW: + case AArch64::STLLRB: + case AArch64::STLLRH: + case AArch64::LDLARW: + case AArch64::LDLARB: + case AArch64::LDLARH: DecodeGPR32RegisterClass(Inst, Rt, Addr, Decoder); break; case AArch64::STLXRX: @@ -1112,6 +1122,8 @@ case AArch64::LDAXRX: case AArch64::LDXRX: case AArch64::STLRX: + case AArch64::LDLARX: + case AArch64::STLLRX: DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder); break; case AArch64::STLXPW: @@ -1504,7 +1516,10 @@ Inst.addOperand(MCOperand::CreateImm(crm)); bool ValidNamed; - (void)AArch64PState::PStateMapper().toString(pstate_field, ValidNamed); + const AArch64Disassembler *Dis = + static_cast(Decoder); + AArch64PState::PStateMapper Mapper(Dis->getSubtargetInfo().getFeatureBits()); + Mapper.toString(pstate_field, ValidNamed); return ValidNamed ? Success : Fail; } @@ -1532,3 +1547,29 @@ return Success; } + +static DecodeStatus DecodePair32ClassRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + // Register number must be even (see CASP instruction) + if (RegNo & 0x1) + return Fail; + + unsigned Register = AArch64MCRegisterClasses[AArch64::Pair32ClassRegClassID]. 
+ getRegister(RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} + +static DecodeStatus DecodePair64ClassRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Addr, + const void *Decoder) { + // Register number must be even (see CASP instruction) + if (RegNo & 0x1) + return Fail; + + unsigned Register = AArch64MCRegisterClasses[AArch64::Pair64ClassRegClassID]. + getRegister(RegNo); + Inst.addOperand(MCOperand::CreateReg(Register)); + return Success; +} Index: lib/Target/AArch64/Disassembler/LLVMBuild.txt =================================================================== --- lib/Target/AArch64/Disassembler/LLVMBuild.txt +++ lib/Target/AArch64/Disassembler/LLVMBuild.txt @@ -19,5 +19,5 @@ type = Library name = AArch64Disassembler parent = AArch64 -required_libraries = AArch64Info AArch64Utils MC MCDisassembler Support +required_libraries = AArch64Desc AArch64Info AArch64Utils MC MCDisassembler Support add_to_library_groups = AArch64 Index: lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h =================================================================== --- lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h +++ lib/Target/AArch64/InstPrinter/AArch64InstPrinter.h @@ -116,6 +116,8 @@ void printMRSSystemRegister(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printSystemPStateField(const MCInst *MI, unsigned OpNum, raw_ostream &O); void printSIMDType10Operand(const MCInst *MI, unsigned OpNum, raw_ostream &O); + template + void printConsecutivePairClassOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O); }; class AArch64AppleInstPrinter : public AArch64InstPrinter { Index: lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp =================================================================== --- lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp +++ lib/Target/AArch64/InstPrinter/AArch64InstPrinter.cpp @@ -1088,7 +1088,8 @@ raw_ostream &O) { unsigned prfop = MI->getOperand(OpNum).getImm(); bool Valid; - StringRef Name 
= AArch64PRFM::PRFMMapper().toString(prfop, Valid); + auto PRFMMapper = AArch64PRFM::PRFMMapper(getAvailableFeatures()); + StringRef Name = PRFMMapper.toString(prfop, Valid); if (Valid) O << Name; else @@ -1150,6 +1151,19 @@ return Reg; } +template +void AArch64InstPrinter::printConsecutivePairClassOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O) { + static_assert(size == 64 || size == 32,"Template parameter must be either 32 or 64"); + unsigned Reg = MI->getOperand(OpNum).getReg(); + + unsigned sube = (size == 32) ? AArch64::sube32 : AArch64::sube64; + unsigned subo = (size == 32) ? AArch64::subo32 : AArch64::subo64; + + unsigned even = MRI.getSubReg(Reg, sube); + unsigned odd = MRI.getSubReg(Reg, subo); + O << getRegisterName(even) << ", " << getRegisterName(odd); +} + void AArch64InstPrinter::printVectorList(const MCInst *MI, unsigned OpNum, raw_ostream &O, StringRef LayoutSuffix) { @@ -1260,12 +1274,11 @@ unsigned Val = MI->getOperand(OpNo).getImm(); unsigned Opcode = MI->getOpcode(); + AArch64NamedImmMapper Mapper = Opcode == AArch64::ISB ? 
+ static_cast<AArch64NamedImmMapper>(AArch64ISB::ISBMapper(getAvailableFeatures())) : + static_cast<AArch64NamedImmMapper>(AArch64DB::DBarrierMapper(getAvailableFeatures())); bool Valid; - StringRef Name; - if (Opcode == AArch64::ISB) - Name = AArch64ISB::ISBMapper().toString(Val, Valid); - else - Name = AArch64DB::DBarrierMapper().toString(Val, Valid); + StringRef Name = Mapper.toString(Val, Valid); if (Valid) O << Name; else @@ -1276,8 +1289,8 @@ raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val); + auto MRSMapper = AArch64SysReg::MRSMapper(getAvailableFeatures()); + std::string Name = MRSMapper.toString(Val); O << StringRef(Name).upper(); } @@ -1286,8 +1299,8 @@ raw_ostream &O) { unsigned Val = MI->getOperand(OpNo).getImm(); - auto Mapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); - std::string Name = Mapper.toString(Val); + auto MSRMapper = AArch64SysReg::MSRMapper(getAvailableFeatures()); + std::string Name = MSRMapper.toString(Val); O << StringRef(Name).upper(); } @@ -1297,7 +1310,8 @@ unsigned Val = MI->getOperand(OpNo).getImm(); bool Valid; - StringRef Name = AArch64PState::PStateMapper().toString(Val, Valid); + auto PStateMapper = AArch64PState::PStateMapper(getAvailableFeatures()); + StringRef Name = PStateMapper.toString(Val, Valid); if (Valid) O << StringRef(Name.str()).upper(); else Index: lib/Target/AArch64/Utils/AArch64BaseInfo.h =================================================================== --- lib/Target/AArch64/Utils/AArch64BaseInfo.h +++ lib/Target/AArch64/Utils/AArch64BaseInfo.h @@ -280,11 +280,14 @@ struct Mapping { const char *Name; uint32_t Value; + uint64_t SubTargetFeature; }; template - AArch64NamedImmMapper(const Mapping (&Pairs)[N], uint32_t TooBigImm) - : Pairs(&Pairs[0]), NumPairs(N), TooBigImm(TooBigImm) {} + AArch64NamedImmMapper(uint64_t SubTargetFeatureBits, const Mapping (&Mappings)[N], + uint32_t TooBigImm) + : FeatureBits(SubTargetFeatureBits),
Mappings(&Mappings[0]), NumMappings(N), + TooBigImm(TooBigImm) {} StringRef toString(uint32_t Value, bool &Valid) const; uint32_t fromString(StringRef Name, bool &Valid) const; @@ -294,9 +297,13 @@ /// N being 0 indicates no immediate syntax-form is allowed. bool validImm(uint32_t Value) const; protected: - const Mapping *Pairs; - size_t NumPairs; + uint64_t FeatureBits; + const Mapping *Mappings; + size_t NumMappings; uint32_t TooBigImm; + bool hasFeature(uint64_t SubTargetFeature) const { + return SubTargetFeature == 0 || (SubTargetFeature & FeatureBits) != 0; + } }; namespace AArch64AT { @@ -317,9 +324,9 @@ }; struct ATMapper : AArch64NamedImmMapper { - const static Mapping ATPairs[]; + const static Mapping ATMappings[]; - ATMapper(); + ATMapper(uint64_t SubTargetFeatureBits); }; } @@ -341,9 +348,9 @@ }; struct DBarrierMapper : AArch64NamedImmMapper { - const static Mapping DBarrierPairs[]; + const static Mapping DBarrierMappings[]; - DBarrierMapper(); + DBarrierMapper(uint64_t SubTargetFeatureBits); }; } @@ -361,9 +368,9 @@ }; struct DCMapper : AArch64NamedImmMapper { - const static Mapping DCPairs[]; + const static Mapping DCMappings[]; - DCMapper(); + DCMapper(uint64_t SubTargetFeatureBits); }; } @@ -378,9 +385,9 @@ struct ICMapper : AArch64NamedImmMapper { - const static Mapping ICPairs[]; + const static Mapping ICMappings[]; - ICMapper(); + ICMapper(uint64_t SubTargetFeatureBits); }; static inline bool NeedsRegister(ICValues Val) { @@ -394,9 +401,9 @@ SY = 0xf }; struct ISBMapper : AArch64NamedImmMapper { - const static Mapping ISBPairs[]; + const static Mapping ISBMappings[]; - ISBMapper(); + ISBMapper(uint64_t SubTargetFeatureBits); }; } @@ -424,9 +431,9 @@ }; struct PRFMMapper : AArch64NamedImmMapper { - const static Mapping PRFMPairs[]; + const static Mapping PRFMMappings[]; - PRFMMapper(); + PRFMMapper(uint64_t SubTargetFeatureBits); }; } @@ -435,13 +442,14 @@ Invalid = -1, SPSel = 0x05, DAIFSet = 0x1e, - DAIFClr = 0x1f + DAIFClr = 0x1f, + PAN = 0x04, 
}; struct PStateMapper : AArch64NamedImmMapper { - const static Mapping PStatePairs[]; + const static Mapping PStateMappings[]; - PStateMapper(); + PStateMapper(uint64_t SubTargetFeatureBits); }; } @@ -1122,10 +1130,47 @@ ICH_LR13_EL2 = 0xe66d, // 11 100 1100 1101 101 ICH_LR14_EL2 = 0xe66e, // 11 100 1100 1101 110 ICH_LR15_EL2 = 0xe66f, // 11 100 1100 1101 111 - }; - // Cyclone specific system registers - enum CycloneSysRegValues { + // Privileged Access Never extension specific system registers + PAN = 0xc213, // 11 000 0100 0010 011 + + // Limited Ordering Regions extension system registers + LORSA_EL1 = 0xc520, // 11 000 1010 0100 000 + LOREA_EL1 = 0xc521, // 11 000 1010 0100 001 + LORN_EL1 = 0xc522, // 11 000 1010 0100 010 + LORC_EL1 = 0xc523, // 11 000 1010 0100 011 + LORID_EL1 = 0xc527, // 11 000 1010 0100 111 + + // Virtualization host extensions system registers + TTBR1_EL2 = 0xe101, // 11 100 0010 0000 001 + CONTEXTIDR_EL2 = 0xe681, // 11 100 1101 0000 001 + CNTHV_TVAL_EL2 = 0xe718, // 11 100 1110 0011 000 + CNTHV_CVAL_EL2 = 0xe71a, // 11 100 1110 0011 010 + CNTHV_CTL_EL2 = 0xe719, // 11 100 1110 0011 001 + SCTLR_EL12 = 0xe880, // 11 101 0001 0000 000 + CPACR_EL12 = 0xe882, // 11 101 0001 0000 010 + TTBR0_EL12 = 0xe900, // 11 101 0010 0000 000 + TTBR1_EL12 = 0xe901, // 11 101 0010 0000 001 + TCR_EL12 = 0xe902, // 11 101 0010 0000 010 + AFSR0_EL12 = 0xea88, // 11 101 0101 0001 000 + AFSR1_EL12 = 0xea89, // 11 101 0101 0001 001 + ESR_EL12 = 0xea90, // 11 101 0101 0010 000 + FAR_EL12 = 0xeb00, // 11 101 0110 0000 000 + MAIR_EL12 = 0xed10, // 11 101 1010 0010 000 + AMAIR_EL12 = 0xed18, // 11 101 1010 0011 000 + VBAR_EL12 = 0xee00, // 11 101 1100 0000 000 + CONTEXTIDR_EL12 = 0xee81, // 11 101 1101 0000 001 + CNTKCTL_EL12 = 0xef08, // 11 101 1110 0001 000 + CNTP_TVAL_EL02 = 0xef10, // 11 101 1110 0010 000 + CNTP_CTL_EL02 = 0xef11, // 11 101 1110 0010 001 + CNTP_CVAL_EL02 = 0xef12, // 11 101 1110 0010 010 + CNTV_TVAL_EL02 = 0xef18, // 11 101 1110 0011 000 + 
CNTV_CTL_EL02 = 0xef19, // 11 101 1110 0011 001 + CNTV_CVAL_EL02 = 0xef1a, // 11 101 1110 0011 010 + SPSR_EL12 = 0xea00, // 11 101 0100 0000 000 + ELR_EL12 = 0xea01, // 11 101 0100 0000 001 + + // Cyclone specific system registers CPM_IOACC_CTL_EL3 = 0xff90 }; @@ -1134,25 +1179,28 @@ // burdening the common AArch64NamedImmMapper with abstractions only needed in // this one case. struct SysRegMapper { - static const AArch64NamedImmMapper::Mapping SysRegPairs[]; - static const AArch64NamedImmMapper::Mapping CycloneSysRegPairs[]; + static const AArch64NamedImmMapper::Mapping SysRegMappings[]; - const AArch64NamedImmMapper::Mapping *InstPairs; - size_t NumInstPairs; + const AArch64NamedImmMapper::Mapping *InstMappings; + size_t NumInstMappings; uint64_t FeatureBits; + bool hasFeature(uint64_t SubTargetFeature) const { + return SubTargetFeature == 0 || (SubTargetFeature & FeatureBits) != 0; + } + SysRegMapper(uint64_t FeatureBits) : FeatureBits(FeatureBits) { } uint32_t fromString(StringRef Name, bool &Valid) const; std::string toString(uint32_t Bits) const; }; struct MSRMapper : SysRegMapper { - static const AArch64NamedImmMapper::Mapping MSRPairs[]; + static const AArch64NamedImmMapper::Mapping MSRMappings[]; MSRMapper(uint64_t FeatureBits); }; struct MRSMapper : SysRegMapper { - static const AArch64NamedImmMapper::Mapping MRSPairs[]; + static const AArch64NamedImmMapper::Mapping MRSMappings[]; MRSMapper(uint64_t FeatureBits); }; @@ -1197,9 +1245,9 @@ }; struct TLBIMapper : AArch64NamedImmMapper { - const static Mapping TLBIPairs[]; + const static Mapping TLBIMappings[]; - TLBIMapper(); + TLBIMapper(uint64_t SubTargetFeatureBits); }; static inline bool NeedsRegister(TLBIValues Val) { Index: lib/Target/AArch64/Utils/AArch64BaseInfo.cpp =================================================================== --- lib/Target/AArch64/Utils/AArch64BaseInfo.cpp +++ lib/Target/AArch64/Utils/AArch64BaseInfo.cpp @@ -19,10 +19,10 @@ using namespace llvm; StringRef 
AArch64NamedImmMapper::toString(uint32_t Value, bool &Valid) const { - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Value == Value) { + for (unsigned i = 0; i < NumMappings; ++i) { + if (hasFeature(Mappings[i].SubTargetFeature) && Mappings[i].Value == Value) { Valid = true; - return Pairs[i].Name; + return Mappings[i].Name; } } @@ -32,10 +32,10 @@ uint32_t AArch64NamedImmMapper::fromString(StringRef Name, bool &Valid) const { std::string LowerCaseName = Name.lower(); - for (unsigned i = 0; i < NumPairs; ++i) { - if (Pairs[i].Name == LowerCaseName) { + for (unsigned i = 0; i < NumMappings; ++i) { + if (hasFeature(Mappings[i].SubTargetFeature) && Mappings[i].Name == LowerCaseName) { Valid = true; - return Pairs[i].Value; + return Mappings[i].Value; } } @@ -47,7 +47,7 @@ return Value < TooBigImm; } -const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64AT::ATMapper::ATMappings[] = { {"s1e1r", S1E1R}, {"s1e2r", S1E2R}, {"s1e3r", S1E3R}, @@ -62,10 +62,10 @@ {"s12e0w", S12E0W}, }; -AArch64AT::ATMapper::ATMapper() - : AArch64NamedImmMapper(ATPairs, 0) {} +AArch64AT::ATMapper::ATMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, ATMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DB::DBarrierMapper::DBarrierMappings[] = { {"oshld", OSHLD}, {"oshst", OSHST}, {"osh", OSH}, @@ -80,10 +80,10 @@ {"sy", SY} }; -AArch64DB::DBarrierMapper::DBarrierMapper() - : AArch64NamedImmMapper(DBarrierPairs, 16u) {} +AArch64DB::DBarrierMapper::DBarrierMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, DBarrierMappings, 16u) {} -const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64DC::DCMapper::DCMappings[] = { {"zva", ZVA}, {"ivac", IVAC}, {"isw", ISW}, @@ -94,26 +94,26 @@ {"cisw", CISW} }; 
-AArch64DC::DCMapper::DCMapper() - : AArch64NamedImmMapper(DCPairs, 0) {} +AArch64DC::DCMapper::DCMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, DCMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64IC::ICMapper::ICMappings[] = { {"ialluis", IALLUIS}, {"iallu", IALLU}, {"ivau", IVAU} }; -AArch64IC::ICMapper::ICMapper() - : AArch64NamedImmMapper(ICPairs, 0) {} +AArch64IC::ICMapper::ICMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, ICMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64ISB::ISBMapper::ISBMappings[] = { {"sy", SY}, }; -AArch64ISB::ISBMapper::ISBMapper() - : AArch64NamedImmMapper(ISBPairs, 16) {} +AArch64ISB::ISBMapper::ISBMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, ISBMappings, 16) {} -const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PRFM::PRFMMapper::PRFMMappings[] = { {"pldl1keep", PLDL1KEEP}, {"pldl1strm", PLDL1STRM}, {"pldl2keep", PLDL2KEEP}, @@ -134,19 +134,20 @@ {"pstl3strm", PSTL3STRM} }; -AArch64PRFM::PRFMMapper::PRFMMapper() - : AArch64NamedImmMapper(PRFMPairs, 32) {} +AArch64PRFM::PRFMMapper::PRFMMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, PRFMMappings, 32) {} -const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStatePairs[] = { +const AArch64NamedImmMapper::Mapping AArch64PState::PStateMapper::PStateMappings[] = { {"spsel", SPSel}, {"daifset", DAIFSet}, - {"daifclr", DAIFClr} + {"daifclr", DAIFClr}, + {"pan", PAN, AArch64::FeatureV8_1a}, }; -AArch64PState::PStateMapper::PStateMapper() - : AArch64NamedImmMapper(PStatePairs, 0) {} +AArch64PState::PStateMapper::PStateMapper(uint64_t SubTargetFeatureBits) + : 
AArch64NamedImmMapper(SubTargetFeatureBits, PStateMappings, 0) {} -const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MRSMapper::MRSMappings[] = { {"mdccsr_el0", MDCCSR_EL0}, {"dbgdtrrx_el0", DBGDTRRX_EL0}, {"mdrar_el1", MDRAR_EL1}, @@ -245,13 +246,13 @@ {"ich_elsr_el2", ICH_ELSR_EL2} }; -AArch64SysReg::MRSMapper::MRSMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MRSPairs[0]; - NumInstPairs = llvm::array_lengthof(MRSPairs); +AArch64SysReg::MRSMapper::MRSMapper(uint64_t SubTargetFeatureBits) + : SysRegMapper(SubTargetFeatureBits) { + InstMappings = &MRSMappings[0]; + NumInstMappings = llvm::array_lengthof(MRSMappings); } -const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::MSRMapper::MSRMappings[] = { {"dbgdtrtx_el0", DBGDTRTX_EL0}, {"oslar_el1", OSLAR_EL1}, {"pmswinc_el0", PMSWINC_EL0}, @@ -266,17 +267,20 @@ {"icc_dir_el1", ICC_DIR_EL1}, {"icc_sgi1r_el1", ICC_SGI1R_EL1}, {"icc_asgi1r_el1", ICC_ASGI1R_EL1}, - {"icc_sgi0r_el1", ICC_SGI0R_EL1} + {"icc_sgi0r_el1", ICC_SGI0R_EL1}, + + // Privileged Access Never extension specific system registers + {"pan", PAN, AArch64::FeatureV8_1a}, }; -AArch64SysReg::MSRMapper::MSRMapper(uint64_t FeatureBits) - : SysRegMapper(FeatureBits) { - InstPairs = &MSRPairs[0]; - NumInstPairs = llvm::array_lengthof(MSRPairs); +AArch64SysReg::MSRMapper::MSRMapper(uint64_t SubTargetFeatureBits) + : SysRegMapper(SubTargetFeatureBits) { + InstMappings = &MSRMappings[0]; + NumInstMappings = llvm::array_lengthof(MSRMappings); } -const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64SysReg::SysRegMapper::SysRegMappings[] = { {"osdtrrx_el1", OSDTRRX_EL1}, {"osdtrtx_el1", OSDTRTX_EL1}, {"teecr32_el1", TEECR32_EL1}, @@ -752,12 +756,47 @@ {"ich_lr12_el2", ICH_LR12_EL2}, {"ich_lr13_el2", 
ICH_LR13_EL2}, {"ich_lr14_el2", ICH_LR14_EL2}, - {"ich_lr15_el2", ICH_LR15_EL2} -}; - -const AArch64NamedImmMapper::Mapping -AArch64SysReg::SysRegMapper::CycloneSysRegPairs[] = { - {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3} + {"ich_lr15_el2", ICH_LR15_EL2}, + {"cpm_ioacc_ctl_el3", CPM_IOACC_CTL_EL3, AArch64::ProcCyclone}, + + // Privileged Access Never extension specific system registers + {"pan", PAN, AArch64::FeatureV8_1a}, + + // Limited Ordering Regions extension system registers + {"lorsa_el1", LORSA_EL1, AArch64::FeatureV8_1a}, + {"lorea_el1", LOREA_EL1, AArch64::FeatureV8_1a}, + {"lorn_el1", LORN_EL1, AArch64::FeatureV8_1a}, + {"lorc_el1", LORC_EL1, AArch64::FeatureV8_1a}, + {"lorid_el1", LORID_EL1, AArch64::FeatureV8_1a}, + + // Virtualization host extensions system registers + {"ttbr1_el2", TTBR1_EL2, AArch64::FeatureV8_1a}, + {"contextidr_el2", CONTEXTIDR_EL2, AArch64::FeatureV8_1a}, + {"cnthv_tval_el2", CNTHV_TVAL_EL2, AArch64::FeatureV8_1a}, + {"cnthv_cval_el2", CNTHV_CVAL_EL2, AArch64::FeatureV8_1a}, + {"cnthv_ctl_el2", CNTHV_CTL_EL2, AArch64::FeatureV8_1a}, + {"sctlr_el12", SCTLR_EL12, AArch64::FeatureV8_1a}, + {"cpacr_el12", CPACR_EL12, AArch64::FeatureV8_1a}, + {"ttbr0_el12", TTBR0_EL12, AArch64::FeatureV8_1a}, + {"ttbr1_el12", TTBR1_EL12, AArch64::FeatureV8_1a}, + {"tcr_el12", TCR_EL12, AArch64::FeatureV8_1a}, + {"afsr0_el12", AFSR0_EL12, AArch64::FeatureV8_1a}, + {"afsr1_el12", AFSR1_EL12, AArch64::FeatureV8_1a}, + {"esr_el12", ESR_EL12, AArch64::FeatureV8_1a}, + {"far_el12", FAR_EL12, AArch64::FeatureV8_1a}, + {"mair_el12", MAIR_EL12, AArch64::FeatureV8_1a}, + {"amair_el12", AMAIR_EL12, AArch64::FeatureV8_1a}, + {"vbar_el12", VBAR_EL12, AArch64::FeatureV8_1a}, + {"contextidr_el12", CONTEXTIDR_EL12, AArch64::FeatureV8_1a}, + {"cntkctl_el12", CNTKCTL_EL12, AArch64::FeatureV8_1a}, + {"cntp_tval_el02", CNTP_TVAL_EL02, AArch64::FeatureV8_1a}, + {"cntp_ctl_el02", CNTP_CTL_EL02, AArch64::FeatureV8_1a}, + {"cntp_cval_el02", CNTP_CVAL_EL02, 
AArch64::FeatureV8_1a}, + {"cntv_tval_el02", CNTV_TVAL_EL02, AArch64::FeatureV8_1a}, + {"cntv_ctl_el02", CNTV_CTL_EL02, AArch64::FeatureV8_1a}, + {"cntv_cval_el02", CNTV_CVAL_EL02, AArch64::FeatureV8_1a}, + {"spsr_el12", SPSR_EL12, AArch64::FeatureV8_1a}, + {"elr_el12", ELR_EL12, AArch64::FeatureV8_1a}, }; uint32_t @@ -765,29 +804,21 @@ std::string NameLower = Name.lower(); // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Name == NameLower) { + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (hasFeature(SysRegMappings[i].SubTargetFeature) && + SysRegMappings[i].Name == NameLower) { Valid = true; - return SysRegPairs[i].Value; - } - } - - // Next search for target specific registers - if (FeatureBits & AArch64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Name == NameLower) { - Valid = true; - return CycloneSysRegPairs[i].Value; - } + return SysRegMappings[i].Value; } } // Now try the instruction-specific registers (either read-only or // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Name == NameLower) { + for (unsigned i = 0; i < NumInstMappings; ++i) { + if (hasFeature(InstMappings[i].SubTargetFeature) && + InstMappings[i].Name == NameLower) { Valid = true; - return InstPairs[i].Value; + return InstMappings[i].Value; } } @@ -816,26 +847,19 @@ std::string AArch64SysReg::SysRegMapper::toString(uint32_t Bits) const { // First search the registers shared by all - for (unsigned i = 0; i < array_lengthof(SysRegPairs); ++i) { - if (SysRegPairs[i].Value == Bits) { - return SysRegPairs[i].Name; - } - } - - // Next search for target specific registers - if (FeatureBits & AArch64::ProcCyclone) { - for (unsigned i = 0; i < array_lengthof(CycloneSysRegPairs); ++i) { - if (CycloneSysRegPairs[i].Value == Bits) { - return CycloneSysRegPairs[i].Name; - } + for (unsigned i = 0; i < array_lengthof(SysRegMappings); ++i) { + if (hasFeature(SysRegMappings[i].SubTargetFeature) && + SysRegMappings[i].Value == Bits) { + return SysRegMappings[i].Name; } } // Now try the instruction-specific registers (either read-only or // write-only). 
- for (unsigned i = 0; i < NumInstPairs; ++i) { - if (InstPairs[i].Value == Bits) { - return InstPairs[i].Name; + for (unsigned i = 0; i < NumInstMappings; ++i) { + if (hasFeature(InstMappings[i].SubTargetFeature) && + InstMappings[i].Value == Bits) { + return InstMappings[i].Name; } } @@ -850,7 +874,7 @@ + "_c" + utostr(CRm) + "_" + utostr(Op2); } -const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIPairs[] = { +const AArch64NamedImmMapper::Mapping AArch64TLBI::TLBIMapper::TLBIMappings[] = { {"ipas2e1is", IPAS2E1IS}, {"ipas2le1is", IPAS2LE1IS}, {"vmalle1is", VMALLE1IS}, @@ -885,5 +909,5 @@ {"vaale1", VAALE1} }; -AArch64TLBI::TLBIMapper::TLBIMapper() - : AArch64NamedImmMapper(TLBIPairs, 0) {} +AArch64TLBI::TLBIMapper::TLBIMapper(uint64_t SubTargetFeatureBits) + : AArch64NamedImmMapper(SubTargetFeatureBits, TLBIMappings, 0) {} Index: lib/Target/ARM/ARM.td =================================================================== --- lib/Target/ARM/ARM.td +++ lib/Target/ARM/ARM.td @@ -175,6 +175,9 @@ "Support ARM v8 instructions", [HasV7Ops, FeatureVirtualization, FeatureMP]>; +def FeatureV8_1a : SubtargetFeature<"v8.1a", "HasV8_1a", "true", + "Support ARM v8.1a instructions", + [HasV8Ops, FeatureAClass, FeatureCRC]>; //===----------------------------------------------------------------------===// // ARM Processors supported. 
@@ -449,6 +452,14 @@ FeatureDB,FeatureDSPThumb2, FeatureHasRAS, FeatureZCZeroing]>; +// V8.1 Processors +def : ProcNoItin<"generic-armv8.1-a", [HasV8Ops, FeatureV8_1a, + FeatureDB, FeatureFPARMv8, + FeatureNEON, FeatureDSPThumb2, + FeatureHWDiv, FeatureHWDivARM, + FeatureTrustZone, FeatureT2XtPk, + FeatureCrypto]>; + //===----------------------------------------------------------------------===// // Register File Description //===----------------------------------------------------------------------===// Index: lib/Target/ARM/ARMAsmPrinter.cpp =================================================================== --- lib/Target/ARM/ARMAsmPrinter.cpp +++ lib/Target/ARM/ARMAsmPrinter.cpp @@ -666,7 +666,7 @@ std::string CPUString = STI.getCPUString(); - if (CPUString != "generic") { + if (CPUString.find("generic")) { // FIXME: remove krait check when GNU tools support krait cpu if (STI.isKrait()) { ATS.emitTextAttribute(ARMBuildAttrs::CPU_name, "cortex-a9"); @@ -720,7 +720,8 @@ // Emit Tag_Advanced_SIMD_arch for ARMv8 architecture if (STI.hasV8Ops()) ATS.emitAttribute(ARMBuildAttrs::Advanced_SIMD_arch, - ARMBuildAttrs::AllowNeonARMv8); + STI.hasV8_1a() ? 
ARMBuildAttrs::AllowNeonARMv8_1a: + ARMBuildAttrs::AllowNeonARMv8); } else { if (STI.hasFPARMv8()) // FPv5 and FP-ARMv8 have the same instructions, so are modeled as one Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -226,6 +226,8 @@ AssemblerPredicate<"FeatureCrypto", "crypto">; def HasCRC : Predicate<"Subtarget->hasCRC()">, AssemblerPredicate<"FeatureCRC", "crc">; +def HasV8_1a : Predicate<"Subtarget->hasV8_1a()">, + AssemblerPredicate<"FeatureV8_1a", "v8.1a">; def HasFP16 : Predicate<"Subtarget->hasFP16()">, AssemblerPredicate<"FeatureFP16","half-float">; def HasDivide : Predicate<"Subtarget->hasDivide()">, @@ -1415,7 +1417,8 @@ let isCompare = 1, Defs = [CPSR] in { multiclass AI1_cmp_irs opcod, string opc, InstrItinClass iii, InstrItinClass iir, InstrItinClass iis, - PatFrag opnode, bit Commutable = 0> { + PatFrag opnode, bit Commutable = 0, + string rrDecoderMethod = ""> { def ri : AI1, @@ -1443,6 +1446,7 @@ let Inst{15-12} = 0b0000; let Inst{11-4} = 0b00000000; let Inst{3-0} = Rm; + let DecoderMethod = rrDecoderMethod; let Unpredictable{15-12} = 0b1111; } @@ -4261,6 +4265,30 @@ def CRC32CW : AI_crc32<1, 0b10, "cw", int_arm_crc32cw>; //===----------------------------------------------------------------------===// +// ARMv8.1a Privilege Access Never extension +// +// SETPAN #imm1 + +def SETPAN : AInoP<(outs), (ins imm0_1:$imm), MiscFrm, NoItinerary, "setpan", + "\t$imm", []>, Requires<[IsARM, HasV8, HasV8_1a]> { + bits<1> imm; + + let Inst{31-28} = 0b1111; + let Inst{27-20} = 0b00010001; + let Inst{19-16} = 0b0000; + let Inst{15-10} = 0b000000; + let Inst{9} = imm; + let Inst{8} = 0b0; + let Inst{7-4} = 0b0000; + let Inst{3-0} = 0b0000; + + let Unpredictable{19-16} = 0b1111; + let Unpredictable{15-10} = 0b111111; + let Unpredictable{8} = 0b1; + let Unpredictable{3-0} = 0b1111; +} + 
+//===----------------------------------------------------------------------===// // Comparison Instructions... // @@ -4364,7 +4392,8 @@ // Note that TST/TEQ don't set all the same flags that CMP does! defm TST : AI1_cmp_irs<0b1000, "tst", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, - BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1>; + BinOpFrag<(ARMcmpZ (and_su node:$LHS, node:$RHS), 0)>, 1, + "DecodeTSTInstruction">; defm TEQ : AI1_cmp_irs<0b1001, "teq", IIC_iTSTi, IIC_iTSTr, IIC_iTSTsr, BinOpFrag<(ARMcmpZ (xor_su node:$LHS, node:$RHS), 0)>, 1>; Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -2790,7 +2790,7 @@ imm:$lane)))))))]>; class N3VDMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, - ValueType Ty, SDNode MulOp, SDNode ShOp> + ValueType Ty, SDPatternOperator MulOp, SDPatternOperator ShOp> : N3VLane16<0, 1, op21_20, op11_8, 1, 0, (outs DPR:$Vd), (ins DPR:$src1, DPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -2826,7 +2826,7 @@ class N3VQMulOpSL16 op21_20, bits<4> op11_8, InstrItinClass itin, string OpcodeStr, string Dt, ValueType ResTy, ValueType OpTy, - SDNode MulOp, SDNode ShOp> + SDPatternOperator MulOp, SDPatternOperator ShOp> : N3VLane16<1, 1, op21_20, op11_8, 1, 0, (outs QPR:$Vd), (ins QPR:$src1, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$lane), @@ -3674,7 +3674,7 @@ multiclass N3VMulOpSL_HS op11_8, InstrItinClass itinD16, InstrItinClass itinD32, InstrItinClass itinQ16, InstrItinClass itinQ32, - string OpcodeStr, string Dt, SDNode ShOp> { + string OpcodeStr, string Dt, SDPatternOperator ShOp> { def v4i16 : N3VDMulOpSL16<0b01, op11_8, itinD16, OpcodeStr, !strconcat(Dt, "16"), v4i16, mul, ShOp>; def v2i32 : N3VDMulOpSL<0b10, op11_8, itinD32, @@ -3711,27 +3711,38 @@ } // Neon 3-argument intrinsics, -// element sizes of 8, 16 and 32 bits: -multiclass N3VInt3_QHS op11_8, bit op4, - InstrItinClass itinD, 
InstrItinClass itinQ, +// element sizes of 16 and 32 bits: +multiclass N3VInt3_HS op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, string OpcodeStr, string Dt, SDPatternOperator IntOp> { // 64-bit vector types. - def v8i8 : N3VDInt3; - def v4i16 : N3VDInt3; - def v2i32 : N3VDInt3; // 128-bit vector types. - def v16i8 : N3VQInt3; - def v8i16 : N3VQInt3; - def v4i32 : N3VQInt3; } +// element sizes of 8, 16 and 32 bits: +multiclass N3VInt3_QHS op11_8, bit op4, + InstrItinClass itinD16, InstrItinClass itinD32, + InstrItinClass itinQ16, InstrItinClass itinQ32, + string OpcodeStr, string Dt, SDPatternOperator IntOp> + :N3VInt3_HS { + // 64-bit vector types. + def v8i8 : N3VDInt3; + // 128-bit vector types. + def v16i8 : N3VQInt3; +} // Neon Long Multiply-Op vector operations, // element sizes of 8, 16 and 32 bits: @@ -4305,6 +4316,147 @@ defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>; defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>; +let Predicates = [HasNEON, HasV8_1a] in { + // v8.1a Neon Rounding Double Multiply-Op vector operations, + // VQRDMLAH : Vector Saturating Rounding Doubling Multiply Accumulate Long + // (Q += D * D) + defm VQRDMLAH : N3VInt3_HS<1, 0, 0b1011, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLAHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLAHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLAHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 
(int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLAHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLAHsl : N3VMulOpSL_HS<0b1110, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlah", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqadds + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLAHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, + imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqadds + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLAHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqadds + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLAHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqadds + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLAHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; + + // VQRDMLSH : Vector Saturating Rounding Doubling Multiply Subtract Long + // (Q -= D * D) + defm VQRDMLSH : N3VInt3_HS<1, 0, 0b1100, 1, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh (v4i16 DPR:$Vn), + (v4i16 DPR:$Vm))))), + (v4i16 (VQRDMLSHv4i16 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 
(int_arm_neon_vqrdmulh (v2i32 DPR:$Vn), + (v2i32 DPR:$Vm))))), + (v2i32 (VQRDMLSHv2i32 DPR:$src1, DPR:$Vn, DPR:$Vm))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh (v8i16 QPR:$Vn), + (v8i16 QPR:$Vm))))), + (v8i16 (VQRDMLSHv8i16 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$Vn), + (v4i32 QPR:$Vm))))), + (v4i32 (VQRDMLSHv4i32 QPR:$src1, QPR:$Vn, QPR:$Vm))>; + + defm VQRDMLSHsl : N3VMulOpSL_HS<0b1111, IIC_VMACi16D, IIC_VMACi32D, + IIC_VMACi16Q, IIC_VMACi32Q, "vqrdmlsh", "s", + null_frag>; + def : Pat<(v4i16 (int_arm_neon_vqsubs + (v4i16 DPR:$src1), + (v4i16 (int_arm_neon_vqrdmulh + (v4i16 DPR:$Vn), + (v4i16 (NEONvduplane (v4i16 DPR_8:$Vm), + imm:$lane)))))), + (v4i16 (VQRDMLSHslv4i16 DPR:$src1, DPR:$Vn, DPR_8:$Vm, imm:$lane))>; + def : Pat<(v2i32 (int_arm_neon_vqsubs + (v2i32 DPR:$src1), + (v2i32 (int_arm_neon_vqrdmulh + (v2i32 DPR:$Vn), + (v2i32 (NEONvduplane (v2i32 DPR_VFP2:$Vm), + imm:$lane)))))), + (v2i32 (VQRDMLSHslv2i32 DPR:$src1, DPR:$Vn, DPR_VFP2:$Vm, + imm:$lane))>; + def : Pat<(v8i16 (int_arm_neon_vqsubs + (v8i16 QPR:$src1), + (v8i16 (int_arm_neon_vqrdmulh + (v8i16 QPR:$src2), + (v8i16 (NEONvduplane (v8i16 QPR:$src3), + imm:$lane)))))), + (v8i16 (VQRDMLSHslv8i16 (v8i16 QPR:$src1), + (v8i16 QPR:$src2), + (v4i16 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i16_reg imm:$lane))), + (SubReg_i16_lane imm:$lane)))>; + def : Pat<(v4i32 (int_arm_neon_vqsubs + (v4i32 QPR:$src1), + (v4i32 (int_arm_neon_vqrdmulh + (v4i32 QPR:$src2), + (v4i32 (NEONvduplane (v4i32 QPR:$src3), + imm:$lane)))))), + (v4i32 (VQRDMLSHslv4i32 (v4i32 QPR:$src1), + (v4i32 QPR:$src2), + (v2i32 (EXTRACT_SUBREG + QPR:$src3, + (DSubReg_i32_reg imm:$lane))), + (SubReg_i32_lane imm:$lane)))>; +} // VQDMLAL : Vector Saturating Doubling Multiply Accumulate Long (Q += D * D) defm VQDMLAL : N3VLInt3_HS<0, 1, 0b1001, 0, IIC_VMACi16D, IIC_VMACi32D, "vqdmlal", "s", null_frag>; 
Index: lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- lib/Target/ARM/ARMInstrThumb2.td +++ lib/Target/ARM/ARMInstrThumb2.td @@ -4281,6 +4281,23 @@ //===----------------------------------------------------------------------===// +// ARMv8.1 Privilege Access Never extension +// +// SETPAN #imm1 + +def t2SETPAN : T1I<(outs), (ins imm0_1:$imm), NoItinerary, "setpan\t$imm", []>, + T1Misc<0b0110000>, Requires<[IsThumb2, HasV8, HasV8_1a]> { + bits<1> imm; + + let Inst{4} = 0b1; + let Inst{3} = imm; + let Inst{2-0} = 0b000; + + let Unpredictable{4} = 0b1; + let Unpredictable{2-0} = 0b111; +} + +//===----------------------------------------------------------------------===// // Non-Instruction Patterns // Index: lib/Target/ARM/ARMSubtarget.h =================================================================== --- lib/Target/ARM/ARMSubtarget.h +++ lib/Target/ARM/ARMSubtarget.h @@ -182,6 +182,9 @@ /// HasCRC - if true, processor supports CRC instructions bool HasCRC; + /// HasV8_1a - if true, the processor has V8.1a: five extensions over V8a + bool HasV8_1a; + /// If true, the instructions "vmov.i32 d0, #0" and "vmov.i32 q0, #0" are /// particularly effective at zeroing a VFP register. 
bool HasZeroCycleZeroing; @@ -310,6 +313,7 @@ bool hasNEON() const { return HasNEON; } bool hasCrypto() const { return HasCrypto; } bool hasCRC() const { return HasCRC; } + bool hasV8_1a() const { return HasV8_1a; } bool hasVirtualization() const { return HasVirtualization; } bool useNEONForSinglePrecisionFP() const { return hasNEON() && UseNEONForSinglePrecisionFP; Index: lib/Target/ARM/ARMSubtarget.cpp =================================================================== --- lib/Target/ARM/ARMSubtarget.cpp +++ lib/Target/ARM/ARMSubtarget.cpp @@ -166,6 +166,7 @@ HasTrustZone = false; HasCrypto = false; HasCRC = false; + HasV8_1a = false; HasZeroCycleZeroing = false; AllowsUnalignedMem = false; Thumb2DSP = false; Index: lib/Target/ARM/AsmParser/ARMAsmParser.cpp =================================================================== --- lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -276,6 +276,9 @@ bool hasD16() const { return STI.getFeatureBits() & ARM::FeatureD16; } + bool hasV8_1a() const { + return STI.getFeatureBits() & ARM::FeatureV8_1a; + } void SwitchMode() { uint64_t FB = ComputeAvailableFeatures(STI.ToggleFeature(ARM::ModeThumb)); @@ -5442,6 +5445,7 @@ Mnemonic == "vcvtn" || Mnemonic == "vcvtp" || Mnemonic == "vcvtm" || Mnemonic == "vrinta" || Mnemonic == "vrintn" || Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic.startswith("aes") || Mnemonic == "hvc" || + Mnemonic == "setpan" || Mnemonic.startswith("sha1") || Mnemonic.startswith("sha256") || (FullInst.startswith("vmull") && FullInst.endswith(".p64"))) { // These mnemonics are never predicable Index: lib/Target/ARM/Disassembler/ARMDisassembler.cpp =================================================================== --- lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -212,6 +212,10 @@ uint64_t Address, const void *Decoder); static DecodeStatus DecodeCPSInstruction(MCInst &Inst, unsigned Insn, uint64_t 
Address, const void *Decoder); +static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); static DecodeStatus DecodeT2CPSInstruction(MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, @@ -2119,6 +2123,54 @@ return S; } +static DecodeStatus DecodeTSTInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Pred = fieldFromInstruction(Insn, 28, 4); + unsigned Rn = fieldFromInstruction(Insn, 16, 4); + unsigned Rm = fieldFromInstruction(Insn, 0, 4); + + if (Pred == 0xF) + return DecodeSETPANInstruction(Inst, Insn, Address, Decoder); + + if (!Check(S, DecodeGPRRegisterClass(Inst, Rn, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeGPRRegisterClass(Inst, Rm, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodePredicateOperand(Inst, Pred, Address, Decoder))) + return MCDisassembler::Fail; + + return S; +} + +static DecodeStatus DecodeSETPANInstruction(MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + DecodeStatus S = MCDisassembler::Success; + + unsigned Imm = fieldFromInstruction(Insn, 9, 1); + + const MCDisassembler *Dis = static_cast(Decoder); + uint64_t FeatureBits = Dis->getSubtargetInfo().getFeatureBits(); + if ((FeatureBits & ARM::FeatureV8_1a) == 0 || + (FeatureBits & ARM::HasV8Ops) == 0) + return MCDisassembler::Fail; + + // Decoder can be called from DecodeTST, which does not check the full + // encoding is valid. 
+ if (fieldFromInstruction(Insn, 20,12) != 0xf11 || + fieldFromInstruction(Insn, 4,4) != 0) + return MCDisassembler::Fail; + if (fieldFromInstruction(Insn, 10,10) != 0 || + fieldFromInstruction(Insn, 0,4) != 0) + S = MCDisassembler::SoftFail; + + Inst.setOpcode(ARM::SETPAN); + Inst.addOperand(MCOperand::CreateImm(Imm)); + + return S; +} + static DecodeStatus DecodeAddrModeImm12Operand(MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; Index: lib/Target/ARM/MCTargetDesc/ARMArchName.def =================================================================== --- lib/Target/ARM/MCTargetDesc/ARMArchName.def +++ lib/Target/ARM/MCTargetDesc/ARMArchName.def @@ -44,6 +44,8 @@ ARM_ARCH_ALIAS("armv7m", ARMV7M) ARM_ARCH_NAME("armv8-a", ARMV8A, "8-A", v8) ARM_ARCH_ALIAS("armv8a", ARMV8A) +ARM_ARCH_NAME("armv8.1-a", ARMV8_1A, "8.1-A", v8) +ARM_ARCH_ALIAS("armv8.1a", ARMV8_1A) ARM_ARCH_NAME("iwmmxt", IWMMXT, "iwmmxt", v5TE) ARM_ARCH_NAME("iwmmxt2", IWMMXT2, "iwmmxt2", v5TE) Index: lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp =================================================================== --- lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp +++ lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp @@ -817,6 +817,7 @@ break; case ARM::ARMV8A: + case ARM::ARMV8_1A: setAttributeItem(CPU_arch_profile, ApplicationProfile, false); setAttributeItem(ARM_ISA_use, Allowed, false); setAttributeItem(THUMB_ISA_use, AllowThumb32, false); @@ -914,9 +915,8 @@ setAttributeItem(ARMBuildAttrs::FP_arch, ARMBuildAttrs::AllowFPARMv8A, /* OverwriteExisting= */ false); - setAttributeItem(ARMBuildAttrs::Advanced_SIMD_arch, - ARMBuildAttrs::AllowNeonARMv8, - /* OverwriteExisting= */ false); + // 'Advanced_SIMD_arch' must be emitted not here, but within + // ARMAsmPrinter::emitAttributes(), depending on hasV8Ops() and hasV8_1a() break; case ARM::SOFTVFP: Index: lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp 
=================================================================== --- lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -153,6 +153,17 @@ // Use CPU to figure out the exact features ARMArchFeature = "+v8"; break; + case Triple::ARMSubArch_v8_1a: + if (NoCPU) + // v8.1a: FeatureDB, FeatureFPARMv8, FeatureNEON, FeatureDSPThumb2, + // FeatureMP, FeatureHWDiv, FeatureHWDivARM, FeatureTrustZone, + // FeatureT2XtPk, FeatureCrypto, FeatureCRC, FeatureV8_1a + ARMArchFeature = "+v8.1a,+db,+fp-armv8,+neon,+t2dsp,+mp,+hwdiv,+hwdiv-arm," + "+trustzone,+t2xtpk,+crypto,+crc"; + else + // Use CPU to figure out the exact features + ARMArchFeature = "+v8.1a"; + break; case Triple::ARMSubArch_v7m: isThumb = true; if (NoCPU) Index: test/CodeGen/AArch64/arm64-neon-v8.1a.ll =================================================================== --- /dev/null +++ test/CodeGen/AArch64/arm64-neon-v8.1a.ll @@ -0,0 +1,408 @@ +; RUN: llc < %s -verify-machineinstrs -march=arm64 | FileCheck %s --check-prefix=CHECK-V8a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a | FileCheck %s --check-prefix=CHECK-V81a +; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple + +declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x 
i32>) +declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16) + +declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>) +declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32) +declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16) + +;----------------------------------------------------------------------------- +; RDMA Vector +; test for SIMDThreeSameVectorSQRDMLxHTiedHS + +define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v4i16: + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.4h +; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2 + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v8i16: + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.8h +; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2 + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v2i32: + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.2s +; 
CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2 + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_v4i32: + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V81: sqrdmulh v1.4s, v1.4s, v2.4s +; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.4s +; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2 + ret <4 x i32> %retval +} + +define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v4i16: + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.4h +; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.4h +; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2 + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v8i16: + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.8h +; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.8h +; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2 + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v2i32: + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.2s +; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.2s +; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2 + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 
x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_v4i32: + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.4s +; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.4s +; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2 + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Vector, by element +; tests for vXiYY_indexed, vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied + +define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlah_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a : sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-V81a: sqrdmlah v0.4h, v1.4h, v2.h[3] +; CHECK-V81a-apple: sqrdmlah.4h v0, v1, v2[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlahq_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-V81a: sqrdmlah v0.8h, v1.8h, v2.h[2] +; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2[2] + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlah_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) 
+ %retval = call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-V81a: sqrdmlah v0.2s, v1.2s, v2.s[1] +; CHECK-V81a-apple: sqrdmlah.2s v0, v1, v2[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlahq_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-V81a: sqrdmlah v0.4s, v1.4s, v2.s[0] +; CHECK-V81a-apple: sqrdmlah.4s v0, v1, v2[0] + ret <4 x i32> %retval +} + +define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_sqrdmlsh_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.4h, v1.4h, v2.h[3] +; CHECK-V81a: sqrdmlsh v0.4h, v1.4h, v2.h[3] +; CHECK-V81a-apple: sqrdmlsh.4h v0, v1, v2[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) { +; CHECK-LABEL: test_sqrdmlshq_lane_s16: +entry: + %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK-V8a: sqrdmulh v1.8h, v1.8h, v2.h[2] +; CHECK-V81a: sqrdmlsh v0.8h, v1.8h, v2.h[2] +; CHECK-V81a-apple: sqrdmlsh.8h v0, v1, v2[2] + ret <8 x i16> %retval +} + +define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, 
<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlsh_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.2s, v1.2s, v2.s[1] +; CHECK-V81a: sqrdmlsh v0.2s, v1.2s, v2.s[1] +; CHECK-V81a-apple: sqrdmlsh.2s v0, v1, v2[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK-V8a: sqrdmulh v1.4s, v1.4s, v2.s[0] +; CHECK-V81a: sqrdmlsh v0.4s, v1.4s, v2.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v0, v1, v2[0] + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Vector, by element, extracted +; tests for "def : Pat" in SIMDIndexedSQRDMLxHSDTied + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +;; cHECK-LABEL: test_sqrdmlah_extracted_lane_s16: +;entry: +; %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer +; %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) +; %extract = extractelement <4 x i16> %prod, i64 0 +; %retval = call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %extract) +;; cHECK: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[0] +; ret i16 %retval +;} + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and 
replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +;; cHECK-LABEL: test_sqrdmlahq_extracted_lane_s16: +;entry: +; %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer +; %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) +; %extract = extractelement <8 x i16> %prod, i64 0 +; %retval = call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %extract) +;; cHECK: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[0] +; ret i16 %retval +;} + +define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlah v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlah.4s v2, v0, v1[0] + ret i32 %retval +} + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) { +;; 
cHECK-LABEL: test_sqrdmlsh_extracted_lane_s16: +;entry: +; %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer +; %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) +; %extract = extractelement <4 x i16> %prod, i64 0 +; %retval = call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %extract) +;; cHECK: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[0] +; ret i16 %retval +;} + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) { +;; cHECK-LABEL: test_sqrdmlshq_extracted_lane_s16: +;entry: +; %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer +; %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) +; %extract = extractelement <8 x i16> %prod, i64 0 +; %retval = call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %extract) +;; cHECK: sqrdmlah {{v[0-9]+}}.8h, v0.8h, v1.h[0] +; ret i16 %retval +;} + +define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer + %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %extract = extractelement <2 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.2s, v0.2s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.2s, v0.2s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.2s v2, v0, v1[0] + ret i32 %retval +} + +define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) { +; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32: +entry: + %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer + %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> 
%x, <4 x i32> %shuffle) + %extract = extractelement <4 x i32> %prod, i64 0 + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract) +; CHECK-V8a: sqrdmulh v0.4s, v0.4s, v1.s[0] +; CHECK-V81a: sqrdmlsh v2.4s, v0.4s, v1.s[0] +; CHECK-V81a-apple: sqrdmlsh.4s v2, v0, v1[0] + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar +; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlah_i16(i16 %acc, i16 %mhs, i16 %rhs) { +;; cHECK-LABEL: test_sqrdmlah_i16: +; %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs, i16 %rhs) +; %retval = call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %prod) +;; cHECK: sqrdmlah {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; ret i16 %retval +;} + +define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlah_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlsh_i16(i16 %acc, i16 %mhs, i16 %rhs) { +;; cHECK-LABEL: test_sqrdmlsh_i16: +; %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs, i16 %rhs) +; %retval = call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %prod) +;; cHECK: sqrdmlsh {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}} +; ret i16 %retval +;} + +define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) { +; CHECK-LABEL: test_sqrdmlsh_i32: + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 
%mhs, i32 %rhs) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} +; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}} + ret i32 %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar, by element +; tests for iYY_indexed in SIMDIndexedSQRDMLxHSDTied + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %mhs, <4 x i16> %rhs) { +;; cHECK-LABEL: test_sqrdmlah_extract_i32: +; %extract = extractelement <4 x i16> %rhs, i32 3 +; %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs, i16 %extract) +; %retval = call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %prod) +;; cHECK: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; ret i16 %retval +;} + +define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlah_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} + +; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886 +; uncomment this function, and replace "cHECK" for "CHECK" +;define i16 @test_sqrdmlsh_extract_i16(i16 %acc, i16 %mhs, <4 x i16> %rhs) { +;; cHECK-LABEL: test_sqrdmlsh_extract_i32: +; %extract = extractelement <4 x i16> %rhs, i32 3 +; %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs, i16 %extract) +; %retval = call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %prod) +;; cHECK: sqrdmlsh {{s[0-9]+}}, 
{{s[0-9]+}}, v0.s[3] +; ret i16 %retval +;} + +define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_sqrdmlsh_extract_i32: + %extract = extractelement <4 x i32> %rhs, i32 3 + %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs, i32 %extract) + %retval = call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %prod) +; CHECK-V8a: sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3] +; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3] + ret i32 %retval +} Index: test/CodeGen/ARM/build-attributes.ll =================================================================== --- test/CodeGen/ARM/build-attributes.ll +++ test/CodeGen/ARM/build-attributes.ll @@ -96,6 +96,9 @@ ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 | FileCheck %s --check-prefix=CORTEX-A72 ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=CORTEX-A72-FAST ; RUN: llc < %s -mtriple=armv8-linux-gnueabi -mcpu=cortex-a72 -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING +; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A +; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s --check-prefix=GENERIC-ARMV8_1-A-FAST +; RUN: llc < %s -mtriple=armv8.1a-linux-gnueabi -enable-sign-dependent-rounding-fp-math | FileCheck %s --check-prefix=DYN-ROUNDING ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 | FileCheck %s --check-prefix=CORTEX-A7-CHECK ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -enable-unsafe-fp-math -disable-fp-elim -enable-no-infs-fp-math -enable-no-nans-fp-math -fp-contract=fast | FileCheck %s 
--check-prefix=CORTEX-A7-CHECK-FAST ; RUN: llc < %s -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a7 -mattr=-vfp2,-vfp3,-vfp4,-neon | FileCheck %s --check-prefix=CORTEX-A7-NOFPU @@ -112,6 +115,10 @@ ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi | FileCheck %s --check-prefix=PCS-R9-USE ; RUN: llc < %s -mtriple=arm-none-linux-gnueabi -arm-reserve-r9 | FileCheck %s --check-prefix=PCS-R9-RESERVE +; ARMv8.1a (AArch32) +; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN +; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN +; RUN: llc < %s -mtriple=armv8.1a-none-linux-gnueabi | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; ARMv8a (AArch32) ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-no-strict-align | FileCheck %s --check-prefix=NO-STRICT-ALIGN ; RUN: llc < %s -mtriple=armv8-none-linux-gnueabi -mcpu=cortex-a57 -arm-strict-align | FileCheck %s --check-prefix=STRICT-ALIGN @@ -1153,6 +1160,35 @@ ; CORTEX-A72-FAST-NOT: .eabi_attribute 22 ; CORTEX-A72-FAST: .eabi_attribute 23, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 6, 14 +; GENERIC-ARMV8_1-A: .eabi_attribute 7, 65 +; GENERIC-ARMV8_1-A: .eabi_attribute 8, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 9, 2 +; GENERIC-ARMV8_1-A: .fpu crypto-neon-fp-armv8 +; GENERIC-ARMV8_1-A: .eabi_attribute 12, 4 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 19 +;; We default to IEEE 754 compliance +; GENERIC-ARMV8_1-A: .eabi_attribute 20, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 21, 1 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 22 +; GENERIC-ARMV8_1-A: .eabi_attribute 23, 3 +; GENERIC-ARMV8_1-A: .eabi_attribute 24, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 25, 1 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 27 +; GENERIC-ARMV8_1-A-NOT: .eabi_attribute 28 +; GENERIC-ARMV8_1-A: .eabi_attribute 36, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 38, 1 +; GENERIC-ARMV8_1-A: .eabi_attribute 42, 1 +; 
GENERIC-ARMV8_1-A-NOT: .eabi_attribute 44 +; GENERIC-ARMV8_1-A: .eabi_attribute 68, 3 + +; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 19 +;; GENERIC-ARMV8_1-A has the ARMv8 FP unit, which always flushes preserving sign. +; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 20, 2 +; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 21 +; GENERIC-ARMV8_1-A-FAST-NOT: .eabi_attribute 22 +; GENERIC-ARMV8_1-A-FAST: .eabi_attribute 23, 1 + ; RELOC-PIC: .eabi_attribute 15, 1 ; RELOC-PIC: .eabi_attribute 16, 1 ; RELOC-PIC: .eabi_attribute 17, 2 Index: test/CodeGen/ARM/neon-v8.1a.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/neon-v8.1a.ll @@ -0,0 +1,166 @@ +; RUN: llc < %s -mtriple=armv8 -mattr=+v8.1a | FileCheck %s + +;----------------------------------------------------------------------------- +; RDMA Vector + +declare <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32>, <4 x i32>) + +declare <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32>, <4 x i32>) + +declare <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16>, <4 x i16>) +declare <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16>, <8 x i16>) +declare <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32>, <2 x i32>) +declare <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32>, <4 x i32>) + +define <4 x i16> @test_vqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_vqrdmlah_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> 
%prod) +; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret <4 x i16> %retval +} + +define <8 x i16> @test_vqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_vqrdmlah_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} + ret <8 x i16> %retval +} + +define <2 x i32> @test_vqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_vqrdmlah_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret <2 x i32> %retval +} + +define <4 x i32> @test_vqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_vqrdmlah_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} + ret <4 x i32> %retval +} + +define <4 x i16> @test_vqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) { +; CHECK-LABEL: test_vqrdmlsh_v4i16: + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %mhs, <4 x i16> %rhs) + %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret <4 x i16> %retval +} + +define <8 x i16> @test_vqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) { +; CHECK-LABEL: test_vqrdmlsh_v8i16: + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs) + %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, 
{{q[0-9]+}}, {{q[0-9]+}} + ret <8 x i16> %retval +} + +define <2 x i32> @test_vqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) { +; CHECK-LABEL: test_vqrdmlsh_v2i32: + %prod = call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs) + %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}} + ret <2 x i32> %retval +} + +define <4 x i32> @test_vqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) { +; CHECK-LABEL: test_vqrdmlsh_v4i32: + %prod = call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs) + %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{q[0-9]+}} + ret <4 x i32> %retval +} + +;----------------------------------------------------------------------------- +; RDMA Scalar + +define <4 x i16> @test_vqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_vqrdmlah_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.arm.neon.vqadds.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK: vqrdmlah.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_vqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_vqrdmlahq_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.arm.neon.vqadds.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK: vqrdmlah.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] + ret <8 x i16> %retval +} + +define <2 x i32> @test_vqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: 
test_vqrdmlah_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> + %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.arm.neon.vqadds.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK: vqrdmlah.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_vqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_vqrdmlahq_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.arm.neon.vqadds.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK: vqrdmlah.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] + ret <4 x i32> %retval +} + +define <4 x i16> @test_vqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_vqrdmlsh_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> + %prod = call <4 x i16> @llvm.arm.neon.vqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle) + %retval = call <4 x i16> @llvm.arm.neon.vqsubs.v4i16(<4 x i16> %acc, <4 x i16> %prod) +; CHECK: vqrdmlsh.s16 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[3] + ret <4 x i16> %retval +} + +define <8 x i16> @test_vqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <4 x i16> %v) { +; CHECK-LABEL: test_vqrdmlshq_lane_s16: +entry: + %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <8 x i32> + %prod = call <8 x i16> @llvm.arm.neon.vqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle) + %retval = call <8 x i16> @llvm.arm.neon.vqsubs.v8i16(<8 x i16> %acc, <8 x i16> %prod) +; CHECK: vqrdmlsh.s16 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[2] + ret <8 x i16> %retval +} + +define <2 x i32> @test_vqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_vqrdmlsh_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, 
<2 x i32> undef, <2 x i32> + %prod = tail call <2 x i32> @llvm.arm.neon.vqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle) + %retval = call <2 x i32> @llvm.arm.neon.vqsubs.v2i32(<2 x i32> %acc, <2 x i32> %prod) +; CHECK: vqrdmlsh.s32 {{d[0-9]+}}, {{d[0-9]+}}, {{d[0-9]+}}[1] + ret <2 x i32> %retval +} + +define <4 x i32> @test_vqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <2 x i32> %v) { +; CHECK-LABEL: test_vqrdmlshq_lane_s32: +entry: + %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <4 x i32> zeroinitializer + %prod = tail call <4 x i32> @llvm.arm.neon.vqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle) + %retval = call <4 x i32> @llvm.arm.neon.vqsubs.v4i32(<4 x i32> %acc, <4 x i32> %prod) +; CHECK: vqrdmlsh.s32 {{q[0-9]+}}, {{q[0-9]+}}, {{d[0-9]+}}[0] + ret <4 x i32> %retval +} Index: test/MC/AArch64/armv8-extension-atomic.s =================================================================== --- /dev/null +++ test/MC/AArch64/armv8-extension-atomic.s @@ -0,0 +1,163 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s +// RUN: FileCheck --check-prefix=CHECK-ERROR <%t %s + .text + + //8 bits + casb w0, w1, [x2] + casab w0, w1, [x2] + caslb w0, w1, [x2] + casalb w0, w1, [x2] + +//CHECK: casb w0, w1, [x2] // encoding: [0x41,0x7c,0xa0,0x08] +//CHECK: casab w0, w1, [x2] // encoding: [0x41,0x7c,0xe0,0x08] +//CHECK: caslb w0, w1, [x2] // encoding: [0x41,0xfc,0xa0,0x08] +//CHECK: casalb w0, w1, [x2] // encoding: [0x41,0xfc,0xe0,0x08] + + casb w0, w1, [w2] + casalb x0, x1, [x2] +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: casb w0, w1, [w2] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: casalb x0, x1, [x2] +//CHECK-ERROR: ^ + + //16 bits + cash w0, w1, [x2] + casah w0, w1, [x2] + caslh w0, w1, [x2] + casalh w0, w1, [x2] + +//CHECK: cash w0, w1, [x2] // encoding: [0x41,0x7c,0xa0,0x48] +//CHECK: casah w0, w1, [x2] // encoding: [0x41,0x7c,0xe0,0x48] 
+//CHECK: caslh w0, w1, [x2] // encoding: [0x41,0xfc,0xa0,0x48] +//CHECK: casalh w0, w1, [x2] // encoding: [0x41,0xfc,0xe0,0x48] + + //32 bits + cas w0, w1, [x2] + casa w0, w1, [x2] + casl w0, w1, [x2] + casal w0, w1, [x2] + +//CHECK: cas w0, w1, [x2] // encoding: [0x41,0x7c,0xa0,0x88] +//CHECK: casa w0, w1, [x2] // encoding: [0x41,0x7c,0xe0,0x88] +//CHECK: casl w0, w1, [x2] // encoding: [0x41,0xfc,0xa0,0x88] +//CHECK: casal w0, w1, [x2] // encoding: [0x41,0xfc,0xe0,0x88] + + cas w0, w1, [w2] + casl w0, x1, [x2] + +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: cas w0, w1, [w2] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: casl w0, x1, [x2] +//CHECK-ERROR: ^ + + //64 bits + cas x0, x1, [x2] + casa x0, x1, [x2] + casl x0, x1, [x2] + casal x0, x1, [x2] + +//CHECK: cas x0, x1, [x2] // encoding: [0x41,0x7c,0xa0,0xc8] +//CHECK: casa x0, x1, [x2] // encoding: [0x41,0x7c,0xe0,0xc8] +//CHECK: casl x0, x1, [x2] // encoding: [0x41,0xfc,0xa0,0xc8] +//CHECK: casal x0, x1, [x2] // encoding: [0x41,0xfc,0xe0,0xc8] + + casa x0, x1, [w2] + casal x0, w1, [x2] + +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: casa x0, x1, [w2] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: casal x0, w1, [x2] +//CHECK-ERROR: ^ + + // LD instructions + ldadda x0, x1, [x2] + ldclrl x0, x1, [x2] + ldeoral x0, x1, [x2] + ldset x0, x1, [x2] + ldsmaxa w0, w1, [x2] + ldsminlb w0, w1, [x2] + ldumaxalh w0, w1, [x2] + ldumin w0, w1, [x2] +//CHECK: ldadda x0, x1, [x2] // encoding: [0x41,0x00,0xa0,0xf8] +//CHECK: ldclrl x0, x1, [x2] // encoding: [0x41,0x10,0x60,0xf8] +//CHECK: ldeoral x0, x1, [x2] // encoding: [0x41,0x20,0xe0,0xf8] +//CHECK: ldset x0, x1, [x2] // encoding: [0x41,0x30,0x20,0xf8] +//CHECK: ldsmaxa w0, w1, [x2] // encoding: [0x41,0x40,0xa0,0xb8] +//CHECK: ldsminlb w0, w1, [x2] // encoding: [0x41,0x50,0x60,0x38] +//CHECK: ldumaxalh w0, w1, [x2] // encoding: 
[0x41,0x60,0xe0,0x78] +//CHECK: ldumin w0, w1, [x2] // encoding: [0x41,0x70,0x20,0xb8] + + // ST instructions: aliases to LD + staddlb w0, [x2] + stclrlh w0, [x2] + steorl w0, [x2] + stsetl x0, [x2] + stsmaxb w0, [x2] + stsminh w0, [x2] + stumax w0, [x2] + stumin x0, [x2] +//CHECK: staddlb w0, [x2] // encoding: [0x5f,0x00,0x60,0x38] +//CHECK: stclrlh w0, [x2] // encoding: [0x5f,0x10,0x60,0x78] +//CHECK: steorl w0, [x2] // encoding: [0x5f,0x20,0x60,0xb8] +//CHECK: stsetl x0, [x2] // encoding: [0x5f,0x30,0x60,0xf8] +//CHECK: stsmaxb w0, [x2] // encoding: [0x5f,0x40,0x20,0x38] +//CHECK: stsminh w0, [x2] // encoding: [0x5f,0x50,0x20,0x78] +//CHECK: stumax w0, [x2] // encoding: [0x5f,0x60,0x20,0xb8] +//CHECK: stumin x0, [x2] // encoding: [0x5f,0x70,0x20,0xf8] + + ldsmax x0, x1, [w2] + ldeorl w0, w1, [w2] +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: ldsmax x0, x1, [w2] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: ldeorl w0, w1, [w2] +//CHECK-ERROR: ^ + + //SWP instruction + swp x0, x1, [x2] + swpb w0, w1, [x2] + swplh w0, w1, [x2] + swpal x0, x1, [sp] +//CHECK: swp x0, x1, [x2] // encoding: [0x41,0x80,0x20,0xf8] +//CHECK: swpb w0, w1, [x2] // encoding: [0x41,0x80,0x20,0x38] +//CHECK: swplh w0, w1, [x2] // encoding: [0x41,0x80,0x60,0x78] +//CHECK: swpal x0, x1, [sp] // encoding: [0xe1,0x83,0xe0,0xf8] + + swp x0, x1, [w2] + swp x0, x1, [xzr] +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: swp x0, x1, [w2] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: swp x0, x1, [xzr] +//CHECK-ERROR: ^ + + //CASP instruction + casp x0, x1, x2, x3, [x4] + casp w0, w1, w2, w3, [x4] +//CHECK: casp x0, x1, x2, x3, [x4] // encoding: [0x82,0x7c,0x20,0x48] +//CHECK: casp w0, w1, w2, w3, [x4] // encoding: [0x82,0x7c,0x20,0x08] + + casp x1, x2, x4, x5, [x6] + casp x0, x1, x3, x4, [x5] + casp x0, x2, x4, x5, [x6] + casp x0, x1, x2, x4, [x5] +//CHECK-ERROR: error: 
Expected even register +//CHECK-ERROR: casp x1, x2, x4, x5, [x6] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: Expected even register +//CHECK-ERROR: casp x0, x1, x3, x4, [x5] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: Expected consecutive registers +//CHECK-ERROR: casp x0, x2, x4, x5, [x6] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: Expected consecutive registers +//CHECK-ERROR: casp x0, x1, x2, x4, [x5] +//CHECK-ERROR: ^ Index: test/MC/AArch64/armv8-extension-lor.s =================================================================== --- /dev/null +++ test/MC/AArch64/armv8-extension-lor.s @@ -0,0 +1,33 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.1a < %s | FileCheck %s + + +//------------------------------------------------------------------------------ +// Load acquire / store release +//------------------------------------------------------------------------------ + ldlarb w0,[x1] + ldlarh w0,[x1] + ldlar w0,[x1] + ldlar x0,[x1] +// CHECK: ldlarb w0, [x1] // encoding: [0x20,0x7c,0xdf,0x08] +// CHECK: ldlarh w0, [x1] // encoding: [0x20,0x7c,0xdf,0x48] +// CHECK: ldlar w0, [x1] // encoding: [0x20,0x7c,0xdf,0x88] +// CHECK: ldlar x0, [x1] // encoding: [0x20,0x7c,0xdf,0xc8] + stllrb w0,[x1] + stllrh w0,[x1] + stllr w0,[x1] + stllr x0,[x1] +// CHECK: stllrb w0, [x1] // encoding: [0x20,0x7c,0x9f,0x08] +// CHECK: stllrh w0, [x1] // encoding: [0x20,0x7c,0x9f,0x48] +// CHECK: stllr w0, [x1] // encoding: [0x20,0x7c,0x9f,0x88] +// CHECK: stllr x0, [x1] // encoding: [0x20,0x7c,0x9f,0xc8] + + msr LORSA_EL1, x0 + msr LOREA_EL1, x0 + msr LORN_EL1, x0 + msr LORC_EL1, x0 + mrs x0, LORID_EL1 +// CHECK: msr LORSA_EL1, x0 // encoding: [0x00,0xa4,0x18,0xd5] +// CHECK: msr LOREA_EL1, x0 // encoding: [0x20,0xa4,0x18,0xd5] +// CHECK: msr LORN_EL1, x0 // encoding: [0x40,0xa4,0x18,0xd5] +// CHECK: msr LORC_EL1, x0 // encoding: [0x60,0xa4,0x18,0xd5] +// CHECK: mrs x0, LORID_EL1 // encoding: [0xe0,0xa4,0x38,0xd5] Index: test/MC/AArch64/armv8-extension-pan.s 
=================================================================== --- /dev/null +++ test/MC/AArch64/armv8-extension-pan.s @@ -0,0 +1,30 @@ +//RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s +//RUN: FileCheck --check-prefix=CHECK-ERROR %s < %t + + .text + + msr pan, #0 +//CHECK: msr PAN, #0 // encoding: [0x9f,0x40,0x00,0xd5] + msr pan, #1 +//CHECK: msr PAN, #1 // encoding: [0x9f,0x41,0x00,0xd5] + msr pan, x5 +//CHECK: msr PAN, x5 // encoding: [0x65,0x42,0x18,0xd5] + mrs x13, pan +//CHECK: mrs x13, PAN // encoding: [0x6d,0x42,0x38,0xd5] + + msr pan, #-1 + msr pan, #20 + msr pan, w0 + mrs w0, pan +//CHECK-ERROR: error: immediate must be an integer in range [0, 15]. +//CHECK-ERROR: msr pan, #-1 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: immediate must be an integer in range [0, 15]. +//CHECK-ERROR: msr pan, #20 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: immediate must be an integer in range [0, 15]. +//CHECK-ERROR: msr pan, w0 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: mrs w0, pan +//CHECK-ERROR: ^ Index: test/MC/AArch64/armv8-extension-rdma.s =================================================================== --- /dev/null +++ test/MC/AArch64/armv8-extension-rdma.s @@ -0,0 +1,154 @@ +// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s +// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s + .text + + //AdvSIMD RDMA vector + sqrdmlah v0.4h, v1.4h, v2.4h + sqrdmlsh v0.4h, v1.4h, v2.4h + sqrdmlah v0.2s, v1.2s, v2.2s + sqrdmlsh v0.2s, v1.2s, v2.2s + sqrdmlah v0.4s, v1.4s, v2.4s + sqrdmlsh v0.4s, v1.4s, v2.4s + sqrdmlah v0.8h, v1.8h, v2.8h + sqrdmlsh v0.8h, v1.8h, v2.8h +// CHECK: sqrdmlah v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x42,0x2e] +// CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h // encoding: [0x20,0x8c,0x42,0x2e] +// CHECK: sqrdmlah v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0x82,0x2e] +// CHECK: sqrdmlsh v0.2s, v1.2s, 
v2.2s // encoding: [0x20,0x8c,0x82,0x2e] +// CHECK: sqrdmlah v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0x82,0x6e] +// CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s // encoding: [0x20,0x8c,0x82,0x6e] +// CHECK: sqrdmlah v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x42,0x6e] +// CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h // encoding: [0x20,0x8c,0x42,0x6e] + + sqrdmlah v0.2h, v1.2h, v2.2h + sqrdmlsh v0.2h, v1.2h, v2.2h + sqrdmlah v0.8s, v1.8s, v2.8s + sqrdmlsh v0.8s, v1.8s, v2.8s + sqrdmlah v0.2s, v1.4h, v2.8h + sqrdmlsh v0.4s, v1.8h, v2.2s +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.2h, v1.2h, v2.2h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid 
vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid vector kind qualifier +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.8s, v1.8s, v2.8s +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.2s, v1.4h, v2.8h +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.4s, v1.8h, v2.2s +// CHECK-ERROR: ^ + + //AdvSIMD RDMA scalar + sqrdmlah h0, h1, h2 + sqrdmlsh h0, h1, h2 + sqrdmlah s0, s1, s2 + sqrdmlsh s0, s1, s2 +// CHECK: sqrdmlah h0, h1, h2 // encoding: [0x20,0x84,0x42,0x7e] +// CHECK: sqrdmlsh h0, h1, h2 // encoding: [0x20,0x8c,0x42,0x7e] +// CHECK: sqrdmlah s0, s1, s2 // encoding: [0x20,0x84,0x82,0x7e] +// CHECK: sqrdmlsh s0, s1, s2 // encoding: [0x20,0x8c,0x82,0x7e] + + //AdvSIMD RDMA vector by-element + sqrdmlah v0.4h, v1.4h, v2.h[3] + sqrdmlsh v0.4h, v1.4h, v2.h[3] + sqrdmlah v0.2s, v1.2s, v2.s[1] + sqrdmlsh v0.2s, v1.2s, v2.s[1] + sqrdmlah v0.8h, v1.8h, v2.h[3] + sqrdmlsh v0.8h, v1.8h, v2.h[3] + sqrdmlah v0.4s, v1.4s, v2.s[3] + sqrdmlsh v0.4s, v1.4s, v2.s[3] +// CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x2f] +// CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x2f] +// CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xd0,0xa2,0x2f] +// CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1] // encoding: [0x20,0xf0,0xa2,0x2f] +// CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xd0,0x72,0x6f] +// CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3] // encoding: [0x20,0xf0,0x72,0x6f] +// CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x6f] +// CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x6f] + + sqrdmlah 
v0.4s, v1.2s, v2.s[1] + sqrdmlsh v0.2s, v1.2d, v2.s[1] + sqrdmlah v0.8h, v1.8h, v2.s[3] + sqrdmlsh v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.4s, v1.2s, v2.s[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh v0.2s, v1.2d, v2.s[1] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah v0.8h, v1.8h, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: vector lane must be an integer in range [0, 7]. +// CHECK-ERROR: sqrdmlsh v0.8h, v1.8h, v2.h[8] +// CHECK-ERROR: ^ + + //AdvSIMD RDMA scalar by-element + sqrdmlah h0, h1, v2.h[3] + sqrdmlsh h0, h1, v2.h[3] + sqrdmlah s0, s1, v2.s[3] + sqrdmlsh s0, s1, v2.s[3] +// CHECK: sqrdmlah h0, h1, v2.h[3] // encoding: [0x20,0xd0,0x72,0x7f] +// CHECK: sqrdmlsh h0, h1, v2.h[3] // encoding: [0x20,0xf0,0x72,0x7f] +// CHECK: sqrdmlah s0, s1, v2.s[3] // encoding: [0x20,0xd8,0xa2,0x7f] +// CHECK: sqrdmlsh s0, s1, v2.s[3] // encoding: [0x20,0xf8,0xa2,0x7f] + + sqrdmlah b0, h1, v2.h[3] + sqrdmlah s0, d1, v2.s[3] + sqrdmlsh h0, h1, v2.s[3] + sqrdmlsh s0, s1, v2.s[4] +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah b0, h1, v2.h[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlah s0, d1, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: invalid operand for instruction +// CHECK-ERROR: sqrdmlsh h0, h1, v2.s[3] +// CHECK-ERROR: ^ +// CHECK-ERROR: error: vector lane must be an integer in range [0, 3]. 
+// CHECK-ERROR: sqrdmlsh s0, s1, v2.s[4] +// CHECK-ERROR: ^ Index: test/MC/AArch64/armv8-extension-vhe.s =================================================================== --- /dev/null +++ test/MC/AArch64/armv8-extension-vhe.s @@ -0,0 +1,61 @@ +// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mattr=+v8.1a < %s | FileCheck %s + + +//------------------------------------------------------------------------------ +// Virtualization Host Extensions +//------------------------------------------------------------------------------ + msr TTBR1_EL2, x0 + msr CONTEXTIDR_EL2, x0 + msr CNTHV_TVAL_EL2, x0 + msr CNTHV_CVAL_EL2, x0 + msr CNTHV_CTL_EL2, x0 + msr SCTLR_EL12, x0 + msr CPACR_EL12, x0 + msr TTBR0_EL12, x0 + msr TTBR1_EL12, x0 + msr TCR_EL12, x0 + msr AFSR0_EL12, x0 + msr AFSR1_EL12, x0 + msr ESR_EL12, x0 + msr FAR_EL12, x0 + msr MAIR_EL12, x0 + msr AMAIR_EL12, x0 + msr VBAR_EL12, x0 + msr CONTEXTIDR_EL12, x0 + msr CNTKCTL_EL12, x0 + msr CNTP_TVAL_EL02, x0 + msr CNTP_CTL_EL02, x0 + msr CNTP_CVAL_EL02, x0 + msr CNTV_TVAL_EL02, x0 + msr CNTV_CTL_EL02, x0 + msr CNTV_CVAL_EL02, x0 + msr SPSR_EL12, x0 + msr ELR_EL12, x0 + +// CHECK: msr TTBR1_EL2, x0 // encoding: [0x20,0x20,0x1c,0xd5] +// CHECK: msr CONTEXTIDR_EL2, x0 // encoding: [0x20,0xd0,0x1c,0xd5] +// CHECK: msr CNTHV_TVAL_EL2, x0 // encoding: [0x00,0xe3,0x1c,0xd5] +// CHECK: msr CNTHV_CVAL_EL2, x0 // encoding: [0x40,0xe3,0x1c,0xd5] +// CHECK: msr CNTHV_CTL_EL2, x0 // encoding: [0x20,0xe3,0x1c,0xd5] +// CHECK: msr SCTLR_EL12, x0 // encoding: [0x00,0x10,0x1d,0xd5] +// CHECK: msr CPACR_EL12, x0 // encoding: [0x40,0x10,0x1d,0xd5] +// CHECK: msr TTBR0_EL12, x0 // encoding: [0x00,0x20,0x1d,0xd5] +// CHECK: msr TTBR1_EL12, x0 // encoding: [0x20,0x20,0x1d,0xd5] +// CHECK: msr TCR_EL12, x0 // encoding: [0x40,0x20,0x1d,0xd5] +// CHECK: msr AFSR0_EL12, x0 // encoding: [0x00,0x51,0x1d,0xd5] +// CHECK: msr AFSR1_EL12, x0 // encoding: [0x20,0x51,0x1d,0xd5] +// CHECK: msr ESR_EL12, x0 // encoding: 
[0x00,0x52,0x1d,0xd5] +// CHECK: msr FAR_EL12, x0 // encoding: [0x00,0x60,0x1d,0xd5] +// CHECK: msr MAIR_EL12, x0 // encoding: [0x00,0xa2,0x1d,0xd5] +// CHECK: msr AMAIR_EL12, x0 // encoding: [0x00,0xa3,0x1d,0xd5] +// CHECK: msr VBAR_EL12, x0 // encoding: [0x00,0xc0,0x1d,0xd5] +// CHECK: msr CONTEXTIDR_EL12, x0 // encoding: [0x20,0xd0,0x1d,0xd5] +// CHECK: msr CNTKCTL_EL12, x0 // encoding: [0x00,0xe1,0x1d,0xd5] +// CHECK: msr CNTP_TVAL_EL02, x0 // encoding: [0x00,0xe2,0x1d,0xd5] +// CHECK: msr CNTP_CTL_EL02, x0 // encoding: [0x20,0xe2,0x1d,0xd5] +// CHECK: msr CNTP_CVAL_EL02, x0 // encoding: [0x40,0xe2,0x1d,0xd5] +// CHECK: msr CNTV_TVAL_EL02, x0 // encoding: [0x00,0xe3,0x1d,0xd5] +// CHECK: msr CNTV_CTL_EL02, x0 // encoding: [0x20,0xe3,0x1d,0xd5] +// CHECK: msr CNTV_CVAL_EL02, x0 // encoding: [0x40,0xe3,0x1d,0xd5] +// CHECK: msr SPSR_EL12, x0 // encoding: [0x00,0x40,0x1d,0xd5] +// CHECK: msr ELR_EL12, x0 // encoding: [0x20,0x40,0x1d,0xd5] Index: test/MC/ARM/basic-arm-instructions-v8.1a.s =================================================================== --- /dev/null +++ test/MC/ARM/basic-arm-instructions-v8.1a.s @@ -0,0 +1,206 @@ +//RUN: not llvm-mc -triple thumb-none-linux-gnu -mattr=+v8.1a -mattr=neon -show-encoding < %s 2>%t | FileCheck %s --check-prefix=CHECK-V81aTHUMB +//RUN: FileCheck --check-prefix=CHECK-ERROR <%t %s +//RUN: not llvm-mc -triple arm-none-linux-gnu -mattr=+v8.1a -mattr=neon -show-encoding < %s 2>%t | FileCheck %s --check-prefix=CHECK-V81aARM +//RUN: FileCheck --check-prefix=CHECK-ERROR <%t %s + +//RUN: not llvm-mc -triple thumb-none-linux-gnu -mattr=+v8 -mattr=neon -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V8 +//RUN: not llvm-mc -triple arm-none-linux-gnu -mattr=+v8 -mattr=neon -show-encoding < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V8 + + + .text +//CHECK-V8THUMB: .text + + vqrdmlah.i8 q0, q1, q2 + vqrdmlah.u16 d0, d1, d2 + vqrdmlsh.f32 q3, q4, q5 + vqrdmlsh.f64 d3, d5, d5 + +//CHECK-ERROR: error: invalid operand 
for instruction +//CHECK-ERROR: vqrdmlah.i8 q0, q1, q2 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlah.u16 d0, d1, d2 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlsh.f32 q3, q4, q5 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlsh.f64 d3, d5, d5 +//CHECK-ERROR: ^ +//CHECK-V8: error: invalid operand for instruction +//CHECK-V8: vqrdmlah.i8 q0, q1, q2 +//CHECK-V8: ^ +//CHECK-V8: error: invalid operand for instruction +//CHECK-V8: vqrdmlah.u16 d0, d1, d2 +//CHECK-V8: ^ +//CHECK-V8: error: invalid operand for instruction +//CHECK-V8: vqrdmlsh.f32 q3, q4, q5 +//CHECK-V8: ^ +//CHECK-V8: error: invalid operand for instruction +//CHECK-V8 vqrdmlsh.f64 d3, d5, d5 +//CHECK-V8: ^ + + vqrdmlah.s16 d0, d1, d2 +//CHECK-V81aARM: vqrdmlah.s16 d0, d1, d2 @ encoding: [0x12,0x0b,0x11,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s16 d0, d1, d2 @ encoding: [0x11,0xff,0x12,0x0b] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s16 d0, d1, d2 +//CHECK-V8: ^ + + vqrdmlah.s32 d0, d1, d2 +//CHECK-V81aARM: vqrdmlah.s32 d0, d1, d2 @ encoding: [0x12,0x0b,0x21,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s32 d0, d1, d2 @ encoding: [0x21,0xff,0x12,0x0b] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s32 d0, d1, d2 +//CHECK-V8: ^ + + vqrdmlah.s16 q0, q1, q2 +//CHECK-V81aARM: vqrdmlah.s16 q0, q1, q2 @ encoding: [0x54,0x0b,0x12,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s16 q0, q1, q2 @ encoding: [0x12,0xff,0x54,0x0b] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s16 q0, q1, q2 +//CHECK-V8: ^ + + vqrdmlah.s32 q2, q3, q0 +//CHECK-V81aARM: vqrdmlah.s32 q2, q3, q0 @ encoding: [0x50,0x4b,0x26,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s32 q2, q3, q0 @ encoding: [0x26,0xff,0x50,0x4b] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s32 q2, q3, q0 +//CHECK-V8: ^ + + + vqrdmlsh.s16 d7, d6, d5 
+//CHECK-V81aARM: vqrdmlsh.s16 d7, d6, d5 @ encoding: [0x15,0x7c,0x16,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s16 d7, d6, d5 @ encoding: [0x16,0xff,0x15,0x7c] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s16 d7, d6, d5 +//CHECK-V8: ^ + + vqrdmlsh.s32 d0, d1, d2 +//CHECK-V81aARM: vqrdmlsh.s32 d0, d1, d2 @ encoding: [0x12,0x0c,0x21,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s32 d0, d1, d2 @ encoding: [0x21,0xff,0x12,0x0c] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s32 d0, d1, d2 +//CHECK-V8: ^ + + vqrdmlsh.s16 q0, q1, q2 +//CHECK-V81aARM: vqrdmlsh.s16 q0, q1, q2 @ encoding: [0x54,0x0c,0x12,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s16 q0, q1, q2 @ encoding: [0x12,0xff,0x54,0x0c] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s16 q0, q1, q2 +//CHECK-V8: ^ + + vqrdmlsh.s32 q3, q4, q5 +//CHECK-V81aARM: vqrdmlsh.s32 q3, q4, q5 @ encoding: [0x5a,0x6c,0x28,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s32 q3, q4, q5 @ encoding: [0x28,0xff,0x5a,0x6c] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s32 q3, q4, q5 +//CHECK-V8: ^ + + + vqrdmlah.i8 q0, q1, d9[7] + vqrdmlah.u16 d0, d1, d2[3] + vqrdmlsh.f32 q3, q4, d5[1] + vqrdmlsh.f64 d3, d5, d5[0] + +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlah.i8 q0, q1, d9[7] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlah.u16 d0, d1, d2[3] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlsh.f32 q3, q4, d5[1] +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: vqrdmlsh.f64 d3, d5, d5[0] +//CHECK-ERROR: ^ + + vqrdmlah.s16 d0, d1, d2[0] +//CHECK-V81aARM: vqrdmlah.s16 d0, d1, d2[0] @ encoding: [0x42,0x0e,0x91,0xf2] +//CHECK-V81aTHUMB: vqrdmlah.s16 d0, d1, d2[0] @ encoding: [0x91,0xef,0x42,0x0e] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s16 d0, d1, d2[0] +//CHECK-V8: ^ + + vqrdmlah.s32 d0, 
d1, d2[0] +//CHECK-V81aARM: vqrdmlah.s32 d0, d1, d2[0] @ encoding: [0x42,0x0e,0xa1,0xf2] +//CHECK-V81aTHUMB: vqrdmlah.s32 d0, d1, d2[0] @ encoding: [0xa1,0xef,0x42,0x0e] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s32 d0, d1, d2[0] +//CHECK-V8: ^ + + vqrdmlah.s16 q0, q1, d2[0] +//CHECK-V81aARM: vqrdmlah.s16 q0, q1, d2[0] @ encoding: [0x42,0x0e,0x92,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s16 q0, q1, d2[0] @ encoding: [0x92,0xff,0x42,0x0e] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s16 q0, q1, d2[0] +//CHECK-V8: ^ + + vqrdmlah.s32 q0, q1, d2[0] +//CHECK-V81aARM: vqrdmlah.s32 q0, q1, d2[0] @ encoding: [0x42,0x0e,0xa2,0xf3] +//CHECK-V81aTHUMB: vqrdmlah.s32 q0, q1, d2[0] @ encoding: [0xa2,0xff,0x42,0x0e] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlah.s32 q0, q1, d2[0] +//CHECK-V8: ^ + + + vqrdmlsh.s16 d0, d1, d2[0] +//CHECK-V81aARM: vqrdmlsh.s16 d0, d1, d2[0] @ encoding: [0x42,0x0f,0x91,0xf2] +//CHECK-V81aTHUMB: vqrdmlsh.s16 d0, d1, d2[0] @ encoding: [0x91,0xef,0x42,0x0f] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s16 d0, d1, d2[0] +//CHECK-V8: ^ + + vqrdmlsh.s32 d0, d1, d2[0] +//CHECK-V81aARM: vqrdmlsh.s32 d0, d1, d2[0] @ encoding: [0x42,0x0f,0xa1,0xf2] +//CHECK-V81aTHUMB: vqrdmlsh.s32 d0, d1, d2[0] @ encoding: [0xa1,0xef,0x42,0x0f] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s32 d0, d1, d2[0] +//CHECK-V8: ^ + + vqrdmlsh.s16 q0, q1, d2[0] +//CHECK-V81aARM: vqrdmlsh.s16 q0, q1, d2[0] @ encoding: [0x42,0x0f,0x92,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s16 q0, q1, d2[0] @ encoding: [0x92,0xff,0x42,0x0f] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: vqrdmlsh.s16 q0, q1, d2[0] +//CHECK-V8: ^ + + vqrdmlsh.s32 q0, q1, d2[0] +//CHECK-V81aARM: vqrdmlsh.s32 q0, q1, d2[0] @ encoding: [0x42,0x0f,0xa2,0xf3] +//CHECK-V81aTHUMB: vqrdmlsh.s32 q0, q1, d2[0] @ encoding: [0xa2,0xff,0x42,0x0f] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: 
vqrdmlsh.s32 q0, q1, d2[0] +//CHECK-V8: ^ + + setpan #0 +//CHECK-V81aTHUMB: setpan #0 @ encoding: [0x10,0xb6] +//CHECK-V81aARM: setpan #0 @ encoding: [0x00,0x00,0x10,0xf1] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: setpan #0 +//CHECK-V8: ^ + + setpan #1 +//CHECK-V81aTHUMB: setpan #1 @ encoding: [0x18,0xb6] +//CHECK-V81aARM: setpan #1 @ encoding: [0x00,0x02,0x10,0xf1] +//CHECK-V8: error: instruction requires: v8.1a +//CHECK-V8: setpan #1 +//CHECK-V8: ^ + setpan + setpan #-1 + setpan #2 +//CHECK-ERROR: error: too few operands for instruction +//CHECK-ERROR: setpan +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: setpan #-1 +//CHECK-ERROR: ^ +//CHECK-ERROR: error: invalid operand for instruction +//CHECK-ERROR: setpan #2 +//CHECK-ERROR: ^ + + it eq + setpaneq #0 +//CHECK-THUMB-ERROR: error: instruction 'setpan' is not predicable, but condition code specified +//CHECK-THUMB-ERROR: setpaneq #0 +//CHECK-THUMB-ERROR: ^ Index: test/MC/Disassembler/AArch64/armv8-extension-atomic.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AArch64/armv8-extension-atomic.txt @@ -0,0 +1,83 @@ +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s | FileCheck %s + +0x41,0x7c,0xa0,0x08 +0x41,0x7c,0xe0,0x08 +0x41,0xfc,0xa0,0x08 +0x41,0xfc,0xe0,0x08 +0x41,0x7c,0xa0,0x48 +0x41,0x7c,0xe0,0x48 +0x41,0xfc,0xa0,0x48 +0x41,0xfc,0xe0,0x48 +# CHECK: casb w0, w1, [x2] +# CHECK: casab w0, w1, [x2] +# CHECK: caslb w0, w1, [x2] +# CHECK: casalb w0, w1, [x2] +# CHECK: cash w0, w1, [x2] +# CHECK: casah w0, w1, [x2] +# CHECK: caslh w0, w1, [x2] +# CHECK: casalh w0, w1, [x2] + +0x41,0x7c,0xa0,0x88 +0x41,0x7c,0xe0,0x88 +0x41,0xfc,0xa0,0x88 +0x41,0xfc,0xe0,0x88 +0x41,0x7c,0xa0,0xc8 +0x41,0x7c,0xe0,0xc8 +0x41,0xfc,0xa0,0xc8 +0x41,0xfc,0xe0,0xc8 +# CHECK: cas w0, w1, [x2] +# CHECK: casa w0, w1, [x2] +# CHECK: casl w0, w1, [x2] +# CHECK: casal w0, w1, [x2] +# CHECK: cas 
x0, x1, [x2] +# CHECK: casa x0, x1, [x2] +# CHECK: casl x0, x1, [x2] +# CHECK: casal x0, x1, [x2] + +0x41,0x80,0x20,0xf8 +0x41,0x80,0x20,0x38 +0x41,0x80,0x60,0x78 +0xe1,0x83,0xe0,0xf8 +# CHECK: swp x0, x1, [x2] +# CHECK: swpb w0, w1, [x2] +# CHECK: swplh w0, w1, [x2] +# CHECK: swpal x0, x1, [sp] + +0x41,0x00,0xa0,0xf8 +0x41,0x10,0x60,0xf8 +0x41,0x20,0xe0,0xf8 +0x41,0x30,0x20,0xf8 +0x41,0x40,0xa0,0xb8 +0x41,0x50,0x60,0x38 +0x41,0x60,0xe0,0x78 +0x41,0x70,0x20,0xb8 +# CHECK: ldadda x0, x1, [x2] +# CHECK: ldclrl x0, x1, [x2] +# CHECK: ldeoral x0, x1, [x2] +# CHECK: ldset x0, x1, [x2] +# CHECK: ldsmaxa w0, w1, [x2] +# CHECK: ldsminlb w0, w1, [x2] +# CHECK: ldumaxalh w0, w1, [x2] +# CHECK: ldumin w0, w1, [x2] + +0x5f,0x00,0x60,0x38 +0x5f,0x10,0x60,0x78 +0x5f,0x20,0x60,0xb8 +0x5f,0x30,0x60,0xf8 +0x5f,0x40,0x20,0x38 +0x5f,0x50,0x20,0x78 +0x5f,0x60,0x20,0xb8 +0x5f,0x70,0x20,0xf8 +# CHECK: staddlb w0, [x2] +# CHECK: stclrlh w0, [x2] +# CHECK: steorl w0, [x2] +# CHECK: stsetl x0, [x2] +# CHECK: stsmaxb w0, [x2] +# CHECK: stsminh w0, [x2] +# CHECK: stumax w0, [x2] +# CHECK: stumin x0, [x2] + +0x82,0x7c,0x20,0x48 +0x82,0x7c,0x20,0x08 +# CHECK: casp x0, x1, x2, x3, [x4] +# CHECK: casp w0, w1, w2, w3, [x4] Index: test/MC/Disassembler/AArch64/armv8-extension-lor.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AArch64/armv8-extension-lor.txt @@ -0,0 +1,28 @@ +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s | FileCheck %s + +0x20,0x7c,0xdf,0x08 +0x20,0x7c,0xdf,0x48 +0x20,0x7c,0xdf,0x88 +0x20,0x7c,0xdf,0xc8 +0x20,0x7c,0x9f,0x08 +0x20,0x7c,0x9f,0x48 +0x20,0x7c,0x9f,0x88 +0x20,0x7c,0x9f,0xc8 +# CHECK: ldlarb w0, [x1] +# CHECK: ldlarh w0, [x1] +# CHECK: ldlar w0, [x1] +# CHECK: ldlar x0, [x1] +# CHECK: stllrb w0, [x1] +# CHECK: stllrh w0, [x1] +# CHECK: stllr w0, [x1] +# CHECK: stllr x0, [x1] +0x00,0xa4,0x18,0xd5 +0x20,0xa4,0x18,0xd5 +0x40,0xa4,0x18,0xd5 +0x60,0xa4,0x18,0xd5 
+0xe0,0xa4,0x38,0xd5 +# CHECK: msr LORSA_EL1, x0 +# CHECK: msr LOREA_EL1, x0 +# CHECK: msr LORN_EL1, x0 +# CHECK: msr LORC_EL1, x0 +# CHECK: mrs x0, LORID_EL1 Index: test/MC/Disassembler/AArch64/armv8-extension-pan.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AArch64/armv8-extension-pan.txt @@ -0,0 +1,10 @@ +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s | FileCheck %s + +0x9f,0x40,0x00,0xd5 +0x9f,0x41,0x00,0xd5 +0x65,0x42,0x18,0xd5 +0x6d,0x42,0x38,0xd5 +# CHECK: msr PAN, #0 +# CHECK: msr PAN, #1 +# CHECK: msr PAN, x5 +# CHECK: mrs x13, PAN Index: test/MC/Disassembler/AArch64/armv8-extension-rdma.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AArch64/armv8-extension-rdma.txt @@ -0,0 +1,129 @@ +# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s + +[0x20,0x84,0x02,0x2e] # sqrdmlah v0.8b, v1.8b, v2.8b +[0x20,0x8c,0x02,0x2e] # sqrdmlsh v0.8b, v1.8b, v2.8b +[0x20,0x84,0xc2,0x2e] # sqrdmlah v0.1d, v1.1d, v2.1d +[0x20,0x8c,0xc2,0x2e] # sqrdmlsh v0.1d, v1.1d, v2.1d +[0x20,0x84,0x02,0x6e] # sqrdmlah v0.16b, v1.16b, v2.16b +[0x20,0x8c,0x02,0x6e] # sqrdmlsh v0.16b, v1.16b, v2.16b +[0x20,0x84,0xc2,0x6e] # sqrdmlah v0.2d, v1.2d, v2.2d +[0x20,0x8c,0xc2,0x6e] # sqrdmlsh v0.2d, v1.2d, v2.2d +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x2e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x6e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x6e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x6e] +# CHECK: warning: invalid 
instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x6e] + +[0x20,0x84,0x02,0x7e] # sqrdmlah b0, b1, b2 +[0x20,0x8c,0x02,0x7e] # sqrdmlsh b0, b1, b2 +[0x20,0x84,0xc2,0x7e] # sqrdmlah d0, d1, d2 +[0x20,0x8c,0xc2,0x7e] # sqrdmlsh d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0x02,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0x02,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x84,0xc2,0x7e] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0x8c,0xc2,0x7e] + +[0x20,0xd0,0x32,0x2f] # sqrdmlah v0.8b, v1.8b, v2.b[3] +[0x20,0xf0,0x32,0x2f] # sqrdmlsh v0.8b, v1.8b, v2.b[3] +[0x20,0xd0,0xe2,0x2f] # sqrdmlah v0.1d, v1.1d, v2.d[1] +[0x20,0xf0,0xe2,0x2f] # sqrdmlsh v0.1d, v1.1d, v2.d[1] +[0x20,0xd0,0x32,0x6f] # sqrdmlah v0.16b, v1.16b, v2.b[3] +[0x20,0xf0,0x32,0x6f] # sqrdmlsh v0.16b, v1.16b, v2.b[3] +[0x20,0xd8,0xe2,0x6f] # sqrdmlah v0.2d, v1.2d, v2.d[3] +[0x20,0xf8,0xe2,0x6f] # sqrdmlsh v0.2d, v1.2d, v2.d[3] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0x32,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0x32,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0xe2,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0xe2,0x2f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0x32,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf0,0x32,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd8,0xe2,0x6f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf8,0xe2,0x6f] + +[0x20,0xd0,0x32,0x7f] # sqrdmlah b0, b1, v2.b[3] +[0x20,0xf0,0x32,0x7f] # sqrdmlsh b0, b1, v2.b[3] +[0x20,0xd8,0xe2,0x7f] # sqrdmlah d0, d1, v2.d[3] +[0x20,0xf8,0xe2,0x7f] # sqrdmlsh d0, d1, v2.d[3] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd0,0x32,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: 
[0x20,0xf0,0x32,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xd8,0xe2,0x7f] +# CHECK: warning: invalid instruction encoding +# CHECK: [0x20,0xf8,0xe2,0x7f] + +[0x20,0x84,0x42,0x2e] +[0x20,0x8c,0x42,0x2e] +[0x20,0x84,0x82,0x2e] +[0x20,0x8c,0x82,0x2e] +[0x20,0x84,0x42,0x6e] +[0x20,0x8c,0x42,0x6e] +[0x20,0x84,0x82,0x6e] +[0x20,0x8c,0x82,0x6e] +# CHECK: sqrdmlah v0.4h, v1.4h, v2.4h +# CHECK: sqrdmlsh v0.4h, v1.4h, v2.4h +# CHECK: sqrdmlah v0.2s, v1.2s, v2.2s +# CHECK: sqrdmlsh v0.2s, v1.2s, v2.2s +# CHECK: sqrdmlah v0.8h, v1.8h, v2.8h +# CHECK: sqrdmlsh v0.8h, v1.8h, v2.8h +# CHECK: sqrdmlah v0.4s, v1.4s, v2.4s +# CHECK: sqrdmlsh v0.4s, v1.4s, v2.4s + +[0x20,0x84,0x42,0x7e] +[0x20,0x8c,0x42,0x7e] +[0x20,0x84,0x82,0x7e] +[0x20,0x8c,0x82,0x7e] +# CHECK: sqrdmlah h0, h1, h2 +# CHECK: sqrdmlsh h0, h1, h2 +# CHECK: sqrdmlah s0, s1, s2 +# CHECK: sqrdmlsh s0, s1, s2 + +0x20,0xd0,0x72,0x2f +0x20,0xf0,0x72,0x2f +0x20,0xd0,0xa2,0x2f +0x20,0xf0,0xa2,0x2f +0x20,0xd0,0x72,0x6f +0x20,0xf0,0x72,0x6f +0x20,0xd8,0xa2,0x6f +0x20,0xf8,0xa2,0x6f +# CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3] +# CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3] +# CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1] +# CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1] +# CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3] +# CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3] +# CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3] +# CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3] + +0x20,0xd0,0x72,0x7f +0x20,0xf0,0x72,0x7f +0x20,0xd8,0xa2,0x7f +0x20,0xf8,0xa2,0x7f +# CHECK: sqrdmlah h0, h1, v2.h[3] +# CHECK: sqrdmlsh h0, h1, v2.h[3] +# CHECK: sqrdmlah s0, s1, v2.s[3] +# CHECK: sqrdmlsh s0, s1, v2.s[3] Index: test/MC/Disassembler/AArch64/armv8-extension-vhe.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/AArch64/armv8-extension-vhe.txt @@ -0,0 +1,56 @@ +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s | FileCheck %s + +0x20,0x20,0x1c,0xd5 +0x20,0xd0,0x1c,0xd5 +0x00,0xe3,0x1c,0xd5 
+0x40,0xe3,0x1c,0xd5 +0x20,0xe3,0x1c,0xd5 +0x00,0x10,0x1d,0xd5 +0x40,0x10,0x1d,0xd5 +0x00,0x20,0x1d,0xd5 +0x20,0x20,0x1d,0xd5 +0x40,0x20,0x1d,0xd5 +0x00,0x51,0x1d,0xd5 +0x20,0x51,0x1d,0xd5 +0x00,0x52,0x1d,0xd5 +0x00,0x60,0x1d,0xd5 +0x00,0xa2,0x1d,0xd5 +0x00,0xa3,0x1d,0xd5 +0x00,0xc0,0x1d,0xd5 +0x20,0xd0,0x1d,0xd5 +0x00,0xe1,0x1d,0xd5 +0x00,0xe2,0x1d,0xd5 +0x20,0xe2,0x1d,0xd5 +0x40,0xe2,0x1d,0xd5 +0x00,0xe3,0x1d,0xd5 +0x20,0xe3,0x1d,0xd5 +0x40,0xe3,0x1d,0xd5 +0x00,0x40,0x1d,0xd5 +0x20,0x40,0x1d,0xd5 +# CHECK: msr TTBR1_EL2, x0 +# CHECK: msr CONTEXTIDR_EL2, x0 +# CHECK: msr CNTHV_TVAL_EL2, x0 +# CHECK: msr CNTHV_CVAL_EL2, x0 +# CHECK: msr CNTHV_CTL_EL2, x0 +# CHECK: msr SCTLR_EL12, x0 +# CHECK: msr CPACR_EL12, x0 +# CHECK: msr TTBR0_EL12, x0 +# CHECK: msr TTBR1_EL12, x0 +# CHECK: msr TCR_EL12, x0 +# CHECK: msr AFSR0_EL12, x0 +# CHECK: msr AFSR1_EL12, x0 +# CHECK: msr ESR_EL12, x0 +# CHECK: msr FAR_EL12, x0 +# CHECK: msr MAIR_EL12, x0 +# CHECK: msr AMAIR_EL12, x0 +# CHECK: msr VBAR_EL12, x0 +# CHECK: msr CONTEXTIDR_EL12, x0 +# CHECK: msr CNTKCTL_EL12, x0 +# CHECK: msr CNTP_TVAL_EL02, x0 +# CHECK: msr CNTP_CTL_EL02, x0 +# CHECK: msr CNTP_CVAL_EL02, x0 +# CHECK: msr CNTV_TVAL_EL02, x0 +# CHECK: msr CNTV_CTL_EL02, x0 +# CHECK: msr CNTV_CVAL_EL02, x0 +# CHECK: msr SPSR_EL12, x0 +# CHECK: msr ELR_EL12, x0 Index: test/MC/Disassembler/ARM/armv8.1a.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/ARM/armv8.1a.txt @@ -0,0 +1,52 @@ +# RUN: llvm-mc -triple armv8 -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V81a +# RUN: not llvm-mc -triple armv8 -mattr=+v8 --disassemble < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V8 + +[0x54,0x0b,0x12,0xf3] +[0x12,0x0b,0x21,0xf3] +[0x54,0x0c,0x12,0xf3] +[0x12,0x0c,0x21,0xf3] +# CHECK-V81a: vqrdmlah.s16 q0, q1, q2 +# CHECK-V81a: vqrdmlah.s32 d0, d1, d2 +# CHECK-V81a: vqrdmlsh.s16 q0, q1, q2 +# CHECK-V81a: vqrdmlsh.s32 d0, d1, d2 +# CHECK-V8: warning: 
invalid instruction encoding +# CHECK-V8: [0x54,0x0b,0x12,0xf3] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x12,0x0b,0x21,0xf3] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x54,0x0c,0x12,0xf3] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x12,0x0c,0x21,0xf3] + +[0x42,0x0e,0x92,0xf3] +[0x42,0x0e,0xa1,0xf2] +[0x42,0x0f,0x92,0xf3] +[0x42,0x0f,0xa1,0xf2] +# CHECK-V81a: vqrdmlah.s16 q0, q1, d2[0] +# CHECK-V81a: vqrdmlah.s32 d0, d1, d2[0] +# CHECK-V81a: vqrdmlsh.s16 q0, q1, d2[0] +# CHECK-V81a: vqrdmlsh.s32 d0, d1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x42,0x0e,0x92,0xf3] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x42,0x0e,0xa1,0xf2] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x42,0x0f,0x92,0xf3] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x42,0x0f,0xa1,0xf2] + +# The SETPAN(v8.1a) and TST(v8) instructions occupy the same space, but SETPAN +# uses the encoding for the invalid NV predicate operand. This test checks that +# the disassembler is correctly disambiguating and decoding these instructions. 
+ +[0x00,0x00,0x10,0xf1] +# CHECK-V81a: setpan #0 + +[0x00,0x02,0x10,0xf1] +# CHECK-V81a: setpan #1 + +[0x00,0x00,0x10,0xe1] +# CHECK-V81a: tst r0, r0 + +[0x00,0x02,0x10,0xe1] +# CHECK-V81a: tst r0, r0, lsl #4 Index: test/MC/Disassembler/ARM/invalid-armv8.1a.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/ARM/invalid-armv8.1a.txt @@ -0,0 +1,83 @@ +# RUN: not llvm-mc -triple armv8 -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s + +# Check that sizes 00 and 11 are undefined for RDMA +[0x12,0x0b,0x01,0xf3] # vqrdmlah.s8 d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x12,0x0b,0x01,0xf3] # vqrdmlah.s8 d0, d1, d2 +# CHECK-NEXT: ^ + +[0x12,0x0b,0x31,0xf3] # vqrdmlah.s64 d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x12,0x0b,0x31,0xf3] # vqrdmlah.s64 d0, d1, d2 +# CHECK-NEXT: ^ + +[0x54,0x0b,0x02,0xf3] # vqrdmlah.s8 q0, q1, q2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x54,0x0b,0x02,0xf3] # vqrdmlah.s8 q0, q1, q2 +# CHECK-NEXT: ^ + +[0x54,0x0b,0x32,0xf3] # vqrdmlah.s64 q2, q3, q0 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x54,0x0b,0x32,0xf3] # vqrdmlah.s64 q2, q3, q0 +# CHECK-NEXT: ^ + +[0x15,0x7c,0x06,0xf3] # vqrdmlsh.s8 d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x15,0x7c,0x06,0xf3] # vqrdmlsh.s8 d0, d1, d2 +# CHECK-NEXT: ^ + +[0x15,0x7c,0x36,0xf3] # vqrdmlsh.s64 d0, d1, d2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x15,0x7c,0x36,0xf3] # vqrdmlsh.s64 d0, d1, d2 +# CHECK-NEXT: ^ + +[0x54,0x0c,0x02,0xf3] # vqrdmlsh.s8 q0, q1, q2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x54,0x0c,0x02,0xf3] # vqrdmlsh.s8 q0, q1, q2 +# CHECK-NEXT: ^ + +[0x54,0x0c,0x32,0xf3] # vqrdmlsh.s64 q0, q1, q2 +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x54,0x0c,0x32,0xf3] # vqrdmlsh.s64 q0, q1, q2 +# CHECK-NEXT: ^ + +[0x42,0x0e,0x81,0xf2] # vqrdmlah.s8 d0, 
d1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0e,0x81,0xf2] # vqrdmlah.s8 d0, d1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0e,0xb1,0xf2] # vqrdmlah.s64 d0, d1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0e,0xb1,0xf2] # vqrdmlah.s64 d0, d1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0e,0x82,0xf3] # vqrdmlah.s8 q0, q1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0e,0x82,0xf3] # vqrdmlah.s8 q0, q1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0e,0xb2,0xf3] # vqrdmlah.s64 q0, q1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0e,0xb2,0xf3] # vqrdmlah.s64 q0, q1, d2[0] +# CHECK-NEXT: ^ + + +[0x42,0x0f,0x81,0xf2] # vqrdmlsh.s8 d0, d1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0f,0x81,0xf2] # vqrdmlsh.s8 d0, d1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0f,0xb1,0xf2] # vqrdmlsh.s64 d0, d1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0f,0xb1,0xf2] # vqrdmlsh.s64 d0, d1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0f,0x82,0xf3] # vqrdmlsh.s8 q0, q1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0f,0x82,0xf3] # vqrdmlsh.s8 q0, q1, d2[0] +# CHECK-NEXT: ^ + +[0x42,0x0f,0xb2,0xf3] # vqrdmlsh.s64 q0, q1, d2[0] +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x42,0x0f,0xb2,0xf3] # vqrdmlsh.s64 q0, q1, d2[0] +# CHECK-NEXT: ^ Index: test/MC/Disassembler/ARM/invalid-thumbv8.1a.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/ARM/invalid-thumbv8.1a.txt @@ -0,0 +1,72 @@ +# RUN: not llvm-mc -triple thumbv8 -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s + +# Check, if sizes 00 and 11 are undefined for RDMA +[0x01,0xff,0x12,0x0b] # vqrdmlah.s8 d0, d1, d2 +[0x31,0xff,0x12,0x0b] # vqrdmlah.s64 d0, d1, d2 +[0x02,0xff,0x54,0x0b] # vqrdmlah.s8 q0, q1, q2 +[0x06,0xff,0x50,0x4b] # vqrdmlah.s64 q2, q3, q0 + +[0x01,0xff,0x12,0x0c] 
# vqrdmlsh.s8 d0, d1, d2 +[0x31,0xff,0x12,0x0c] # vqrdmlsh.s64 d0, d1, d2 +[0x02,0xff,0x54,0x0c] # vqrdmlsh.s8 q0, q1, q2 +[0x32,0xff,0x54,0x0c] # vqrdmlsh.s64 q0, q1, q2 + +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x01,0xff,0x12,0x0b] # vqrdmlah.s8 d0, d1, d2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x31,0xff,0x12,0x0b] # vqrdmlah.s64 d0, d1, d2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x02,0xff,0x54,0x0b] # vqrdmlah.s8 q0, q1, q2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x06,0xff,0x50,0x4b] # vqrdmlah.s64 q2, q3, q0 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x01,0xff,0x12,0x0c] # vqrdmlsh.s8 d0, d1, d2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x31,0xff,0x12,0x0c] # vqrdmlsh.s64 d0, d1, d2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x02,0xff,0x54,0x0c] # vqrdmlsh.s8 q0, q1, q2 +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x32,0xff,0x54,0x0c] # vqrdmlsh.s64 q0, q1, q2 +# CHECK-NEXT: ^ + +[0x81,0xef,0x42,0x0e] # vqrdmlah.s8 d0, d1, d2[0] +[0xb1,0xef,0x42,0x0e] # vqrdmlah.s64 d0, d1, d2[0] +[0x82,0xff,0x42,0x0e] # vqrdmlah.s8 q0, q1, d2[0] +[0xb2,0xff,0x42,0x0e] # vqrdmlah.s64 q0, q1, d2[0] + +[0x81,0xef,0x42,0x0f] # vqrdmlsh.s8 d0, d1, d2[0] +[0xb1,0xef,0x42,0x0f] # vqrdmlsh.s64 d0, d1, d2[0] +[0x82,0xff,0x42,0x0f] # vqrdmlsh.s8 q0, q1, d2[0] +[0xb2,0xff,0x42,0x0f] # vqrdmlsh.s64 q0, q1, d2[0] + +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x81,0xef,0x42,0x0e] # vqrdmlah.s8 d0, d1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0xb1,0xef,0x42,0x0e] # vqrdmlah.s64 d0, d1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x82,0xff,0x42,0x0e] # vqrdmlah.s8 q0, q1, d2[0] +# CHECK-NEXT: 
^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0xb2,0xff,0x42,0x0e] # vqrdmlah.s64 q0, q1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x81,0xef,0x42,0x0f] # vqrdmlsh.s8 d0, d1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0xb1,0xef,0x42,0x0f] # vqrdmlsh.s64 d0, d1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0x82,0xff,0x42,0x0f] # vqrdmlsh.s8 q0, q1, d2[0] +# CHECK-NEXT: ^ +# CHECK: warning: invalid instruction encoding +# CHECK-NEXT: [0xb2,0xff,0x42,0x0f] # vqrdmlsh.s64 q0, q1, d2[0] +# CHECK-NEXT: ^ Index: test/MC/Disassembler/ARM/thumb-v8.1a.txt =================================================================== --- /dev/null +++ test/MC/Disassembler/ARM/thumb-v8.1a.txt @@ -0,0 +1,110 @@ +# RUN: llvm-mc -triple thumbv8 -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V81a +# RUN: not llvm-mc -triple thumbv8 -mattr=+v8 --disassemble < %s 2>&1 | FileCheck %s --check-prefix=CHECK-V8 + +[0x11,0xff,0x12,0x0b] +# CHECK-V81a: vqrdmlah.s16 d0, d1, d2 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x11,0xff,0x12,0x0b] +# CHECK-V8: ^ + +[0x21,0xff,0x12,0x0b] +# CHECK-V81a: vqrdmlah.s32 d0, d1, d2 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x21,0xff,0x12,0x0b] +# CHECK-V8: ^ + +[0x12,0xff,0x54,0x0b] +# CHECK-V81a: vqrdmlah.s16 q0, q1, q2 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x12,0xff,0x54,0x0b] +# CHECK-V8: ^ + +[0x26,0xff,0x50,0x4b] +# CHECK-V81a: vqrdmlah.s32 q2, q3, q0 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x26,0xff,0x50,0x4b] +# CHECK-V8: ^ + +[0x16,0xff,0x15,0x7c] +# CHECK-V81a: vqrdmlsh.s16 d7, d6, d5 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x16,0xff,0x15,0x7c] +# CHECK-V8: ^ + +[0x21,0xff,0x12,0x0c] +# CHECK-V81a: vqrdmlsh.s32 d0, d1, d2 +# CHECK-V8: warning: invalid instruction
encoding +# CHECK-V8: [0x21,0xff,0x12,0x0c] +# CHECK-V8: ^ + +[0x12,0xff,0x54,0x0c] +# CHECK-V81a: vqrdmlsh.s16 q0, q1, q2 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x12,0xff,0x54,0x0c] +# CHECK-V8: ^ + +[0x28,0xff,0x5a,0x6c] +# CHECK-V81a: vqrdmlsh.s32 q3, q4, q5 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x28,0xff,0x5a,0x6c] +# CHECK-V8: ^ + +[0x91,0xef,0x42,0x0e] +# CHECK-V81a: vqrdmlah.s16 d0, d1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x91,0xef,0x42,0x0e] +# CHECK-V8: ^ + +[0xa1,0xef,0x42,0x0e] +# CHECK-V81a: vqrdmlah.s32 d0, d1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0xa1,0xef,0x42,0x0e] +# CHECK-V8: ^ + +[0x92,0xff,0x42,0x0e] +# CHECK-V81a: vqrdmlah.s16 q0, q1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x92,0xff,0x42,0x0e] +# CHECK-V8: ^ + +[0xa2,0xff,0x42,0x0e] +# CHECK-V81a: vqrdmlah.s32 q0, q1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0xa2,0xff,0x42,0x0e] +# CHECK-V8: ^ + +[0x91,0xef,0x42,0x0f] +# CHECK-V81a: vqrdmlsh.s16 d0, d1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x91,0xef,0x42,0x0f] +# CHECK-V8: ^ + +[0xa1,0xef,0x42,0x0f] +# CHECK-V81a: vqrdmlsh.s32 d0, d1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0xa1,0xef,0x42,0x0f] +# CHECK-V8: ^ + +[0x92,0xff,0x42,0x0f] +# CHECK-V81a: vqrdmlsh.s16 q0, q1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x92,0xff,0x42,0x0f] +# CHECK-V8: ^ + +[0xa2,0xff,0x42,0x0f] +# CHECK-V81a: vqrdmlsh.s32 q0, q1, d2[0] +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0xa2,0xff,0x42,0x0f] +# CHECK-V8: ^ + +[0x10,0xb6] +# CHECK-V81a: setpan #0 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x10,0xb6] +# CHECK-V8: ^ + +[0x18,0xb6] +# CHECK-V81a: setpan #1 +# CHECK-V8: warning: invalid instruction encoding +# CHECK-V8: [0x18,0xb6] +# CHECK-V8: ^