Index: lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- lib/Target/AArch64/AArch64InstrFormats.td
+++ lib/Target/AArch64/AArch64InstrFormats.td
@@ -5302,6 +5302,27 @@
   let Inst{4-0}   = Rd;
 }
 
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in
+class BaseSIMDThreeScalarTied<bit U, bits<2> size, bit R, bits<5> opcode,
+                        dag oops, dag iops, string asm,
+            list<dag> pattern>
+  : I<oops, iops, asm, "\t$Rd, $Rn, $Rm", "$Rd = $dst", pattern>,
+    Sched<[WriteV]> {
+  bits<5> Rd;
+  bits<5> Rn;
+  bits<5> Rm;
+  let Inst{31-30} = 0b01;
+  let Inst{29}    = U;
+  let Inst{28-24} = 0b11110;
+  let Inst{23-22} = size;
+  let Inst{21}    = R;
+  let Inst{20-16} = Rm;
+  let Inst{15-11} = opcode;
+  let Inst{10}    = 1;
+  let Inst{9-5}   = Rn;
+  let Inst{4-0}   = Rd;
+}
+
 multiclass SIMDThreeScalarD<bit U, bits<5> opc, string asm,
                             SDPatternOperator OpNode> {
   def v1i64  : BaseSIMDThreeScalar<U, 0b11, opc, FPR64, asm,
@@ -5329,6 +5350,16 @@
   def v1i16  : BaseSIMDThreeScalar<U, 0b01, opc, FPR16, asm, []>;
 }
 
+multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
+                                 SDPatternOperator OpNode = null_frag> {
+  def v1i32: BaseSIMDThreeScalarTied<U, 0b10, R, opc, (outs FPR32:$dst),
+                                     (ins FPR32:$Rd, FPR32:$Rn, FPR32:$Rm), 
+                                     asm, []>;
+  def v1i16: BaseSIMDThreeScalarTied<U, 0b01, R, opc, (outs FPR16:$dst),
+                                     (ins FPR16:$Rd, FPR16:$Rn, FPR16:$Rm), 
+                                     asm, []>;
+}
+
 multiclass SIMDThreeScalarSD<bit U, bit S, bits<5> opc, string asm,
                              SDPatternOperator OpNode = null_frag> {
   let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
@@ -8521,6 +8552,197 @@
 } // end of 'let Predicates = [HasNEON]'
 
 //----------------------------------------------------------------------------
+// AdvSIMD v8.1 Rounding Double Multiply Add/Subtract
+//----------------------------------------------------------------------------
+
+let Predicates = [HasNEON, HasV8_1a] in {
+
+class BaseSIMDThreeSameVectorTiedR0<bit Q, bit U, bits<2> size, bits<5> opcode,
+                                    RegisterOperand regtype, string asm, 
+                                    string kind, list<dag> pattern>
+  : BaseSIMDThreeSameVectorTied<Q, U, size, opcode, regtype, asm, kind, 
+                                pattern> {
+  let Inst{21}=0;
+}
+multiclass SIMDThreeSameVectorSQRDMLxHTiedHS<bit U, bits<5> opc, string asm,
+                                             SDPatternOperator Accum> {
+  def v4i16 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b01, opc, V64, asm, ".4h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh (v4i16 V64:$Rn),
+                                                   (v4i16 V64:$Rm)))))]>;         
+  def v8i16 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b01, opc, V128, asm, ".8h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh (v8i16 V128:$Rn),
+                                                   (v8i16 V128:$Rm)))))]>;
+  def v2i32 : BaseSIMDThreeSameVectorTiedR0<0, U, 0b10, opc, V64, asm, ".2s",
+    [(set (v2i32 V64:$dst),
+          (Accum (v2i32 V64:$Rd),
+                 (v2i32 (int_aarch64_neon_sqrdmulh (v2i32 V64:$Rn),
+                                                   (v2i32 V64:$Rm)))))]>;
+  def v4i32 : BaseSIMDThreeSameVectorTiedR0<1, U, 0b10, opc, V128, asm, ".4s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh (v4i32 V128:$Rn),
+                                                   (v4i32 V128:$Rm)))))]>;
+}
+
+multiclass SIMDIndexedSQRDMLxHSDTied<bit U, bits<4> opc, string asm,
+                                     SDPatternOperator Accum> {
+  def v4i16_indexed : BaseSIMDIndexedTied<0, U, 0, 0b01, opc,
+                                          V64, V64, V128_lo, VectorIndexH,
+                                          asm, ".4h", ".4h", ".4h", ".h",
+    [(set (v4i16 V64:$dst),
+          (Accum (v4i16 V64:$Rd),
+                 (v4i16 (int_aarch64_neon_sqrdmulh
+                          (v4i16 V64:$Rn),
+                          (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                    VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  // FIXME: uncomment the following, after backend will support i16 neon type
+  //def : Pat<(i16 (Accum (i16 FPR16Op:$Rd),
+  //              (i16 (vector_extract (v4i16
+  //                       (int_aarch64_neon_sqdmull (v4i16 V64:$Rn),
+  //                           (v4i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+  //                                                  VectorIndexH:$idx)))),
+  //                       (i64 0))))),
+  //          (EXTRACT_SUBREG
+  //              (!cast<Instruction>(NAME # v4i16_indexed)
+  //                  (SUBREG_TO_REG (i32 0), FPR16Op:$Rd, ssub), V64:$Rn,
+  //                  V128_lo:$Rm, VectorIndexH:$idx),
+  //              ssub)>;  
+
+  def v8i16_indexed : BaseSIMDIndexedTied<1, U, 0, 0b01, opc,
+                                          V128, V128, V128_lo, VectorIndexH,
+                                          asm, ".8h", ".8h", ".8h", ".h",
+    [(set (v8i16 V128:$dst),
+          (Accum (v8i16 V128:$Rd),
+                 (v8i16 (int_aarch64_neon_sqrdmulh
+                          (v8i16 V128:$Rn),
+                          (v8i16 (AArch64duplane16 (v8i16 V128_lo:$Rm),
+                                                   VectorIndexH:$idx))))))]> {
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+  // FIXME: It should be a "def" here, similar to one above, 
+  // after backend will support i16 neon type
+
+  def v2i32_indexed : BaseSIMDIndexedTied<0, U, 0, 0b10, opc,
+                                          V64, V64, V128, VectorIndexS,
+                                          asm, ".2s", ".2s", ".2s", ".s",
+    [(set (v2i32 V64:$dst),
+        (Accum (v2i32 V64:$Rd),
+               (v2i32 (int_aarch64_neon_sqrdmulh
+                        (v2i32 V64:$Rn),
+                        (v2i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but 
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  // FIXME: direct EXTRACT_SUBREG from v2i32 to i32 is illegal, that's why we 
+  // got it lowered here as (i32 vector_extract (v4i32 insert_subvector(..)))
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract 
+                               (v4i32 (insert_subvector
+                                        (undef), 
+                                        (v2i32 (int_aarch64_neon_sqrdmulh 
+                                                 (v2i32 V64:$Rn),
+                                                 (v2i32 (AArch64duplane32 
+                                                          (v4i32 V128:$Rm),
+                                                          VectorIndexS:$idx)))),
+                                      (i32 0))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (!cast<Instruction>(NAME # v2i32_indexed)
+                  (v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)), 
+                                        FPR32Op:$Rd, 
+                                        ssub)), 
+                  V64:$Rn,
+                  V128:$Rm, 
+                  VectorIndexS:$idx),
+                ssub)>;
+
+  def v4i32_indexed : BaseSIMDIndexedTied<1, U, 0, 0b10, opc,
+                                          V128, V128, V128, VectorIndexS,
+                                          asm, ".4s", ".4s", ".4s", ".s",
+    [(set (v4i32 V128:$dst),
+          (Accum (v4i32 V128:$Rd),
+                 (v4i32 (int_aarch64_neon_sqrdmulh
+                          (v4i32 V128:$Rn),
+                          (v4i32 (AArch64duplane32 (v4i32 V128:$Rm),
+                                                   VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+
+  // FIXME: it would be nice to use the scalar (v1i32) instruction here, but
+  // an intermediate EXTRACT_SUBREG would be untyped.
+  def : Pat<(i32 (Accum (i32 FPR32Op:$Rd),
+                        (i32 (vector_extract 
+                               (v4i32 (int_aarch64_neon_sqrdmulh 
+                                        (v4i32 V128:$Rn),
+                                        (v4i32 (AArch64duplane32 
+                                                 (v4i32 V128:$Rm),
+                                                 VectorIndexS:$idx)))),
+                               (i64 0))))),
+            (EXTRACT_SUBREG
+                (v4i32 (!cast<Instruction>(NAME # v4i32_indexed)
+                         (v4i32 (INSERT_SUBREG (v4i32 (IMPLICIT_DEF)), 
+                                               FPR32Op:$Rd, 
+                                               ssub)), 
+                         V128:$Rn,
+                         V128:$Rm, 
+                         VectorIndexS:$idx)),
+                ssub)>;
+
+  def i16_indexed : BaseSIMDIndexedTied<1, U, 1, 0b01, opc,
+                                        FPR16Op, FPR16Op, V128_lo,
+                                        VectorIndexH, asm, ".h", "", "", ".h",
+    [
+  // FIXME: uncomment the following, after backend will support i16 neon type
+  //  (set (i16 FPR16Op:$dst),
+  //        (Accum (i16 FPR16Op:$Rd),
+  //               (i16 (int_aarch64_neon_sqrdmulh
+  //                          (i16 FPR16Op:$Rn),
+  //                          (i16 (vector_extract (v8i16 V128_lo:$Rm),
+  //                                               VectorIndexH:$idx))))))
+    ]> {
+    
+    bits<3> idx;
+    let Inst{11} = idx{2};
+    let Inst{21} = idx{1};
+    let Inst{20} = idx{0};
+  }
+
+  def i32_indexed : BaseSIMDIndexedTied<1, U, 1, 0b10, opc,
+                                        FPR32Op, FPR32Op, V128, VectorIndexS,
+                                        asm, ".s", "", "", ".s",
+    [(set (i32 FPR32Op:$dst),
+          (Accum (i32 FPR32Op:$Rd),
+                 (i32 (int_aarch64_neon_sqrdmulh
+                        (i32 FPR32Op:$Rn),
+                        (i32 (vector_extract (v4i32 V128:$Rm),
+                                             VectorIndexS:$idx))))))]> {
+    bits<2> idx;
+    let Inst{11} = idx{1};
+    let Inst{21} = idx{0};
+  }
+}
+} // let Predicates = [HasNeon, HasV8_1a]
+
+//----------------------------------------------------------------------------
 // Crypto extensions
 //----------------------------------------------------------------------------
 
Index: lib/Target/AArch64/AArch64InstrInfo.td
===================================================================
--- lib/Target/AArch64/AArch64InstrInfo.td
+++ lib/Target/AArch64/AArch64InstrInfo.td
@@ -2859,6 +2859,10 @@
 defm URHADD   : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
 defm URSHL    : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
 defm USHL     : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
+defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
+                                                  int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10001,"sqrdmlsh",
+                                                    int_aarch64_neon_sqsub>;
 
 defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
 defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
@@ -3075,6 +3079,32 @@
 defm UQSUB    : SIMDThreeScalarBHSD<1, 0b00101, "uqsub", int_aarch64_neon_uqsub>;
 defm URSHL    : SIMDThreeScalarD<   1, 0b01010, "urshl", int_aarch64_neon_urshl>;
 defm USHL     : SIMDThreeScalarD<   1, 0b01000, "ushl", int_aarch64_neon_ushl>;
+let Predicates = [HasV8_1a] in {
+  defm SQRDMLAH : SIMDThreeScalarHSTied<1, 0, 0b10000, "sqrdmlah">;
+  defm SQRDMLSH : SIMDThreeScalarHSTied<1, 0, 0b10001, "sqrdmlsh">;
+  // FIXME: uncomment the following, after backend will support i16 neon type
+  //def : Pat<(i16 (int_aarch64_neon_sqadd
+  //                 (i16 FPR16:$Rd),
+  //                 (i16 (int_aarch64_neon_sqrdmulh (i16 FPR16:$Rn),
+  //                                                 (i16 FPR16:$Rm))))),
+  //          (SQRDMLAHv1i16 FPR16:$Rd, FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(i32 (int_aarch64_neon_sqadd
+                   (i32 FPR32:$Rd),
+                   (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+                                                   (i32 FPR32:$Rm))))),
+            (SQRDMLAHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+  // FIXME: uncomment the following, after backend will support i16 neon type
+  //def : Pat<(i16 (int_aarch64_neon_sqsub
+  //                 (i16 FPR16:$Rd),
+  //                 (i16 (int_aarch64_neon_sqrdmulh (i16 FPR16:$Rn),
+  //                                                 (i16 FPR16:$Rm))))),
+  //          (SQRDMLSHv1i16 FPR16:$Rd, FPR16:$Rn, FPR16:$Rm)>;
+  def : Pat<(i32 (int_aarch64_neon_sqsub
+                   (i32 FPR32:$Rd),
+                   (i32 (int_aarch64_neon_sqrdmulh (i32 FPR32:$Rn),
+                                                   (i32 FPR32:$Rm))))),
+            (SQRDMLSHv1i32 FPR32:$Rd, FPR32:$Rn, FPR32:$Rm)>;
+}
 
 def : InstAlias<"cmls $dst, $src1, $src2",
                 (CMHSv1i64 FPR64:$dst, FPR64:$src2, FPR64:$src1), 0>;
@@ -4405,6 +4435,10 @@
                                            int_aarch64_neon_sqadd>;
 defm SQDMLSL : SIMDIndexedLongSQDMLXSDTied<0, 0b0111, "sqdmlsl",
                                            int_aarch64_neon_sqsub>;
+defm SQRDMLAH : SIMDIndexedSQRDMLxHSDTied<1, 0b1101, "sqrdmlah",
+                                          int_aarch64_neon_sqadd>;
+defm SQRDMLSH : SIMDIndexedSQRDMLxHSDTied<1, 0b1111, "sqrdmlsh",
+                                          int_aarch64_neon_sqsub>;
 defm SQDMULL : SIMDIndexedLongSD<0, 0b1011, "sqdmull", int_aarch64_neon_sqdmull>;
 defm UMLAL   : SIMDVectorIndexedLongSDTied<1, 0b0010, "umlal",
     TriOpFrag<(add node:$LHS, (int_aarch64_neon_umull node:$MHS, node:$RHS))>>;
Index: test/CodeGen/AArch64/arm64-neon-v8.1a.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-neon-v8.1a.ll
@@ -0,0 +1,408 @@
+; RUN: llc < %s -verify-machineinstrs -march=arm64                                          | FileCheck %s --check-prefix=CHECK-V8a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a                            | FileCheck %s --check-prefix=CHECK-V81a
+; RUN: llc < %s -verify-machineinstrs -march=arm64 -mattr=+v8.1a -aarch64-neon-syntax=apple | FileCheck %s --check-prefix=CHECK-V81a-apple
+
+declare <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqrdmulh.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqrdmulh.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqadd.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqadd.i16(i16, i16)
+
+declare <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16>, <4 x i16>)
+declare <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16>, <8 x i16>)
+declare <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32>, <2 x i32>)
+declare <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32>, <4 x i32>)
+declare i32 @llvm.aarch64.neon.sqsub.i32(i32, i32)
+declare i16 @llvm.aarch64.neon.sqsub.i16(i16, i16)
+
+;-----------------------------------------------------------------------------
+; RDMA Vector
+; test for SIMDThreeSameVectorSQRDMLxHTiedHS
+
+define <4 x i16> @test_sqrdmlah_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i16:
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
+   %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc,  <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
+; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2
+   ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlah_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v8i16:
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+   %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
+; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlah.8h v0, v1, v2
+   ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v2i32:
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+   %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
+; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2
+   ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlah_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_v4i32:
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+   %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V81:        sqrdmulh    v1.4s, v1.4s, v2.4s
+; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2
+   ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_v4i16(<4 x i16> %acc, <4 x i16> %mhs, <4 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i16:
+   %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %mhs,  <4 x i16> %rhs)
+   %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.4h
+; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.4h
+; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2
+   ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlsh_v8i16(<8 x i16> %acc, <8 x i16> %mhs, <8 x i16> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v8i16:
+   %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %mhs, <8 x i16> %rhs)
+   %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.8h
+; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.8h
+; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2
+   ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_v2i32(<2 x i32> %acc, <2 x i32> %mhs, <2 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v2i32:
+   %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %mhs, <2 x i32> %rhs)
+   %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.2s
+; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.2s
+; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2
+   ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlsh_v4i32(<4 x i32> %acc, <4 x i32> %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_v4i32:
+   %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %mhs, <4 x i32> %rhs)
+   %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.4s
+; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.4s
+; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2
+   ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element
+; tests for vXiYY_indexed, vXiYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+define <4 x i16> @test_sqrdmlah_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %retval =  call <4 x i16> @llvm.aarch64.neon.sqadd.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a :       sqrdmulh    v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a:       sqrdmlah    v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlah.4h v0,    v1,    v2[3]
+  ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlahq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %retval =  call <8 x i16> @llvm.aarch64.neon.sqadd.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a:       sqrdmlah    v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlah.8h v0,    v1,    v2[2]
+  ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlah_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %retval =  call <2 x i32> @llvm.aarch64.neon.sqadd.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a:       sqrdmlah    v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlah.2s v0,    v1,    v2[1]
+  ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlahq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %retval =  call <4 x i32> @llvm.aarch64.neon.sqadd.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a:       sqrdmlah    v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v0,    v1,    v2[0]
+  ret <4 x i32> %retval
+}
+
+define <4 x i16> @test_sqrdmlsh_lane_s16(<4 x i16> %acc, <4 x i16> %x, <4 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s16:
+entry:
+  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> <i32 3, i32 3, i32 3, i32 3>
+  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+  %retval =  call <4 x i16> @llvm.aarch64.neon.sqsub.v4i16(<4 x i16> %acc, <4 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4h, v1.4h, v2.h[3]
+; CHECK-V81a:       sqrdmlsh    v0.4h, v1.4h, v2.h[3]
+; CHECK-V81a-apple: sqrdmlsh.4h v0,    v1,    v2[3]
+  ret <4 x i16> %retval
+}
+
+define <8 x i16> @test_sqrdmlshq_lane_s16(<8 x i16> %acc, <8 x i16> %x, <8 x i16> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s16:
+entry:
+  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> <i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2, i32 2>
+  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+  %retval =  call <8 x i16> @llvm.aarch64.neon.sqsub.v8i16(<8 x i16> %acc, <8 x i16> %prod)
+; CHECK-V8a:        sqrdmulh    v1.8h, v1.8h, v2.h[2]
+; CHECK-V81a:       sqrdmlsh    v0.8h, v1.8h, v2.h[2]
+; CHECK-V81a-apple: sqrdmlsh.8h v0,    v1,    v2[2]
+  ret <8 x i16> %retval
+}
+
+define <2 x i32> @test_sqrdmlsh_lane_s32(<2 x i32> %acc, <2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %retval =  call <2 x i32> @llvm.aarch64.neon.sqsub.v2i32(<2 x i32> %acc, <2 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.2s, v1.2s, v2.s[1]
+; CHECK-V81a:       sqrdmlsh    v0.2s, v1.2s, v2.s[1]
+; CHECK-V81a-apple: sqrdmlsh.2s v0,    v1,    v2[1]
+  ret <2 x i32> %retval
+}
+
+define <4 x i32> @test_sqrdmlshq_lane_s32(<4 x i32> %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %retval =  call <4 x i32> @llvm.aarch64.neon.sqsub.v4i32(<4 x i32> %acc, <4 x i32> %prod)
+; CHECK-V8a:        sqrdmulh    v1.4s, v1.4s, v2.s[0]
+; CHECK-V81a:       sqrdmlsh    v0.4s, v1.4s, v2.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v0,    v1,    v2[0]
+  ret <4 x i32> %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Vector, by element, extracted
+; tests for "def : Pat" in SIMDIndexedSQRDMLxHSDTied
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlah_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+;; cHECK-LABEL: test_sqrdmlah_extracted_lane_s16:
+;entry:
+;  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+;  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+;  %extract = extractelement <4 x i16> %prod, i64 0
+;  %retval =  call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %extract)
+;; cHECK: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[0]
+;  ret i16 %retval
+;}
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlahq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+;; cHECK-LABEL: test_sqrdmlahq_extracted_lane_s16:
+;entry:
+;  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+;  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+;  %extract = extractelement <8 x i16> %prod, i64 0
+;  %retval =  call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc, i16 %extract)
+;; cHECK: sqrdmlah {{v[2-9]+}}.8h, v0.8h, v1.h[0]
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlah_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlah_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %extract = extractelement <2 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a:       sqrdmlah    v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.2s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+define i32 @test_sqrdmlahq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlahq_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %extract = extractelement <4 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a:       sqrdmlah    v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlah.4s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlsh_extracted_lane_s16(i16 %acc,<4 x i16> %x, <4 x i16> %v) {
+;; cHECK-LABEL: test_sqrdmlsh_extracted_lane_s16:
+;entry:
+;  %shuffle = shufflevector <4 x i16> %v, <4 x i16> undef, <4 x i32> zeroinitializer
+;  %prod = call <4 x i16> @llvm.aarch64.neon.sqrdmulh.v4i16(<4 x i16> %x, <4 x i16> %shuffle)
+;  %extract = extractelement <4 x i16> %prod, i64 0
+;  %retval =  call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %extract)
+;; cHECK: sqrdmlah {{v[2-9]+}}.4h, v0.4h, v1.h[0]
+;  ret i16 %retval
+;}
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlshq_extracted_lane_s16(i16 %acc,<8 x i16> %x, <8 x i16> %v) {
+;; cHECK-LABEL: test_sqrdmlshq_extracted_lane_s16:
+;entry:
+;  %shuffle = shufflevector <8 x i16> %v, <8 x i16> undef, <8 x i32> zeroinitializer
+;  %prod = call <8 x i16> @llvm.aarch64.neon.sqrdmulh.v8i16(<8 x i16> %x, <8 x i16> %shuffle)
+;  %extract = extractelement <8 x i16> %prod, i64 0
+;  %retval =  call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc, i16 %extract)
+;; cHECK: sqrdmlah {{v[0-9]+}}.8h, v0.8h, v1.h[0]
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlsh_extracted_lane_s32(i32 %acc,<2 x i32> %x, <2 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlsh_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <2 x i32> %v, <2 x i32> undef, <2 x i32> zeroinitializer
+  %prod = call <2 x i32> @llvm.aarch64.neon.sqrdmulh.v2i32(<2 x i32> %x, <2 x i32> %shuffle)
+  %extract = extractelement <2 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.2s, v0.2s, v1.s[0]
+; CHECK-V81a:       sqrdmlsh    v2.2s, v0.2s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.2s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+define i32 @test_sqrdmlshq_extracted_lane_s32(i32 %acc,<4 x i32> %x, <4 x i32> %v) {
+; CHECK-LABEL: test_sqrdmlshq_extracted_lane_s32:
+entry:
+  %shuffle = shufflevector <4 x i32> %v, <4 x i32> undef, <4 x i32> zeroinitializer
+  %prod = call <4 x i32> @llvm.aarch64.neon.sqrdmulh.v4i32(<4 x i32> %x, <4 x i32> %shuffle)
+  %extract = extractelement <4 x i32> %prod, i64 0
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc, i32 %extract)
+; CHECK-V8a:        sqrdmulh    v0.4s, v0.4s, v1.s[0]
+; CHECK-V81a:       sqrdmlsh    v2.4s, v0.4s, v1.s[0]
+; CHECK-V81a-apple: sqrdmlsh.4s v2,    v0,    v1[0]
+  ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar
+; test for "def : Pat" near SIMDThreeScalarHSTied in AArch64InstInfo.td
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlah_i16(i16 %acc, i16 %mhs, i16 %rhs) {
+;; cHECK-LABEL: test_sqrdmlah_i16:
+;  %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs,  i16 %rhs)
+;  %retval =  call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc,  i16 %prod)
+;; cHECK: sqrdmlah {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlah_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlah_i32:
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a:       sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  ret i32 %retval
+}
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlsh_i16(i16 %acc, i16 %mhs, i16 %rhs) {
+;; cHECK-LABEL: test_sqrdmlsh_i16:
+;  %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs,  i16 %rhs)
+;  %retval =  call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc,  i16 %prod)
+;; cHECK: sqrdmlsh {{w[0-9]+}}, {{w[0-9]+}}, {{w[0-9]+}}
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlsh_i32(i32 %acc, i32 %mhs, i32 %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_i32:
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %rhs)
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a:       sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+; CHECK-V81a-apple: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, {{s[0-9]+}}
+  ret i32 %retval
+}
+
+;-----------------------------------------------------------------------------
+; RDMA Scalar, by element
+; tests for iYY_indexed in SIMDIndexedSQRDMLxHSDTied
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlah_extract_i16(i16 %acc, i16 %mhs, <4 x i16> %rhs) {
+;; cHECK-LABEL: test_sqrdmlah_extract_i32:
+;  %extract = extractelement <4 x i16> %rhs, i32 3
+;  %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs,  i16 %extract)
+;  %retval =  call i16 @llvm.aarch64.neon.sqadd.i16(i16 %acc,  i16 %prod)
+;; cHECK: sqrdmlah {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlah_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlah_extract_i32:
+  %extract = extractelement <4 x i32> %rhs, i32 3
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
+  %retval =  call i32 @llvm.aarch64.neon.sqadd.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a:       sqrdmlah   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlah.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+  ret i32 %retval
+}
+
+; FIXME: after fix of https://llvm.org/bugs/show_bug.cgi?id=22886
+; uncomment this function, and replace "cHECK" for "CHECK"
+;define i16 @test_sqrdmlsh_extract_i16(i16 %acc, i16 %mhs, <4 x i16> %rhs) {
+;; cHECK-LABEL: test_sqrdmlsh_extract_i32:
+;  %extract = extractelement <4 x i16> %rhs, i32 3
+;  %prod = call i16 @llvm.aarch64.neon.sqrdmulh.i16(i16 %mhs,  i16 %extract)
+;  %retval =  call i16 @llvm.aarch64.neon.sqsub.i16(i16 %acc,  i16 %prod)
+;; cHECK: sqrdmlsh {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+;  ret i16 %retval
+;}
+
+define i32 @test_sqrdmlsh_extract_i32(i32 %acc, i32 %mhs, <4 x i32> %rhs) {
+; CHECK-LABEL: test_sqrdmlsh_extract_i32:
+  %extract = extractelement <4 x i32> %rhs, i32 3
+  %prod = call i32 @llvm.aarch64.neon.sqrdmulh.i32(i32 %mhs,  i32 %extract)
+  %retval =  call i32 @llvm.aarch64.neon.sqsub.i32(i32 %acc,  i32 %prod)
+; CHECK-V8a:        sqrdmulh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a:       sqrdmlsh   {{s[0-9]+}}, {{s[0-9]+}}, v0.s[3]
+; CHECK-V81a-apple: sqrdmlsh.s {{s[0-9]+}}, {{s[0-9]+}}, v0[3]
+  ret i32 %retval
+}
Index: test/MC/AArch64/armv8-extension-rdma.s
===================================================================
--- /dev/null
+++ test/MC/AArch64/armv8-extension-rdma.s
@@ -0,0 +1,154 @@
+// RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a -show-encoding < %s 2> %t | FileCheck %s
+// RUN: FileCheck --check-prefix=CHECK-ERROR < %t %s
+  .text
+
+  //AdvSIMD RDMA vector
+  sqrdmlah v0.4h, v1.4h, v2.4h
+  sqrdmlsh v0.4h, v1.4h, v2.4h
+  sqrdmlah v0.2s, v1.2s, v2.2s
+  sqrdmlsh v0.2s, v1.2s, v2.2s
+  sqrdmlah v0.4s, v1.4s, v2.4s
+  sqrdmlsh v0.4s, v1.4s, v2.4s
+  sqrdmlah v0.8h, v1.8h, v2.8h
+  sqrdmlsh v0.8h, v1.8h, v2.8h
+// CHECK: sqrdmlah  v0.4h, v1.4h, v2.4h // encoding: [0x20,0x84,0x42,0x2e]
+// CHECK: sqrdmlsh  v0.4h, v1.4h, v2.4h // encoding: [0x20,0x8c,0x42,0x2e]
+// CHECK: sqrdmlah  v0.2s, v1.2s, v2.2s // encoding: [0x20,0x84,0x82,0x2e]
+// CHECK: sqrdmlsh  v0.2s, v1.2s, v2.2s // encoding: [0x20,0x8c,0x82,0x2e]
+// CHECK: sqrdmlah  v0.4s, v1.4s, v2.4s // encoding: [0x20,0x84,0x82,0x6e]
+// CHECK: sqrdmlsh  v0.4s, v1.4s, v2.4s // encoding: [0x20,0x8c,0x82,0x6e]
+// CHECK: sqrdmlah  v0.8h, v1.8h, v2.8h // encoding: [0x20,0x84,0x42,0x6e]
+// CHECK: sqrdmlsh  v0.8h, v1.8h, v2.8h // encoding: [0x20,0x8c,0x42,0x6e]
+
+  sqrdmlah v0.2h, v1.2h, v2.2h
+  sqrdmlsh v0.2h, v1.2h, v2.2h
+  sqrdmlah v0.8s, v1.8s, v2.8s
+  sqrdmlsh v0.8s, v1.8s, v2.8s
+  sqrdmlah v0.2s, v1.4h, v2.8h
+  sqrdmlsh v0.4s, v1.8h, v2.2s
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlsh v0.2h, v1.2h, v2.2h
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlah v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid vector kind qualifier
+// CHECK-ERROR:   sqrdmlsh v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlsh v0.8s, v1.8s, v2.8s
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah v0.2s, v1.4h, v2.8h
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlsh v0.4s, v1.8h, v2.2s
+// CHECK-ERROR:                   ^
+
+  //AdvSIMD RDMA scalar
+  sqrdmlah h0, h1, h2
+  sqrdmlsh h0, h1, h2
+  sqrdmlah s0, s1, s2
+  sqrdmlsh s0, s1, s2
+// CHECK: sqrdmlah h0, h1, h2  // encoding: [0x20,0x84,0x42,0x7e]
+// CHECK: sqrdmlsh h0, h1, h2  // encoding: [0x20,0x8c,0x42,0x7e]
+// CHECK: sqrdmlah s0, s1, s2  // encoding: [0x20,0x84,0x82,0x7e]
+// CHECK: sqrdmlsh s0, s1, s2  // encoding: [0x20,0x8c,0x82,0x7e]
+
+  //AdvSIMD RDMA vector by-element
+  sqrdmlah v0.4h, v1.4h, v2.h[3]
+  sqrdmlsh v0.4h, v1.4h, v2.h[3]
+  sqrdmlah v0.2s, v1.2s, v2.s[1]
+  sqrdmlsh v0.2s, v1.2s, v2.s[1]
+  sqrdmlah v0.8h, v1.8h, v2.h[3]
+  sqrdmlsh v0.8h, v1.8h, v2.h[3]
+  sqrdmlah v0.4s, v1.4s, v2.s[3]
+  sqrdmlsh v0.4s, v1.4s, v2.s[3]
+// CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3]  // encoding: [0x20,0xd0,0x72,0x2f]
+// CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3]  // encoding: [0x20,0xf0,0x72,0x2f]
+// CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1]  // encoding: [0x20,0xd0,0xa2,0x2f]
+// CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1]  // encoding: [0x20,0xf0,0xa2,0x2f]
+// CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3]  // encoding: [0x20,0xd0,0x72,0x6f]
+// CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3]  // encoding: [0x20,0xf0,0x72,0x6f]
+// CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3]  // encoding: [0x20,0xd8,0xa2,0x6f]
+// CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3]  // encoding: [0x20,0xf8,0xa2,0x6f]
+
+  sqrdmlah v0.4s, v1.2s, v2.s[1]
+  sqrdmlsh v0.2s, v1.2d, v2.s[1]
+  sqrdmlah v0.8h, v1.8h, v2.s[3]
+  sqrdmlsh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah v0.4s, v1.2s, v2.s[1]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlsh v0.2s, v1.2d, v2.s[1]
+// CHECK-ERROR:                   ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah v0.8h, v1.8h, v2.s[3]
+// CHECK-ERROR:                          ^
+// CHECK-ERROR: error: vector lane must be an integer in range [0, 7].
+// CHECK-ERROR:   sqrdmlsh v0.8h, v1.8h, v2.h[8]
+// CHECK-ERROR:                              ^
+
+  //AdvSIMD RDMA scalar by-element
+  sqrdmlah h0, h1, v2.h[3]
+  sqrdmlsh h0, h1, v2.h[3]
+  sqrdmlah s0, s1, v2.s[3]
+  sqrdmlsh s0, s1, v2.s[3]
+// CHECK: sqrdmlah h0, h1, v2.h[3]  // encoding: [0x20,0xd0,0x72,0x7f]
+// CHECK: sqrdmlsh h0, h1, v2.h[3]  // encoding: [0x20,0xf0,0x72,0x7f]
+// CHECK: sqrdmlah s0, s1, v2.s[3]  // encoding: [0x20,0xd8,0xa2,0x7f]
+// CHECK: sqrdmlsh s0, s1, v2.s[3]  // encoding: [0x20,0xf8,0xa2,0x7f]
+
+  sqrdmlah b0, h1, v2.h[3]
+  sqrdmlah s0, d1, v2.s[3]
+  sqrdmlsh h0, h1, v2.s[3]
+  sqrdmlsh s0, s1, v2.s[4]
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah b0, h1, v2.h[3]
+// CHECK-ERROR:            ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlah s0, d1, v2.s[3]
+// CHECK-ERROR:                ^
+// CHECK-ERROR: error: invalid operand for instruction
+// CHECK-ERROR:   sqrdmlsh h0, h1, v2.s[3]
+// CHECK-ERROR:                    ^
+// CHECK-ERROR: error: vector lane must be an integer in range [0, 3].
+// CHECK-ERROR:   sqrdmlsh s0, s1, v2.s[4]
+// CHECK-ERROR:                        ^
Index: test/MC/Disassembler/AArch64/armv8-extension-rdma.txt
===================================================================
--- /dev/null
+++ test/MC/Disassembler/AArch64/armv8-extension-rdma.txt
@@ -0,0 +1,129 @@
+# RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.1a --disassemble < %s 2>&1 | FileCheck %s
+
+[0x20,0x84,0x02,0x2e] # sqrdmlah  v0.8b, v1.8b, v2.8b
+[0x20,0x8c,0x02,0x2e] # sqrdmlsh  v0.8b, v1.8b, v2.8b
+[0x20,0x84,0xc2,0x2e] # sqrdmlah  v0.1d, v1.1d, v2.1d
+[0x20,0x8c,0xc2,0x2e] # sqrdmlsh  v0.1d, v1.1d, v2.1d
+[0x20,0x84,0x02,0x6e] # sqrdmlah  v0.16b, v1.16b, v2.16b
+[0x20,0x8c,0x02,0x6e] # sqrdmlsh  v0.16b, v1.16b, v2.16b
+[0x20,0x84,0xc2,0x6e] # sqrdmlah  v0.2d, v1.2d, v2.2d
+[0x20,0x8c,0xc2,0x6e] # sqrdmlsh  v0.2d, v1.2d, v2.2d
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0x02,0x2e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0x02,0x2e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0xc2,0x2e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0xc2,0x2e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0x02,0x6e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0x02,0x6e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0xc2,0x6e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0xc2,0x6e]
+
+[0x20,0x84,0x02,0x7e] # sqrdmlah b0, b1, b2
+[0x20,0x8c,0x02,0x7e] # sqrdmlsh b0, b1, b2
+[0x20,0x84,0xc2,0x7e] # sqrdmlah d0, d1, d2
+[0x20,0x8c,0xc2,0x7e] # sqrdmlsh d0, d1, d2
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0x02,0x7e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0x02,0x7e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x84,0xc2,0x7e]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0x8c,0xc2,0x7e]
+
+[0x20,0xd0,0x32,0x2f] # sqrdmlah v0.8b, v1.8b, v2.b[3]
+[0x20,0xf0,0x32,0x2f] # sqrdmlsh v0.8b, v1.8b, v2.b[3]
+[0x20,0xd0,0xe2,0x2f] # sqrdmlah v0.1d, v1.1d, v2.d[1]
+[0x20,0xf0,0xe2,0x2f] # sqrdmlsh v0.1d, v1.1d, v2.d[1]
+[0x20,0xd0,0x32,0x6f] # sqrdmlah v0.16b, v1.16b, v2.b[3]
+[0x20,0xf0,0x32,0x6f] # sqrdmlsh v0.16b, v1.16b, v2.b[3]
+[0x20,0xd8,0xe2,0x6f] # sqrdmlah v0.2d, v1.2d, v2.d[3]
+[0x20,0xf8,0xe2,0x6f] # sqrdmlsh v0.2d, v1.2d, v2.d[3]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd0,0x32,0x2f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf0,0x32,0x2f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd0,0xe2,0x2f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf0,0xe2,0x2f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd0,0x32,0x6f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf0,0x32,0x6f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd8,0xe2,0x6f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf8,0xe2,0x6f]
+
+[0x20,0xd0,0x32,0x7f] # sqrdmlah b0, b1, v2.b[3]
+[0x20,0xf0,0x32,0x7f] # sqrdmlsh b0, b1, v2.b[3]
+[0x20,0xd8,0xe2,0x7f] # sqrdmlah d0, d1, v2.d[3]
+[0x20,0xf8,0xe2,0x7f] # sqrdmlsh d0, d1, v2.d[3]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd0,0x32,0x7f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf0,0x32,0x7f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xd8,0xe2,0x7f]
+# CHECK: warning: invalid instruction encoding
+# CHECK: [0x20,0xf8,0xe2,0x7f]
+
+[0x20,0x84,0x42,0x2e]
+[0x20,0x8c,0x42,0x2e]
+[0x20,0x84,0x82,0x2e]
+[0x20,0x8c,0x82,0x2e]
+[0x20,0x84,0x42,0x6e]
+[0x20,0x8c,0x42,0x6e]
+[0x20,0x84,0x82,0x6e]
+[0x20,0x8c,0x82,0x6e]
+# CHECK: sqrdmlah  v0.4h, v1.4h, v2.4h
+# CHECK: sqrdmlsh  v0.4h, v1.4h, v2.4h
+# CHECK: sqrdmlah  v0.2s, v1.2s, v2.2s
+# CHECK: sqrdmlsh  v0.2s, v1.2s, v2.2s
+# CHECK: sqrdmlah  v0.8h, v1.8h, v2.8h
+# CHECK: sqrdmlsh  v0.8h, v1.8h, v2.8h
+# CHECK: sqrdmlah  v0.4s, v1.4s, v2.4s
+# CHECK: sqrdmlsh  v0.4s, v1.4s, v2.4s
+
+[0x20,0x84,0x42,0x7e]
+[0x20,0x8c,0x42,0x7e]
+[0x20,0x84,0x82,0x7e]
+[0x20,0x8c,0x82,0x7e]
+# CHECK: sqrdmlah h0, h1, h2
+# CHECK: sqrdmlsh h0, h1, h2
+# CHECK: sqrdmlah s0, s1, s2
+# CHECK: sqrdmlsh s0, s1, s2
+
+0x20,0xd0,0x72,0x2f
+0x20,0xf0,0x72,0x2f
+0x20,0xd0,0xa2,0x2f
+0x20,0xf0,0xa2,0x2f
+0x20,0xd0,0x72,0x6f
+0x20,0xf0,0x72,0x6f
+0x20,0xd8,0xa2,0x6f
+0x20,0xf8,0xa2,0x6f
+# CHECK: sqrdmlah v0.4h, v1.4h, v2.h[3]
+# CHECK: sqrdmlsh v0.4h, v1.4h, v2.h[3]
+# CHECK: sqrdmlah v0.2s, v1.2s, v2.s[1]
+# CHECK: sqrdmlsh v0.2s, v1.2s, v2.s[1]
+# CHECK: sqrdmlah v0.8h, v1.8h, v2.h[3]
+# CHECK: sqrdmlsh v0.8h, v1.8h, v2.h[3]
+# CHECK: sqrdmlah v0.4s, v1.4s, v2.s[3]
+# CHECK: sqrdmlsh v0.4s, v1.4s, v2.s[3]
+
+0x20,0xd0,0x72,0x7f
+0x20,0xf0,0x72,0x7f
+0x20,0xd8,0xa2,0x7f
+0x20,0xf8,0xa2,0x7f
+# CHECK: sqrdmlah h0, h1, v2.h[3]
+# CHECK: sqrdmlsh h0, h1, v2.h[3]
+# CHECK: sqrdmlah s0, s1, v2.s[3]
+# CHECK: sqrdmlsh s0, s1, v2.s[3]