Index: llvm/trunk/include/llvm/IR/IntrinsicsARM.td
===================================================================
--- llvm/trunk/include/llvm/IR/IntrinsicsARM.td
+++ llvm/trunk/include/llvm/IR/IntrinsicsARM.td
@@ -717,4 +717,14 @@
 def int_arm_neon_sha256h2: SHA_3Arg_v4i32_Intrinsic;
 def int_arm_neon_sha256su1: SHA_3Arg_v4i32_Intrinsic;
 
+// Armv8.2-A dot product intrinsics, called as (accumulator, input A, input B).
+class Neon_Dot_Intrinsic
+  : Intrinsic<[llvm_anyvector_ty],                   // overloaded result
+              [LLVMMatchType<0>, llvm_anyvector_ty,  // accumulator, input A
+               LLVMMatchType<1>],                    // input B, typed like A
+              [IntrNoMem]>;
+def int_arm_neon_udot : Neon_Dot_Intrinsic;  // used by the VUDOT* patterns
+def int_arm_neon_sdot : Neon_Dot_Intrinsic;  // used by the VSDOT* patterns
+
+
 } // end TargetPrefix
Index: llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
+++ llvm/trunk/lib/Target/ARM/ARMInstrNEON.td
@@ -4682,37 +4682,59 @@
 // We put them in the VFPV8 decoder namespace because the ARM and Thumb
 // encodings are the same and thus no further bit twiddling is necessary
 // in the disassembler.
-let Predicates = [HasDotProd], DecoderNamespace = "VFPV8" in {
-
-def VUDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b1,
-                  (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
-                  N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTD : N3Vnp<0b11000, 0b10, 0b1101, 0b0, 0b0,
-                  (outs DPR:$Vd), (ins DPR:$Vn, DPR:$Vm),
-                  N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
-def VUDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b1,
-                  (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
-                  N3RegFrm, IIC_VDOTPROD, "vudot", "u8", []>;
-def VSDOTQ : N3Vnp<0b11000, 0b10, 0b1101, 0b1, 0b0,
-                  (outs QPR:$Vd), (ins QPR:$Vn, QPR:$Vm),
-                  N3RegFrm, IIC_VDOTPROD, "vsdot", "s8", []>;
+class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
+           ValueType AccumTy, ValueType InputTy,
+           SDPatternOperator OpNode> :  // OpNode: node matched by the pattern
+      N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+            (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
+            Asm, AsmTy,
+            [(set (AccumTy RegTy:$dst),
+                  (OpNode (AccumTy RegTy:$Vd),  // accumulator operand
+                          (InputTy RegTy:$Vn),
+                          (InputTy RegTy:$Vm)))]> {
+  let Predicates = [HasDotProd];
+  let DecoderNamespace = "VFPV8";
+  let Constraints = "$dst = $Vd";  // result is tied to the accumulator
+}
+// Non-indexed forms: DPR (v2i32/v8i8) and QPR (v4i32/v16i8) variants.
+def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8,  int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8,  int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
 
 // Indexed dot product instructions:
-class DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty> :
-  N3Vnp<0b11100, 0b10, 0b1101, Q, U,
-       (outs Ty:$Vd), (ins Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
-       N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
-  bit lane;
-  let Inst{5} = lane;
-  let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
-}
-
-def VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR>;
-def VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR>;
-def VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR>;
-def VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR>;
+multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
+           ValueType AccumType, ValueType InputType, SDPatternOperator OpNode,
+           dag RHS> {  // RHS: dag used as the $Vm operand of the output
+  def "" : N3Vnp<0b11100, 0b10, 0b1101, Q, U, (outs Ty:$dst),
+                 (ins Ty:$Vd, Ty:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane),
+                 N3RegFrm, IIC_VDOTPROD, opc, dt, []> {
+    bit lane;
+    let Inst{5} = lane;  // bit 5 of the encoding is taken from `lane`
+    let AsmString = !strconcat(opc, ".", dt, "\t$Vd, $Vn, $Vm$lane");
+    let Constraints = "$dst = $Vd";  // result is tied to the accumulator
+    let Predicates = [HasDotProd];
+    let DecoderNamespace = "VFPV8";
+  }
+  // Fold a NEONvduplane splat (in AccumType lanes) of $Vm into the indexed form.
+  def : Pat<
+    (AccumType (OpNode (AccumType Ty:$Vd),
+                       (InputType Ty:$Vn),
+                       (InputType (bitconvert (AccumType
+                                  (NEONvduplane (AccumType Ty:$Vm),
+                                                 VectorIndex32:$lane)))))),
+    (!cast<Instruction>(NAME) Ty:$Vd, Ty:$Vn, RHS, VectorIndex32:$lane)>;
+}
+
+defm VUDOTDI : DOTI<"vudot", "u8", 0b0, 0b1, DPR, v2i32, v8i8,
+                    int_arm_neon_udot, (v2i32 DPR_VFP2:$Vm)>;
+defm VSDOTDI : DOTI<"vsdot", "s8", 0b0, 0b0, DPR, v2i32, v8i8,
+                    int_arm_neon_sdot, (v2i32 DPR_VFP2:$Vm)>;
+defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
+                    int_arm_neon_udot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
+                    int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
 
-}  // HasDotProd
 
 // ARMv8.3 complex operations
 class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
Index: llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll
===================================================================
--- llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll
+++ llvm/trunk/test/CodeGen/ARM/neon-dot-product.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple armv8a-none-linux-gnu -mattr=+dotprod -float-abi=hard < %s | FileCheck %s
+; Exercises llvm.arm.neon.udot/sdot, both directly and with a lane-1 splat operand.
+declare <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+declare <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>)
+declare <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>)
+
+define <2 x i32> @test_vdot_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:  ; unsigned, D-register operands: expects the non-indexed vudot
+; CHECK-LABEL: test_vdot_u32:
+; CHECK: vudot.u8        d0, d1, d2
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_u32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:  ; unsigned, Q-register operands: expects the non-indexed vudot
+; CHECK-LABEL: test_vdotq_u32:
+; CHECK: vudot.u8        q0, q1, q2
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:  ; signed, D-register operands: expects the non-indexed vsdot
+; CHECK-LABEL: test_vdot_s32:
+; CHECK: vsdot.s8        d0, d1, d2
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_s32(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:  ; signed, Q-register operands: expects the non-indexed vsdot
+; CHECK-LABEL: test_vdotq_s32:
+; CHECK: vsdot.s8        q0, q1, q2
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:  ; i32 lane 1 of %c is splatted; expects the indexed vudot on d2[1]
+; CHECK-LABEL: test_vdot_lane_u32:
+; CHECK: vudot.u8        d0, d1, d2[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.udot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_u32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:  ; i32 lane 1 of %c splatted to 4 lanes; expects indexed vudot on d4[1]
+; CHECK-LABEL: test_vdotq_lane_u32:
+; CHECK: vudot.u8        q0, q1, d4[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.udot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+  ret <4 x i32> %vdot1.i
+}
+
+define <2 x i32> @test_vdot_lane_s32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
+entry:  ; i32 lane 1 of %c is splatted; expects the indexed vsdot on d2[1]
+; CHECK-LABEL: test_vdot_lane_s32:
+; CHECK: vsdot.s8        d0, d1, d2[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <2 x i32> <i32 1, i32 1>
+  %.cast5 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vdot1.i = call <2 x i32> @llvm.arm.neon.sdot.v2i32.v8i8(<2 x i32> %a, <8 x i8> %b, <8 x i8> %.cast5) #2
+  ret <2 x i32> %vdot1.i
+}
+
+define <4 x i32> @test_vdotq_lane_s32(<4 x i32> %a, <16 x i8> %b, <8 x i8> %c) {
+entry:  ; i32 lane 1 of %c splatted to 4 lanes; expects indexed vsdot on d4[1]
+; CHECK-LABEL: test_vdotq_lane_s32:
+; CHECK: vsdot.s8        q0, q1, d4[1]
+  %.cast = bitcast <8 x i8> %c to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %.cast, <2 x i32> undef, <4 x i32> <i32 1, i32 1, i32 1, i32 1>
+  %.cast3 = bitcast <4 x i32> %shuffle to <16 x i8>
+  %vdot1.i = call <4 x i32> @llvm.arm.neon.sdot.v4i32.v16i8(<4 x i32> %a, <16 x i8> %b, <16 x i8> %.cast3) #2
+  ret <4 x i32> %vdot1.i
+}