Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5622,6 +5622,13 @@
                                          v2i32, v8i8, OpNode>;
   def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
                                          v4i32, v16i8, OpNode>;
+
+  // Match add($Rd, OpNode(all-zeros, $Rn, $Rm)) and select the dot-product
+  // instruction itself with $Rd as its accumulator operand.
+  def : Pat<(add (v2i32 V64:$Rd), (OpNode (v2i32 immAllZerosV), (v8i8 V64:$Rn), (v8i8 V64:$Rm))),
+            (!cast<Instruction>(NAME # "v8i8") $Rd, $Rn, $Rm)>;
+  def : Pat<(add (v4i32 V128:$Rd), (OpNode (v4i32 immAllZerosV), (v16i8 V128:$Rn), (v16i8 V128:$Rm))),
+            (!cast<Instruction>(NAME # "v16i8") $Rd, $Rn, $Rm)>;
 }
 
 // ARMv8.2-A Fused Multiply Add-Long Instructions (Vector): These instructions
Index: llvm/test/CodeGen/AArch64/neon-dot-product.ll
===================================================================
--- llvm/test/CodeGen/AArch64/neon-dot-product.ll
+++ llvm/test/CodeGen/AArch64/neon-dot-product.ll
@@ -42,6 +42,43 @@
   ret <4 x i32> %vdot1.i
 }
 
+
+define <2 x i32> @test_vdot_u32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_u32_zero:
+; CHECK: udot v0.2s, v1.8b, v2.8b
+  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
+  %ret = add <2 x i32> %vdot1.i, %a
+  ret <2 x i32> %ret
+}
+
+define <4 x i32> @test_vdotq_u32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_u32_zero:
+; CHECK: udot v0.4s, v1.16b, v2.16b
+  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
+  %ret = add <4 x i32> %vdot1.i, %a
+  ret <4 x i32> %ret
+}
+
+define <2 x i32> @test_vdot_s32_zero(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdot_s32_zero:
+; CHECK: sdot v0.2s, v1.8b, v2.8b
+  %vdot1.i = call <2 x i32> @llvm.aarch64.neon.sdot.v2i32.v8i8(<2 x i32> zeroinitializer, <8 x i8> %b, <8 x i8> %c) #2
+  %ret = add <2 x i32> %vdot1.i, %a
+  ret <2 x i32> %ret
+}
+
+define <4 x i32> @test_vdotq_s32_zero(<4 x i32> %a, <16 x i8> %b, <16 x i8> %c) #0 {
+entry:
+; CHECK-LABEL: test_vdotq_s32_zero:
+; CHECK: sdot v0.4s, v1.16b, v2.16b
+  %vdot1.i = call <4 x i32> @llvm.aarch64.neon.sdot.v4i32.v16i8(<4 x i32> zeroinitializer, <16 x i8> %b, <16 x i8> %c) #2
+  %ret = add <4 x i32> %vdot1.i, %a
+  ret <4 x i32> %ret
+}
+
 define <2 x i32> @test_vdot_lane_u32(<2 x i32> %a, <8 x i8> %b, <8 x i8> %c) {
 entry:
 ; CHECK-LABEL: test_vdot_lane_u32: