diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td
--- a/clang/include/clang/Basic/arm_neon.td
+++ b/clang/include/clang/Basic/arm_neon.td
@@ -221,6 +221,21 @@
 def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
                          (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
 
+def OP_USDOT_LN
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>;
+def OP_USDOT_LNQ
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>;
+
+// sudot splats the second vector and then calls vusdot
+def OP_SUDOT_LN
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
+def OP_SUDOT_LNQ
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//
@@ -1792,6 +1807,23 @@
   }
 }
 
+let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
+  def VMMLA   : SInst<"vmmla",   "..(<<)(<<)", "QUiQi">;
+  def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
+
+  def VUSDOT  : SInst<"vusdot",  "..(<<U)(<<)", "iQi">;
+
+  def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<<U)(<<q)I", "iQi", OP_USDOT_LN>;
+  def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<<qU)I", "iQi", OP_SUDOT_LN>;
+
+  let ArchGuard = "defined(__aarch64__)" in {
+    let isLaneQ = 1 in {
+      def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q)I", "iQi", OP_USDOT_LNQ>;
+      def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU)I", "iQi", OP_SUDOT_LNQ>;
+    }
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16  : SInst<"vcadd_rot90", "...", "h">;
@@ -1808,4 +1840,4 @@
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
-}
\ No newline at end of file
+}
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -36,6 +36,7 @@
   bool HasFP16FML;
   bool HasMTE;
   bool HasTME;
+  bool HasMatMul;
 
   llvm::AArch64::ArchKind ArchKind;
 
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -280,6 +280,9 @@
   if (HasTME)
     Builder.defineMacro("__ARM_FEATURE_TME", "1");
 
+  if (HasMatMul)
+    Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1");
+
   if ((FPU & NeonMode) && HasFP16FML)
     Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
 
@@ -356,6 +359,7 @@
   HasFP16FML = false;
   HasMTE = false;
   HasTME = false;
+  HasMatMul = false;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {
@@ -391,6 +395,8 @@
       HasMTE = true;
     if (Feature == "+tme")
       HasTME = true;
+    if (Feature == "+i8mm")
+      HasMatMul = true;
   }
 
   setDataLayout();
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -5009,6 +5009,7 @@
   NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0),
   NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0),
   NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0),
+  NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0),
   NEONMAP0(vmovl_v),
   NEONMAP0(vmovn_v),
   NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType),
@@ -5091,6 +5092,9 @@
   NEONMAP0(vsubhn_v),
   NEONMAP0(vtst_v),
   NEONMAP0(vtstq_v),
+  NEONMAP1(vusdot_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0),
+  NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0),
 };
 
 static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = {
@@ -6076,6 +6080,26 @@
     llvm::Type *Tys[2] = { Ty, InputTy };
     return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high");
   }
+  case NEON::BI__builtin_neon_vmmlaq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic;
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla");
+  }
+  case NEON::BI__builtin_neon_vusmmlaq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla");
+  }
+  case NEON::BI__builtin_neon_vusdot_v:
+  case NEON::BI__builtin_neon_vusdotq_v: {
+    llvm::Type *InputTy =
+        llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8);
+    llvm::Type *Tys[2] = { Ty, InputTy };
+    return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot");
+  }
   }
 
   assert(Int && "Expected valid intrinsic number");
diff --git a/clang/test/CodeGen/aarch64-matmul.cpp b/clang/test/CodeGen/aarch64-matmul.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-matmul.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+extern "C" void arm_feature_matmulint8_defined() {}
+#endif
+// CHECK: define void @arm_feature_matmulint8_defined()
+
+
diff --git a/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
@@ -0,0 +1,147 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg -sroa \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
+  return vmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vmmlaq_u32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
+  return vmmlaq_u32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_s32
+// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+// CHECK: ret <2 x i32> [[VAL]]
+int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) {
+  return vusdot_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusdot_lane_s32
+// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b
to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_lane_s32 +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> %0 to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> %1 to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { + return vsudot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdot_laneq_s32 +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { + return vusdot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_laneq_s32 +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { + return vsudot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdotq_lane_s32 +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> 
%r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { + return vusdotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_lane_s32 +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8> %b to <2 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <2 x i32> [[TMP0]] to <8 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <8 x i8> [[TMP1]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP2]], <2 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { + return vsudotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_laneq_s32 +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_laneq_s32 +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8> %b to <4 x i32> +// CHECK: [[TMP1:%.*]] = bitcast <4 x i32> [[TMP0]] to <16 x i8> +// CHECK: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP2]], <4 x i32> [[TMP2]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) { + return vsudotq_laneq_s32(r, a, b, 0); +} diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td --- a/llvm/include/llvm/IR/IntrinsicsAArch64.td +++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -173,6 +173,11 @@ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + + class AdvSIMD_MatMul_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; } // Arithmetic ops @@ -449,6 +454,12 @@ def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic; +// v8.6-A Matrix Multiply Intrinsics + def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; diff --git 
a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -373,6 +373,15 @@
 def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true",
     "Enable BFloat16 Extension" >;
 
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+    "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+    "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+    "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
 def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
     "true", "Enable fine grained virtualization traps extension">;
 
@@ -380,7 +389,6 @@
 SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
     "true", "Enable enhanced counter virtualization extension">;
 
-
 //===----------------------------------------------------------------------===//
 // Architectures.
 //
@@ -413,7 +421,7 @@
   "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
   [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
-  FeatureEnhancedCounterVirtualization]>;
+  FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
 
 //===----------------------------------------------------------------------===//
 // Register File Description
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -5550,11 +5550,11 @@
 // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
 // bytes from S-sized elements.
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
-                                 string kind2, RegisterOperand RegType,
-                                 ValueType AccumType, ValueType InputType,
-                                 SDPatternOperator OpNode> :
-        BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
+                                 string kind2, RegisterOperand RegType,
+                                 ValueType AccumType, ValueType InputType,
+                                 SDPatternOperator OpNode> :
+        BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
 
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
                                          v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+  def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
                                          v4i32, v16i8, OpNode>;
 }
 
@@ -7903,13 +7903,26 @@
   }
 } // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
 
+//----------------------------------------------------------------------------
+// Armv8.6 Matrix Multiply Extension
+//----------------------------------------------------------------------------
+
+class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
+  : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
+      [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+                                       (v16i8 V128:$Rn),
+                                       (v16i8 V128:$Rm)))]> {
+  let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+}
+
+//----------------------------------------------------------------------------
 // ARMv8.2-A Dot Product Instructions (Indexed)
-class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
-                                      string lhs_kind, string rhs_kind,
+class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
+                                      string dst_kind, string lhs_kind, string rhs_kind,
                                       RegisterOperand RegType,
                                       ValueType AccumType, ValueType InputType,
                                       SDPatternOperator OpNode> :
-        BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+        BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
 
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm, SDPatternOperator OpNode> {
-  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
+                                       SDPatternOperator OpNode> {
+  def v8i8  : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
                                               V64, v2i32, v8i8, OpNode>;
-  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+  def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
                                               V128, v4i32, v16i8,
OpNode>; } diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -146,6 +146,12 @@ AssemblerPredicate<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -745,10 +751,10 @@ // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>; } // ARMv8.6-A BFloat @@ -765,6 +771,40 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">; } +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element. 
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
+                         string rhs_kind, RegisterOperand RegType,
+                         ValueType AccumType, ValueType InputType>
+      : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot",
+                                        dst_kind, lhs_kind, rhs_kind,
+                                        RegType, AccumType, InputType, null_frag> {
+  let Pattern = [(set (AccumType RegType:$dst),
+                      (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+                                 (InputType (bitconvert (AccumType
+                                    (AArch64duplane32 (v4i32 V128:$Rm),
+                                        VectorIndexS:$idx)))),
+                                 (InputType RegType:$Rn))))];
+}
+
+multiclass SIMDSUDOTIndex {
+  def v8i8  : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
+  def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
+}
+
+defm SUDOTlane : SIMDSUDOTIndex;
+
+}
+
 // ARMv8.2-A FP16 Fused Multiply-Add Long
 let Predicates = [HasNEON, HasFP16FML] in {
 defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>;
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -148,6 +148,9 @@
   // Armv8.6-A Extensions
   bool HasBF16 = false;
+  bool HasMatMulInt8 = false;
+  bool HasMatMulFP32 = false;
+  bool HasMatMulFP64 = false;
   bool HasAMVS = false;
   bool HasFineGrainedTraps = false;
   bool HasEnhancedCounterVirtualization = false;
@@ -417,6 +420,9 @@
   bool hasSVE2SM4() const { return HasSVE2SM4; }
   bool hasSVE2SHA3() const { return HasSVE2SHA3; }
   bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
+  bool hasMatMulInt8() const { return HasMatMulInt8; }
+  bool hasMatMulFP32() const { return HasMatMulFP32; }
+  bool hasMatMulFP64() const { return HasMatMulFP64; }
 
   // Armv8.6-A Extensions
   bool hasBF16() const { return HasBF16; }
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matmul.ll b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-matmul.ll
@@ -0,0 +1,136 @@
+; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s
+
+define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: smmla.v4i32.v16i8
+; CHECK: smmla v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: ummla.v4i32.v16i8
+; CHECK: ummla v0.4s, v1.16b, v2.16b
+  %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+  ret <4 x i32> %vmmla1.i
+}
+
+define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) {
+entry:
+; CHECK-LABEL: usmmla.v4i32.v16i8
+; CHECK: usmmla v0.4s, v1.16b, v2.16b
+  %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3
+  ret <4 x i32> %vusmmla1.i
+}
+
+define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot.v2i32.v8i8
+; CHECK: usdot v0.2s, v1.8b, v2.8b
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b)
+  ret <2 x i32> %vusdot1.i
+}
+
+define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) {
+entry:
+; CHECK-LABEL: usdot_lane.v2i32.v8i8
+; CHECK: usdot v0.2s, v1.8b, v2.4b[0]
+  %0 = bitcast <8 x i8> %b to <2 x i32>
+  %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer
+  %1 = bitcast <2 x i32> %shuffle to <8 x i8>
+  %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1)
+  ret <2 x i32>
%vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v8i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v16i8 +; CHECK: usdot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v16i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3 + ret <2 x i32> %vusdot1.i +} + +define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.16b + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_laneq.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_laneq.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> 
zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 + diff --git a/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s b/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s new file mode 100644 --- /dev/null +++ b/llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s @@ -0,0 +1,34 @@ +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s + +// No interesting edge cases for [US]MMLA, except for the fact that the data +// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not. +smmla v1.2s, v16.8b, v31.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +summla v1.4s, v16.16b, v31.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla? + +// USDOT (vector) has two valid data type combinations, others are rejected. +usdot v3.4s, v15.8b, v30.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +usdot v3.2s, v15.16b, v30.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types) +usdot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +usdot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. + +// The arrangement specifiers of the first two operands must match. 
+usdot v31.4s, v1.8b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+usdot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.4s, v1.8b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
+sudot v31.2s, v1.16b, v2.4b[0]
+// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction
diff --git a/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s b/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/AArch64/armv8.6a-simd-matmul.s
@@ -0,0 +1,43 @@
+// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s | FileCheck %s
+// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s
+// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a,-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL
+
+smmla v1.4s, v16.16b, v31.16b
+ummla v1.4s, v16.16b, v31.16b
+usmmla v1.4s, v16.16b, v31.16b
+// CHECK: smmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e]
+// CHECK: ummla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e]
+// CHECK: usmmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: smmla v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: ummla v1.4s, v16.16b, v31.16b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usmmla v1.4s, v16.16b, v31.16b
+
+usdot v3.2s, v15.8b, v30.8b
+usdot v3.4s, v15.16b, v30.16b
+// CHECK: usdot v3.2s, v15.8b, v30.8b // encoding: [0xe3,0x9d,0x9e,0x0e]
+// CHECK: usdot v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b
+
+usdot v31.2s, v1.8b, v2.4b[3]
+usdot v31.4s, v1.16b, v2.4b[3]
+// CHECK: usdot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f]
+// CHECK: usdot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: usdot v31.4s, v1.16b, v2.4b[3]
+
+sudot v31.2s, v1.8b, v2.4b[3]
+sudot v31.4s, v1.16b, v2.4b[3]
+// CHECK: sudot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f]
+// CHECK: sudot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot v31.2s, v1.8b, v2.4b[3]
+// NOMATMUL: instruction requires: i8mm
+// NOMATMUL-NEXT: sudot v31.4s, v1.16b, v2.4b[3]
diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt b/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
new file mode 100644
--- /dev/null
+++ b/llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt
@@ -0,0 +1,34 @@
+# RUN: llvm-mc -triple=aarch64 -mattr=+i8mm -disassemble < %s | FileCheck %s
+# RUN: llvm-mc -triple=aarch64 -mattr=+v8.6a -disassemble < %s | FileCheck %s
+# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM
+
+[0x01,0xa6,0x9f,0x4e]
+[0x01,0xa6,0x9f,0x6e]
+[0x01,0xae,0x9f,0x4e]
+# CHECK: smmla v1.4s, v16.16b, v31.16b
+# CHECK: ummla v1.4s, v16.16b, v31.16b
+# CHECK: usmmla v1.4s, v16.16b, v31.16b
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding
+# 
NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xe3,0x9d,0x9e,0x0e] +[0xe3,0x9d,0x9e,0x4e] +# CHECK: usdot v3.2s, v15.8b, v30.8b +# CHECK: usdot v3.4s, v15.16b, v30.16b +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0xa2,0x0f] +[0x3f,0xf8,0xa2,0x4f] +# CHECK: usdot v31.2s, v1.8b, v2.4b[3] +# CHECK: usdot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0x22,0x0f] +[0x3f,0xf8,0x22,0x4f] +# CHECK: sudot v31.2s, v1.8b, v2.4b[3] +# CHECK: sudot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding
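
For reference, a minimal usage sketch of the intrinsics this patch exposes, mirroring the CodeGen tests above. It is not part of the patch; the function names and the -march=armv8.6-a+i8mm build flag are illustrative assumptions.

// Assumed build line: clang -O2 -march=armv8.6-a+i8mm -c i8mm_example.c
#include <arm_neon.h>

#ifdef __ARM_FEATURE_MATMUL_INT8
// USDOT: for each 32-bit lane, accumulate the dot product of four unsigned
// bytes from 'a' with four signed bytes from 'b'.
int32x4_t usdot_accumulate(int32x4_t acc, uint8x16_t a, int8x16_t b) {
  return vusdotq_s32(acc, a, b);
}

// SMMLA: multiply a 2x8 by an 8x2 signed 8-bit matrix and accumulate into the
// 2x2 matrix of 32-bit sums held in 'acc'.
int32x4_t smmla_accumulate(int32x4_t acc, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(acc, a, b);
}
#endif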