Index: clang/include/clang/Basic/arm_neon.td
===================================================================
--- clang/include/clang/Basic/arm_neon.td
+++ clang/include/clang/Basic/arm_neon.td
@@ -221,6 +221,21 @@
 def OP_FMLSL_LN_Hi : Op<(call "vfmlsl_high", $p0, $p1,
                          (dup_typed $p1, (call "vget_lane", $p2, $p3)))>;
+def OP_USDOT_LN
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)))>;
+def OP_USDOT_LNQ
+    : Op<(call "vusdot", $p0, $p1,
+          (cast "8", "S", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)))>;
+
+// sudot splats the second vector and then calls vusdot
+def OP_SUDOT_LN
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x2_t", $p2), $p3)), $p1)>;
+def OP_SUDOT_LNQ
+    : Op<(call "vusdot", $p0,
+          (cast "8", "U", (call_mangled "splat_lane", (bitcast "int32x4_t", $p2), $p3)), $p1)>;
+
 //===----------------------------------------------------------------------===//
 // Auxiliary Instructions
 //===----------------------------------------------------------------------===//
@@ -1792,6 +1807,23 @@
   }
 }
 
+let ArchGuard = "defined(__ARM_FEATURE_MATMUL_INT8)" in {
+  def VMMLA   : SInst<"vmmla", "..(<<)(<<)", "QUiQi">;
+  def VUSMMLA : SInst<"vusmmla", "..(<<U)(<<)", "Qi">;
+
+  def VUSDOT  : SInst<"vusdot", "..(<<U)(<<)", "iQi">;
+
+  def VUSDOT_LANE : SOpInst<"vusdot_lane", "..(<<U)(<<q).I", "iQi", OP_USDOT_LN>;
+  def VSUDOT_LANE : SOpInst<"vsudot_lane", "..(<<)(<<qU).I", "iQi", OP_SUDOT_LN>;
+
+  let ArchGuard = "defined(__aarch64__)" in {
+    let isLaneQ = 1 in {
+      def VUSDOT_LANEQ : SOpInst<"vusdot_laneq", "..(<<U)(<<Q).I", "iQi", OP_USDOT_LNQ>;
+      def VSUDOT_LANEQ : SOpInst<"vsudot_laneq", "..(<<)(<<QU).I", "iQi", OP_SUDOT_LNQ>;
+    }
+  }
+}
+
 // v8.3-A Vector complex addition intrinsics
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC)" in {
   def VCADD_ROT90_FP16   : SInst<"vcadd_rot90", "...", "h">;
@@ -1808,4 +1840,4 @@
 let ArchGuard = "defined(__ARM_FEATURE_COMPLEX) && defined(__aarch64__)" in {
   def VCADDQ_ROT90_FP64  : SInst<"vcaddq_rot90", "QQQ", "d">;
   def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">;
-}
\ No newline at end of file
+}
Index: clang/lib/Basic/Targets/AArch64.h
===================================================================
--- clang/lib/Basic/Targets/AArch64.h
+++ clang/lib/Basic/Targets/AArch64.h
@@ -36,6 +36,7 @@
   bool HasFP16FML;
   bool HasMTE;
   bool HasTME;
+  bool HasMatMul;
 
   llvm::AArch64::ArchKind ArchKind;
Index: clang/lib/Basic/Targets/AArch64.cpp
===================================================================
--- clang/lib/Basic/Targets/AArch64.cpp
+++ clang/lib/Basic/Targets/AArch64.cpp
@@ -280,6 +280,9 @@
   if (HasTME)
     Builder.defineMacro("__ARM_FEATURE_TME", "1");
 
+  if (HasMatMul)
+    Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1");
+
   if ((FPU & NeonMode) && HasFP16FML)
     Builder.defineMacro("__ARM_FEATURE_FP16FML", "1");
 
@@ -356,6 +359,7 @@
   HasFP16FML = false;
   HasMTE = false;
   HasTME = false;
+  HasMatMul = false;
   ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   for (const auto &Feature : Features) {
@@ -391,6 +395,8 @@
       HasMTE = true;
     if (Feature == "+tme")
       HasTME = true;
+    if (Feature == "+i8mm")
+      HasMatMul = true;
   }
 
   setDataLayout();
Index: clang/lib/Basic/Targets/ARM.h
===================================================================
--- clang/lib/Basic/Targets/ARM.h
+++ clang/lib/Basic/Targets/ARM.h
@@ -75,6 +75,7 @@
   unsigned DSP : 1;
   unsigned Unaligned : 1;
   unsigned DotProd : 1;
+  unsigned HasMatMul : 1;
 
   enum {
     LDREX_B = (1 << 0), /// byte (8-bit)
Index: clang/lib/Basic/Targets/ARM.cpp
=================================================================== --- clang/lib/Basic/Targets/ARM.cpp +++ clang/lib/Basic/Targets/ARM.cpp @@ -425,6 +425,7 @@ // Note that SoftFloatABI is initialized in our constructor. HWDiv = 0; DotProd = 0; + HasMatMul = 0; HasFloat16 = true; ARMCDECoprocMask = 0; @@ -491,6 +492,8 @@ FPU |= FPARMV8; MVE |= MVE_INT | MVE_FP; HW_FP |= HW_FP_SP | HW_FP_HP; + } else if (Feature == "+i8mm") { + HasMatMul = 1; } else if (Feature.size() == strlen("+cdecp0") && Feature >= "+cdecp0" && Feature <= "+cdecp7") { unsigned Coproc = Feature.back() - '0'; @@ -820,6 +823,9 @@ if (DotProd) Builder.defineMacro("__ARM_FEATURE_DOTPROD", "1"); + if (HasMatMul) + Builder.defineMacro("__ARM_FEATURE_MATMUL_INT8", "1"); + switch (ArchKind) { default: break; Index: clang/lib/CodeGen/CGBuiltin.cpp =================================================================== --- clang/lib/CodeGen/CGBuiltin.cpp +++ clang/lib/CodeGen/CGBuiltin.cpp @@ -4781,6 +4781,7 @@ NEONMAP1(vminnm_v, arm_neon_vminnm, Add1ArgType), NEONMAP1(vminnmq_v, arm_neon_vminnm, Add1ArgType), NEONMAP2(vminq_v, arm_neon_vminu, arm_neon_vmins, Add1ArgType | UnsignedAlts), + NEONMAP2(vmmlaq_v, arm_neon_ummla, arm_neon_smmla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, arm_neon_vmulp, Add1ArgType), @@ -4888,6 +4889,9 @@ NEONMAP0(vtrnq_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), + NEONMAP1(vusdot_v, arm_neon_usdot, 0), + NEONMAP1(vusdotq_v, arm_neon_usdot, 0), + NEONMAP1(vusmmlaq_v, arm_neon_usmmla, 0), NEONMAP0(vuzp_v), NEONMAP0(vuzpq_v), NEONMAP0(vzip_v), @@ -4983,6 +4987,7 @@ NEONMAP1(vld1q_x2_v, aarch64_neon_ld1x2, 0), NEONMAP1(vld1q_x3_v, aarch64_neon_ld1x3, 0), NEONMAP1(vld1q_x4_v, aarch64_neon_ld1x4, 0), + NEONMAP2(vmmlaq_v, aarch64_neon_ummla, aarch64_neon_smmla, 0), NEONMAP0(vmovl_v), NEONMAP0(vmovn_v), NEONMAP1(vmul_v, aarch64_neon_pmul, Add1ArgType), @@ -5065,6 +5070,9 @@ NEONMAP0(vsubhn_v), NEONMAP0(vtst_v), NEONMAP0(vtstq_v), + NEONMAP1(vusdot_v, aarch64_neon_usdot, 0), + NEONMAP1(vusdotq_v, aarch64_neon_usdot, 0), + NEONMAP1(vusmmlaq_v, aarch64_neon_usmmla, 0), }; static const ARMVectorIntrinsicInfo AArch64SISDIntrinsicMap[] = { @@ -6047,6 +6055,26 @@ llvm::Type *Tys[2] = { Ty, InputTy }; return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vfmlsl_high"); } + case NEON::BI__builtin_neon_vmmlaq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + Int = Usgn ? LLVMIntrinsic : AltLLVMIntrinsic; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vmmla"); + } + case NEON::BI__builtin_neon_vusmmlaq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusmmla"); + } + case NEON::BI__builtin_neon_vusdot_v: + case NEON::BI__builtin_neon_vusdotq_v: { + llvm::Type *InputTy = + llvm::VectorType::get(Int8Ty, Ty->getPrimitiveSizeInBits() / 8); + llvm::Type *Tys[2] = { Ty, InputTy }; + return EmitNeonCall(CGM.getIntrinsic(Int, Tys), Ops, "vusdot"); + } } assert(Int && "Expected valid intrinsic number"); Index: clang/lib/Driver/ToolChains/Arch/AArch64.cpp =================================================================== --- clang/lib/Driver/ToolChains/Arch/AArch64.cpp +++ clang/lib/Driver/ToolChains/Arch/AArch64.cpp @@ -54,7 +54,8 @@ // Decode AArch64 features from string like +[no]featureA+[no]featureB+... 
 static bool DecodeAArch64Features(const Driver &D, StringRef text,
-                                  std::vector<StringRef> &Features) {
+                                  std::vector<StringRef> &Features,
+                                  llvm::AArch64::ArchKind ArchKind) {
   SmallVector<StringRef, 8> Split;
   text.split(Split, StringRef("+"), -1, false);
 
@@ -66,6 +67,10 @@
       D.Diag(clang::diag::err_drv_no_neon_modifier);
     else
       return false;
+
+    // +sve implies +f32mm if the base architecture is v8.6A
+    if ((ArchKind == llvm::AArch64::ArchKind::ARMV8_6A) && Feature == "sve")
+      Features.push_back("+f32mm");
   }
 
   return true;
 }
@@ -76,6 +81,7 @@
                                  std::vector<StringRef> &Features) {
   std::pair<StringRef, StringRef> Split = Mcpu.split("+");
   CPU = Split.first;
+  llvm::AArch64::ArchKind ArchKind = llvm::AArch64::ArchKind::ARMV8A;
 
   if (CPU == "native")
     CPU = llvm::sys::getHostCPUName();
@@ -83,7 +89,7 @@
   if (CPU == "generic") {
     Features.push_back("+neon");
   } else {
-    llvm::AArch64::ArchKind ArchKind = llvm::AArch64::parseCPUArch(CPU);
+    ArchKind = llvm::AArch64::parseCPUArch(CPU);
     if (!llvm::AArch64::getArchFeatures(ArchKind, Features))
       return false;
 
@@ -92,10 +98,11 @@
       return false;
   }
 
-  if (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features))
-    return false;
+  if (Split.second.size() &&
+      !DecodeAArch64Features(D, Split.second, Features, ArchKind))
+    return false;
 
-  return true;
+  return true;
 }
 
 static bool
@@ -108,7 +115,8 @@
   llvm::AArch64::ArchKind ArchKind = llvm::AArch64::parseArch(Split.first);
   if (ArchKind == llvm::AArch64::ArchKind::INVALID ||
       !llvm::AArch64::getArchFeatures(ArchKind, Features) ||
-      (Split.second.size() && !DecodeAArch64Features(D, Split.second, Features)))
+      (Split.second.size() &&
+       !DecodeAArch64Features(D, Split.second, Features, ArchKind)))
     return false;
 
   return true;
Index: clang/test/CodeGen/aarch64-matmul.cpp
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-matmul.cpp
@@ -0,0 +1,8 @@
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +neon -target-feature +i8mm -S -emit-llvm %s -o - | FileCheck %s
+
+#ifdef __ARM_FEATURE_MATMUL_INT8
+extern "C" void arm_feature_matmulint8_defined() {}
+#endif
+// CHECK: define void @arm_feature_matmulint8_defined()
+
+
Index: clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
===================================================================
--- /dev/null
+++ clang/test/CodeGen/aarch64-v8.6a-neon-intrinsics.c
@@ -0,0 +1,171 @@
+// RUN: %clang_cc1 -triple arm64-none-linux-gnu -target-feature +neon -target-feature +fullfp16 -target-feature +v8.6a -target-feature +i8mm \
+// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \
+// RUN: | opt -S -mem2reg \
+// RUN: | FileCheck %s
+
+// REQUIRES: aarch64-registered-target
+
+#include <arm_neon.h>
+
+// CHECK-LABEL: test_vmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) {
+  return vmmlaq_s32(r, a, b);
+}
+
+// CHECK-LABEL: test_vmmlaq_u32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) {
+  return vmmlaq_u32(r, a, b);
+}
+
+// CHECK-LABEL: test_vusmmlaq_s32
+// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b)
+// CHECK: ret <4 x i32> [[VAL]]
+int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) {
+  return vusmmlaq_s32(r, a, b);
+} + +// CHECK-LABEL: test_vusdot_s32 +// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) +// CHECK: ret <2 x i32> [[VAL]] +int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %1 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> %2 to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> %3, <2 x i32> %3, <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { + return vsudot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdot_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_laneq_s32(int32x2_t r, uint8x8_t a, int8x16_t b) { + return vusdot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> 
%r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_laneq_s32(int32x2_t r, int8x8_t a, uint8x16_t b) { + return vsudot_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vusdotq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { + return vusdotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { + return vsudotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_laneq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusdotq_laneq_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_laneq_s32 +// CHECK: [[REINT:%.*]] = alloca <16 x i8> +// CHECK: store <16 x i8> %b, <16 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <16 x i8>* [[REINT]] to <4 x i32>* +// CHECK: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]] +// 
CHECK: [[TMP2:%.*]] = bitcast <4 x i32> [[TMP1]] to <16 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <4 x i32> [[TMP3]], <4 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_laneq_s32(int32x4_t r, int8x16_t a, uint8x16_t b) { + return vsudotq_laneq_s32(r, a, b, 0); +} Index: clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c =================================================================== --- /dev/null +++ clang/test/CodeGen/arm-v8.6a-neon-intrinsics.c @@ -0,0 +1,100 @@ +// RUN: %clang_cc1 -triple armv8.6a-arm-none-eabi -target-feature +neon -target-feature +fullfp16 -target-feature +i8mm \ +// RUN: -fallow-half-arguments-and-returns -S -disable-O0-optnone -emit-llvm -o - %s \ +// RUN: | opt -S -mem2reg \ +// RUN: | FileCheck %s + +// REQUIRES: arm-registered-target + +#include + +// CHECK-LABEL: test_vmmlaq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vmmlaq_s32(int32x4_t r, int8x16_t a, int8x16_t b) { + return vmmlaq_s32(r, a, b); +} + +// CHECK-LABEL: test_vmmlaq_u32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +uint32x4_t test_vmmlaq_u32(uint32x4_t r, uint8x16_t a, uint8x16_t b) { + return vmmlaq_u32(r, a, b); +} + +// CHECK-LABEL: test_vusmmlaq_s32 +// CHECK: [[VAL:%.*]] = call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) +// CHECK: ret <4 x i32> [[VAL]] +int32x4_t test_vusmmlaq_s32(int32x4_t r, uint8x16_t a, int8x16_t b) { + return vusmmlaq_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_s32 +// CHECK: [[VAL:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) +// CHECK: ret <2 x i32> [[VAL]] +int32x2_t test_vusdot_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_s32(r, a, b); +} + +// CHECK-LABEL: test_vusdot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> [[TMP4]]) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vusdot_lane_s32(int32x2_t r, uint8x8_t a, int8x8_t b) { + return vusdot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudot_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> 
[[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <2 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <2 x i32> [[LANE]] to <8 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <2 x i32> %r to <8 x i8> +// CHECK: [[OP:%.*]] = call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> [[TMP4]], <8 x i8> %a) +// CHECK: ret <2 x i32> [[OP]] +int32x2_t test_vsudot_lane_s32(int32x2_t r, int8x8_t a, uint8x8_t b) { + return vsudot_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vusdotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> [[TMP1]] to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> [[TMP2]] to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> [[TMP3]], <2 x i32> [[TMP3]], <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> [[TMP4]]) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vusdotq_lane_s32(int32x4_t r, uint8x16_t a, int8x8_t b) { + return vusdotq_lane_s32(r, a, b, 0); +} + +// CHECK-LABEL: test_vsudotq_lane_s32 +// CHECK: [[REINT:%.*]] = alloca <8 x i8> +// CHECK: store <8 x i8> %b, <8 x i8>* [[REINT]] +// CHECK: [[TMP0:%.*]] = bitcast <8 x i8>* [[REINT]] to <2 x i32>* +// CHECK: [[TMP1:%.*]] = load <2 x i32>, <2 x i32>* [[TMP0]] +// CHECK: [[TMP2:%.*]] = bitcast <2 x i32> %1 to <8 x i8> +// CHECK: [[TMP3:%.*]] = bitcast <8 x i8> %2 to <2 x i32> +// CHECK: [[LANE:%.*]] = shufflevector <2 x i32> %3, <2 x i32> %3, <4 x i32> zeroinitializer +// CHECK: [[TMP4:%.*]] = bitcast <4 x i32> [[LANE]] to <16 x i8> +// CHECK: [[TMP5:%.*]] = bitcast <4 x i32> %r to <16 x i8> +// CHECK: [[OP:%.*]] = call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> [[TMP4]], <16 x i8> %a) +// CHECK: ret <4 x i32> [[OP]] +int32x4_t test_vsudotq_lane_s32(int32x4_t r, int8x16_t a, uint8x8_t b) { + return vsudotq_lane_s32(r, a, b, 0); +} Index: clang/test/Driver/aarch64-cpus.c =================================================================== --- clang/test/Driver/aarch64-cpus.c +++ clang/test/Driver/aarch64-cpus.c @@ -636,6 +636,34 @@ // RUN: %clang -target aarch64 -march=armv8.5a+bf16+sve -### -c %s 2>&1 | FileCheck -check-prefixes=GENERICV85A-BF16-SVE %s // GENERICV85A-BF16-SVE: "-target-feature" "+bf16" "-target-feature" "+sve" +// The 8-bit integer matrix multiply extension is a mandatory component of the +// Armv8.6-A extensions, but is permitted as an optional feature for any +// implementation of Armv8.2-A to Armv8.5-A (inclusive) +// RUN: %clang -target aarch64 -march=armv8.5a -### -c %s 2>&1 | FileCheck -check-prefix=NO-I8MM %s +// RUN: %clang -target aarch64 -march=armv8.5a+i8mm -### -c %s 2>&1 | FileCheck -check-prefix=I8MM %s +// NO-I8MM-NOT: "-target-feature" "+i8mm" +// I8MM: "-target-feature" "+i8mm" + +// The 32-bit floating point matrix multiply extension is enabled by default +// for armv8.6-a targets (or later) with SVE, and can optionally be enabled for +// any target from armv8.2a onwards (we don't enforce not using it with earlier +// targets). 
+// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=NO-F32MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s +// RUN: %clang -target aarch64 -march=armv8.5a+f32mm -### -c %s 2>&1 | FileCheck -check-prefix=F32MM %s +// NO-F32MM-NOT: "-target-feature" "+f32mm" +// F32MM: "-target-feature" "+f32mm" + +// The 64-bit floating point matrix multiply extension is not currently enabled +// by default for any targets, because it requires an SVE vector length >= 256 +// bits. When we add a CPU which has that, then it can be enabled by default, +// but for now it can only be used by adding the +f64mm feature. +// RUN: %clang -target aarch64 -march=armv8.6a -### -c %s 2>&1 | FileCheck -check-prefix=NO-F64MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+sve -### -c %s 2>&1 | FileCheck -check-prefix=NO-F64MM %s +// RUN: %clang -target aarch64 -march=armv8.6a+f64mm -### -c %s 2>&1 | FileCheck -check-prefix=F64MM %s +// NO-F64MM-NOT: "-target-feature" "+f64mm" +// F64MM: "-target-feature" "+f64mm" + // fullfp16 is off by default for v8a, feature must not be mentioned // RUN: %clang -target aarch64 -march=armv8a -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s // RUN: %clang -target aarch64 -march=armv8-a -### -c %s 2>&1 | FileCheck -check-prefix=V82ANOFP16 -check-prefix=GENERIC %s Index: clang/test/Driver/arm-matrix-multiply.c =================================================================== --- /dev/null +++ clang/test/Driver/arm-matrix-multiply.c @@ -0,0 +1,14 @@ +// RUN: %clang -### -target arm-none-none-eabi -march=armv8.5a+i8mm %s 2>&1 | FileCheck %s +// RUN: %clang -### -target aarch64-none-none-eabi -march=armv8.5a+i8mm %s 2>&1 | FileCheck %s +// CHECK: "-target-feature" "+i8mm" +// CHECK-NOT: "-target-feature" "-i8mm" + +// RUN: %clang -### -target arm-none-none-eabi -march=armv8.6a+noi8mm %s 2>&1 | FileCheck %s --check-prefix=NOI8MM +// RUN: %clang -### -target aarch64-none-none-eabi -march=armv8.6a+noi8mm %s 2>&1 | FileCheck %s --check-prefix=NOI8MM +// NOI8MM: "-target-feature" "-i8mm" +// NOI8MM-NOT: "-target-feature" "+i8mm" + +// RUN: %clang -### -target arm-none-none-eabi %s 2>&1 | FileCheck %s --check-prefix=ABSENT +// RUN: %clang -### -target aarch64-none-none-eabi %s 2>&1 | FileCheck %s --check-prefix=ABSENT +// ABSENT-NOT: "-target-feature" "+i8mm" +// ABSENT-NOT: "-target-feature" "-i8mm" Index: llvm/include/llvm/IR/IntrinsicsAArch64.td =================================================================== --- llvm/include/llvm/IR/IntrinsicsAArch64.td +++ llvm/include/llvm/IR/IntrinsicsAArch64.td @@ -173,6 +173,11 @@ : Intrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], [IntrNoMem]>; + + class AdvSIMD_MatMul_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, LLVMMatchType<1>], + [IntrNoMem]>; } // Arithmetic ops @@ -449,6 +454,12 @@ def int_aarch64_neon_udot : AdvSIMD_Dot_Intrinsic; def int_aarch64_neon_sdot : AdvSIMD_Dot_Intrinsic; +// v8.6-A Matrix Multiply Intrinsics + def int_aarch64_neon_ummla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_smmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usmmla : AdvSIMD_MatMul_Intrinsic; + def int_aarch64_neon_usdot : AdvSIMD_Dot_Intrinsic; + // v8.2-A FP16 Fused Multiply-Add Long def int_aarch64_neon_fmlal : AdvSIMD_FP16FML_Intrinsic; def int_aarch64_neon_fmlsl : AdvSIMD_FP16FML_Intrinsic; Index: llvm/include/llvm/IR/IntrinsicsARM.td 
=================================================================== --- llvm/include/llvm/IR/IntrinsicsARM.td +++ llvm/include/llvm/IR/IntrinsicsARM.td @@ -773,6 +773,19 @@ def int_arm_neon_udot : Neon_Dot_Intrinsic; def int_arm_neon_sdot : Neon_Dot_Intrinsic; +// v8.6-A Matrix Multiply Intrinsics +class Neon_MatMul_Intrinsic + : Intrinsic<[llvm_anyvector_ty], + [LLVMMatchType<0>, llvm_anyvector_ty, + LLVMMatchType<1>], + [IntrNoMem]>; +def int_arm_neon_ummla : Neon_MatMul_Intrinsic; +def int_arm_neon_smmla : Neon_MatMul_Intrinsic; +def int_arm_neon_usmmla : Neon_MatMul_Intrinsic; +def int_arm_neon_usdot : Neon_Dot_Intrinsic; + +// v8.6-A Bfloat Intrinsics + def int_arm_cls: Intrinsic<[llvm_i32_ty], [llvm_i32_ty], [IntrNoMem]>; def int_arm_cls64: Intrinsic<[llvm_i32_ty], [llvm_i64_ty], [IntrNoMem]>; Index: llvm/include/llvm/Support/AArch64TargetParser.h =================================================================== --- llvm/include/llvm/Support/AArch64TargetParser.h +++ llvm/include/llvm/Support/AArch64TargetParser.h @@ -24,7 +24,7 @@ namespace AArch64 { // Arch extension modifiers for CPUs. -enum ArchExtKind : unsigned { +enum ArchExtKind : uint64_t { AEK_INVALID = 0, AEK_NONE = 1, AEK_CRC = 1 << 1, @@ -57,6 +57,8 @@ AEK_TME = 1 << 28, AEK_BF16 = 1 << 29, AEK_I8MM = 1 << 30, + AEK_F32MM = 1ULL << 31, + AEK_F64MM = 1ULL << 32, }; enum class ArchKind { Index: llvm/include/llvm/Support/AArch64TargetParser.def =================================================================== --- llvm/include/llvm/Support/AArch64TargetParser.def +++ llvm/include/llvm/Support/AArch64TargetParser.def @@ -88,6 +88,8 @@ AARCH64_ARCH_EXT_NAME("predres", AArch64::AEK_PREDRES, "+predres", "-predres") AARCH64_ARCH_EXT_NAME("bf16", AArch64::AEK_BF16, "+bf16", "-bf16") AARCH64_ARCH_EXT_NAME("i8mm", AArch64::AEK_I8MM, "+i8mm", "-i8mm") +AARCH64_ARCH_EXT_NAME("f32mm", AArch64::AEK_F32MM, "+f32mm", "-f32mm") +AARCH64_ARCH_EXT_NAME("f64mm", AArch64::AEK_F64MM, "+f64mm", "-f64mm") AARCH64_ARCH_EXT_NAME("tme", AArch64::AEK_TME, "+tme", "-tme") #undef AARCH64_ARCH_EXT_NAME Index: llvm/include/llvm/Support/ARMTargetParser.h =================================================================== --- llvm/include/llvm/Support/ARMTargetParser.h +++ llvm/include/llvm/Support/ARMTargetParser.h @@ -47,14 +47,15 @@ AEK_FP_DP = 1 << 18, AEK_LOB = 1 << 19, AEK_BF16 = 1 << 20, - AEK_CDECP0 = 1 << 21, - AEK_CDECP1 = 1 << 22, - AEK_CDECP2 = 1 << 23, - AEK_CDECP3 = 1 << 24, - AEK_CDECP4 = 1 << 25, - AEK_CDECP5 = 1 << 26, - AEK_CDECP6 = 1 << 27, - AEK_CDECP7 = 1 << 28, + AEK_I8MM = 1 << 21, + AEK_CDECP0 = 1 << 22, + AEK_CDECP1 = 1 << 23, + AEK_CDECP2 = 1 << 24, + AEK_CDECP3 = 1 << 25, + AEK_CDECP4 = 1 << 26, + AEK_CDECP5 = 1 << 27, + AEK_CDECP6 = 1 << 28, + AEK_CDECP7 = 1 << 29, // Unsupported extensions. 
AEK_OS = 1ULL << 59, Index: llvm/include/llvm/Support/ARMTargetParser.def =================================================================== --- llvm/include/llvm/Support/ARMTargetParser.def +++ llvm/include/llvm/Support/ARMTargetParser.def @@ -116,7 +116,8 @@ ARMBuildAttrs::CPUArch::v8_A, FK_CRYPTO_NEON_FP_ARMV8, (ARM::AEK_SEC | ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | ARM::AEK_DSP | ARM::AEK_CRC | ARM::AEK_RAS | - ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES)) + ARM::AEK_DOTPROD | ARM::AEK_BF16 | ARM::AEK_SHA2 | ARM::AEK_AES | + ARM::AEK_I8MM)) ARM_ARCH("armv8-r", ARMV8R, "8-R", "v8r", ARMBuildAttrs::CPUArch::v8_R, FK_NEON_FP_ARMV8, (ARM::AEK_MP | ARM::AEK_VIRT | ARM::AEK_HWDIVARM | ARM::AEK_HWDIVTHUMB | @@ -171,6 +172,7 @@ ARM_ARCH_EXT_NAME("fp16fml", ARM::AEK_FP16FML, "+fp16fml", "-fp16fml") ARM_ARCH_EXT_NAME("bf16", ARM::AEK_BF16, "+bf16", "-bf16") ARM_ARCH_EXT_NAME("sb", ARM::AEK_SB, "+sb", "-sb") +ARM_ARCH_EXT_NAME("i8mm", ARM::AEK_I8MM, "+i8mm", "-i8mm") ARM_ARCH_EXT_NAME("lob", ARM::AEK_LOB, "+lob", "-lob") ARM_ARCH_EXT_NAME("cdecp0", ARM::AEK_CDECP0, "+cdecp0", "-cdecp0") ARM_ARCH_EXT_NAME("cdecp1", ARM::AEK_CDECP1, "+cdecp1", "-cdecp1") Index: llvm/lib/Target/AArch64/AArch64.td =================================================================== --- llvm/lib/Target/AArch64/AArch64.td +++ llvm/lib/Target/AArch64/AArch64.td @@ -373,6 +373,15 @@ def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true", "Enable BFloat16 Extension" >; +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension">; + +def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32", + "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>; + +def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64", + "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>; + def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps", "true", "Enable fine grained virtualization traps extension">; @@ -380,7 +389,6 @@ SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization", "true", "Enable enhanced counter virtualization extension">; - //===----------------------------------------------------------------------===// // Architectures. 
// @@ -413,7 +421,7 @@ "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps, - FeatureEnhancedCounterVirtualization]>; + FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>; //===----------------------------------------------------------------------===// // Register File Description Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrFormats.td +++ llvm/lib/Target/AArch64/AArch64InstrFormats.td @@ -498,6 +498,7 @@ def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>; def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>; def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>; +def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>; def simm4s1 : Operand, ImmLeaf=-8 && Imm <= 7; }]> { @@ -531,6 +532,12 @@ let ParserMatchClass = SImm4s16Operand; let DecoderMethod = "DecodeSImm<4>"; } +def simm4s32 : Operand, ImmLeaf=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> { + let PrintMethod = "printImmScale<32>"; + let ParserMatchClass = SImm4s32Operand; + let DecoderMethod = "DecodeSImm<4>"; +} def Imm1_8Operand : AsmImmRange<1, 8>; def Imm1_16Operand : AsmImmRange<1, 16>; @@ -5537,11 +5544,11 @@ // ARMv8.2-A Dot Product Instructions (Vector): These instructions extract // bytes from S-sized elements. -class BaseSIMDThreeSameVectorDot : - BaseSIMDThreeSameVectorTied { - def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64, +multiclass SIMDThreeSameVectorDot { + def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128, + def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128, v4i32, v16i8, OpNode>; } @@ -7890,13 +7897,26 @@ } } // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0 +//---------------------------------------------------------------------------- +// Armv8.6 Matrix Multiply Extension +//---------------------------------------------------------------------------- + +class SIMDThreeSameVectorMatMul + : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s", + [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd), + (v16i8 V128:$Rn), + (v16i8 V128:$Rm)))]> { + let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}"; +} + +//---------------------------------------------------------------------------- // ARMv8.2-A Dot Product Instructions (Indexed) -class BaseSIMDThreeSameVectorDotIndex size, string asm, + string dst_kind, string lhs_kind, string rhs_kind, RegisterOperand RegType, ValueType AccumType, ValueType InputType, SDPatternOperator OpNode> : - BaseSIMDIndexedTied size, string asm, SDPatternOperator OpNode> { - def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b", + def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b", V64, v2i32, v8i8, OpNode>; - def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b", + def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b", V128, v4i32, v16i8, OpNode>; } Index: llvm/lib/Target/AArch64/AArch64InstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -146,6 +146,12 @@ AssemblerPredicate<(all_of FeatureTRBE), "trbe">; def HasBF16 : Predicate<"Subtarget->hasBF16()">, 
AssemblerPredicate<(all_of FeatureBF16), "bf16">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">; +def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">, + AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">; +def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">, + AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">; def IsLE : Predicate<"Subtarget->isLittleEndian()">; def IsBE : Predicate<"!Subtarget->isLittleEndian()">; def IsWindows : Predicate<"Subtarget->isTargetWindows()">; @@ -745,10 +751,10 @@ // ARMv8.2-A Dot Product let Predicates = [HasDotProd] in { -defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>; -defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>; -defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>; -defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>; +defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>; +defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>; +defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>; +defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>; } // ARMv8.6-A BFloat @@ -765,6 +771,40 @@ def BFCVT : BF16ToSinglePrecision<"bfcvt">; } +// ARMv8.6A AArch64 matrix multiplication +let Predicates = [HasMatMulInt8] in { +def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>; +def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>; +def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>; +defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>; +defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>; + +// sudot lane has a pattern where usdot is expected (there is no sudot). +// The second operand is used in the dup operation to repeat the indexed +// element. 
+class BaseSIMDSUDOTIndex + : BaseSIMDThreeSameVectorDotIndex { + let Pattern = [(set (AccumType RegType:$dst), + (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd), + (InputType (bitconvert (AccumType + (AArch64duplane32 (v4i32 V128:$Rm), + VectorIndexS:$idx)))), + (InputType RegType:$Rn))))]; +} + +multiclass SIMDSUDOTIndex { + def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>; + def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>; +} + +defm SUDOTlane : SIMDSUDOTIndex; + +} + // ARMv8.2-A FP16 Fused Multiply-Add Long let Predicates = [HasNEON, HasFP16FML] in { defm FMLAL : SIMDThreeSameVectorFML<0, 1, 0b001, "fmlal", int_aarch64_neon_fmlal>; Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td =================================================================== --- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1634,6 +1634,37 @@ defm : ldff1; } +let Predicates = [HasSVE, HasMatMulInt8] in { + def SMMLA_ZZZ : sve_int_matmul<0b00, "smmla">; + def UMMLA_ZZZ : sve_int_matmul<0b11, "ummla">; + def USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla">; + def USDOT_ZZZ : sve_int_dot_mixed<"usdot">; + def USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot">; + def SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot">; +} + +let Predicates = [HasSVE, HasMatMulFP32] in { + def FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32>; +} + +let Predicates = [HasSVE, HasMatMulFP64] in { + def FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64>; + defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8>; + defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16>; + defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32>; + defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64>; + defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8>; + defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16>; + defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32>; + defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64>; + def ZIP1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1">; + def ZIP2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2">; + def UZP1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1">; + def UZP2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2">; + def TRN1_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1">; + def TRN2_ZZZ_128 : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2">; +} + let Predicates = [HasSVE2] in { // SVE2 integer multiply-add (indexed) defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>; Index: llvm/lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- llvm/lib/Target/AArch64/AArch64Subtarget.h +++ llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -147,6 +147,9 @@ // Armv8.6-A Extensions bool HasBF16 = false; + bool HasMatMulInt8 = false; + bool HasMatMulFP32 = false; + bool HasMatMulFP64 = false; bool HasAMVS = false; bool HasFineGrainedTraps = false; bool HasEnhancedCounterVirtualization = false; @@ -414,6 +417,9 @@ bool hasSVE2SM4() const { return HasSVE2SM4; } bool hasSVE2SHA3() const { return HasSVE2SHA3; } bool hasSVE2BitPerm() const { return HasSVE2BitPerm; } + bool hasMatMulInt8() const { return HasMatMulInt8; } + bool hasMatMulFP32() const { return HasMatMulFP32; } + bool hasMatMulFP64() const { return HasMatMulFP64; } // Armv8.6-A Extensions bool 
hasBF16() const { return HasBF16; } Index: llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp =================================================================== --- llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -4255,6 +4255,8 @@ return Error(Loc, "index must be a multiple of 4 in range [-32, 28]."); case Match_InvalidMemoryIndexed16SImm4: return Error(Loc, "index must be a multiple of 16 in range [-128, 112]."); + case Match_InvalidMemoryIndexed32SImm4: + return Error(Loc, "index must be a multiple of 32 in range [-256, 224]."); case Match_InvalidMemoryIndexed1SImm6: return Error(Loc, "index must be an integer in range [-32, 31]."); case Match_InvalidMemoryIndexedSImm8: @@ -4914,6 +4916,7 @@ case Match_InvalidMemoryIndexed4SImm4: case Match_InvalidMemoryIndexed1SImm6: case Match_InvalidMemoryIndexed16SImm4: + case Match_InvalidMemoryIndexed32SImm4: case Match_InvalidMemoryIndexed4SImm7: case Match_InvalidMemoryIndexed8SImm7: case Match_InvalidMemoryIndexed16SImm7: Index: llvm/lib/Target/AArch64/SVEInstrFormats.td =================================================================== --- llvm/lib/Target/AArch64/SVEInstrFormats.td +++ llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -7528,6 +7528,180 @@ let ElementSize = ElementSizeS; } +//===----------------------------------------------------------------------===// +// SVE Integer Matrix Multiply Group +//===----------------------------------------------------------------------===// + +class sve_int_matmul uns, string asm> +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-24} = 0b01000101; + let Inst{23-22} = uns; + let Inst{21} = 0; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b100110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm, + "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-21} = 0b01000100100; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b011110; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Integer Dot Product Mixed Sign - Indexed Group +//===----------------------------------------------------------------------===// + +class sve_int_dot_mixed_indexed +: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS:$idx), + asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<3> Zm; + bits<2> idx; + let Inst{31-21} = 0b01000100101; + let Inst{20-19} = idx; + let Inst{18-16} = Zm; + let Inst{15-11} = 0b00011; + let Inst{10} = U; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = ZPR32.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Floating Point Matrix Multiply 
Accumulate Group +//===----------------------------------------------------------------------===// + +class sve_fp_matrix_mla +: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm), + asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> { + bits<5> Zda; + bits<5> Zn; + bits<5> Zm; + let Inst{31-23} = 0b011001001; + let Inst{22} = sz; + let Inst{21} = 1; + let Inst{20-16} = Zm; + let Inst{15-10} = 0b111001; + let Inst{9-5} = Zn; + let Inst{4-0} = Zda; + + let Constraints = "$Zda = $_Zda"; + let DestructiveInstType = DestructiveOther; + let ElementSize = zprty.ElementSize; +} + +//===----------------------------------------------------------------------===// +// SVE Memory - Contiguous Load And Replicate 256-bit Group +//===----------------------------------------------------------------------===// +class sve_mem_ldor_si sz, string asm, RegisterOperand VecList> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), + asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> { + bits<5> Zt; + bits<5> Rn; + bits<3> Pg; + bits<4> imm4; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-20} = 0b010; + let Inst{19-16} = imm4; + let Inst{15-13} = 0b001; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_si sz, string asm, RegisterOperand listty, + ZPRRegOp zprty> { + def NAME : sve_mem_ldor_si; + def : InstAlias(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>; + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>; +} + +class sve_mem_ldor_ss sz, string asm, RegisterOperand VecList, + RegisterOperand gprty> +: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), + asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> { + bits<5> Zt; + bits<3> Pg; + bits<5> Rn; + bits<5> Rm; + let Inst{31-25} = 0b1010010; + let Inst{24-23} = sz; + let Inst{22-21} = 0b01; + let Inst{20-16} = Rm; + let Inst{15-13} = 0; + let Inst{12-10} = Pg; + let Inst{9-5} = Rn; + let Inst{4-0} = Zt; + + let mayLoad = 1; +} + +multiclass sve_mem_ldor_ss sz, string asm, RegisterOperand listty, + ZPRRegOp zprty, RegisterOperand gprty> { + def NAME : sve_mem_ldor_ss; + + def : InstAlias(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>; +} + +//===----------------------------------------------------------------------===// +// SVE Interleave 128-bit Elements Group +//===----------------------------------------------------------------------===// + +class sve_int_perm_bin_perm_128_zz opc, bit P, string asm> +: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm), + asm, "\t$Zd, $Zn, $Zm", + "", + []>, Sched<[]> { + bits<5> Zd; + bits<5> Zm; + bits<5> Zn; + let Inst{31-21} = 0b00000101101; + let Inst{20-16} = Zm; + let Inst{15-13} = 0b000; + let Inst{12-11} = opc; + let Inst{10} = P; + let Inst{9-5} = Zn; + let Inst{4-0} = Zd; +} + + /// Addressing modes def am_sve_indexed_s4 :ComplexPattern", [], [SDNPWantRoot]>; def am_sve_indexed_s6 :ComplexPattern", [], [SDNPWantRoot]>; Index: llvm/lib/Target/ARM/ARM.td =================================================================== --- llvm/lib/Target/ARM/ARM.td +++ llvm/lib/Target/ARM/ARM.td @@ -428,6 +428,9 @@ def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true", "Enable support for BFloat16 instructions", [FeatureNEON]>; +def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8", + "true", "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>; + // Armv8.1-M 
extensions def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true", @@ -529,7 +532,8 @@ def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions", - [HasV8_5aOps, FeatureBF16]>; + [HasV8_5aOps, FeatureBF16, + FeatureMatMulInt8]>; def HasV8_1MMainlineOps : SubtargetFeature< "v8.1m.main", "HasV8_1MMainlineOps", "true", Index: llvm/lib/Target/ARM/ARMInstrNEON.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrNEON.td +++ llvm/lib/Target/ARM/ARMInstrNEON.td @@ -4823,10 +4823,10 @@ // We put them in the VFPV8 decoder namespace because the ARM and Thumb // encodings are the same and thus no further bit twiddling is necessary // in the disassembler. -class VDOT : - N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), + N3Vnp<{0b1100, op23}, 0b10, 0b1101, op6, op4, (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD, Asm, AsmTy, [(set (AccumTy RegTy:$dst), @@ -4836,12 +4836,13 @@ let Predicates = [HasDotProd]; let DecoderNamespace = "VFPV8"; let Constraints = "$dst = $Vd"; + let hasNoSchedulingInfo = 1; } -def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; -def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; -def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; -def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; +def VUDOTD : VDOT<0, 1, 0, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>; +def VSDOTD : VDOT<0, 0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>; +def VUDOTQ : VDOT<1, 1, 0, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>; +def VSDOTQ : VDOT<1, 0, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>; // Indexed dot product instructions: multiclass DOTI; +// v8.6A matrix multiplication extension +let Predicates = [HasMatMulInt8] in { + class N3VMatMul + : N3Vnp<{0b1100, B}, 0b10, 0b1100, 1, U, (outs QPR:$dst), + (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), N3RegFrm, NoItinerary, + Asm, AsmTy, + [(set (v4i32 QPR:$dst), (OpNode (v4i32 QPR:$Vd), + (v16i8 QPR:$Vn), + (v16i8 QPR:$Vm)))]> { + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + let hasNoSchedulingInfo = 1; + } + + multiclass N3VMixedDotLane { + + def "" : N3Vnp<0b11101, 0b00, 0b1101, Q, U, (outs RegTy:$dst), + (ins RegTy:$Vd, RegTy:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), N3RegFrm, + NoItinerary, Asm, AsmTy, []> { + bit lane; + let hasNoSchedulingInfo = 1; + let Inst{5} = lane; + let AsmString = !strconcat(Asm, ".", AsmTy, "\t$Vd, $Vn, $Vm$lane"); + let DecoderNamespace = "VFPV8"; + let Constraints = "$dst = $Vd"; + } + + def : Pat< + (AccumTy (OpNode (AccumTy RegTy:$Vd), + (InputTy RegTy:$Vn), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))))), + (!cast(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; + + } + + multiclass SUDOTLane + : N3VMixedDotLane { + def : Pat< + (AccumTy (int_arm_neon_usdot (AccumTy RegTy:$Vd), + (InputTy (bitconvert (AccumTy + (ARMvduplane (AccumTy RegTy:$Vm), + VectorIndex32:$lane)))), + (InputTy RegTy:$Vn))), + (!cast(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>; + } + + def VSMMLA : N3VMatMul<0, 0, "vsmmla", "s8", int_arm_neon_smmla>; + def VUMMLA : N3VMatMul<0, 1, "vummla", "u8", int_arm_neon_ummla>; + def VUSMMLA : N3VMatMul<1, 0, "vusmmla", "s8", int_arm_neon_usmmla>; + def VUSDOTD : VDOT<0, 0, 1, DPR, "vusdot", "s8", v2i32, v8i8, int_arm_neon_usdot>; + def VUSDOTQ : VDOT<1, 0, 1, 
QPR, "vusdot", "s8", v4i32, v16i8, int_arm_neon_usdot>; + + defm VUSDOTDI : N3VMixedDotLane<0, 0, "vusdot", "s8", DPR, v2i32, v8i8, + int_arm_neon_usdot, (v2i32 DPR_VFP2:$Vm)>; + defm VUSDOTQI : N3VMixedDotLane<1, 0, "vusdot", "s8", QPR, v4i32, v16i8, + int_arm_neon_usdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; + defm VSUDOTDI : SUDOTLane<0, DPR, v2i32, v8i8, (v2i32 DPR_VFP2:$Vm)>; + defm VSUDOTQI : SUDOTLane<1, QPR, v4i32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>; +} // ARMv8.3 complex operations class BaseN3VCP8ComplexTied; def HasBF16 : Predicate<"Subtarget->hasBF16()">, AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">; +def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">, + AssemblerPredicate<(all_of FeatureMatMulInt8),"8-bit integer matrix multiply">; def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">, AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">; def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">, Index: llvm/lib/Target/ARM/ARMSubtarget.h =================================================================== --- llvm/lib/Target/ARM/ARMSubtarget.h +++ llvm/lib/Target/ARM/ARMSubtarget.h @@ -260,6 +260,9 @@ /// HasBF16 - True if subtarget supports BFloat16 floating point operations bool HasBF16 = false; + /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply + bool HasMatMulInt8 = false; + /// HasD32 - True if subtarget has the full 32 double precision /// FP registers for VFPv3. bool HasD32 = false; @@ -704,6 +707,8 @@ /// Return true if the CPU supports any kind of instruction fusion. bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); } + bool hasMatMulInt8() const { return HasMatMulInt8; } + const Triple &getTargetTriple() const { return TargetTriple; } bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); } Index: llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp =================================================================== --- llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -6330,7 +6330,10 @@ Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" || Mnemonic == "cset" || - Mnemonic == "csetm") + Mnemonic == "csetm" || + Mnemonic == "vsmmla" || Mnemonic == "vummla" || + Mnemonic == "vusmmla" || Mnemonic == "vsudot" || + Mnemonic == "vusdot") return Mnemonic; // First, split out any predication code. 
Ignore mnemonics we know aren't @@ -6466,7 +6469,9 @@ Mnemonic == "vfmat" || Mnemonic == "vfmab" || Mnemonic == "vdot" || Mnemonic == "vmmla" || Mnemonic == "sb" || Mnemonic == "ssbb" || - Mnemonic == "pssbb" || + Mnemonic == "pssbb" || Mnemonic == "vsmmla" || + Mnemonic == "vummla" || Mnemonic == "vusmmla" || + Mnemonic == "vusdot" || Mnemonic == "vsudot" || Mnemonic == "bfcsel" || Mnemonic == "wls" || Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" || Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" || Index: llvm/test/CodeGen/AArch64/aarch64-matmul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/AArch64/aarch64-matmul.ll @@ -0,0 +1,136 @@ +; RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+neon,+i8mm < %s -o -| FileCheck %s + +define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: smmla.v4i32.v16i8 +; CHECK: smmla v0.4s, v1.16b, v2.16b + %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: ummla.v4i32.v16i8 +; CHECK: ummla v0.4s, v1.16b, v2.16b + %vmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usmmla.v4i32.v16i8 +; CHECK: usmmla v0.4s, v1.16b, v2.16b + %vusmmla1.i = tail call <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusmmla1.i +} + +define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v2i32.v8i8 +; CHECK: usdot v0.2s, v1.8b, v2.8b + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v8i8 +; CHECK: usdot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v8i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v16i8(<2 x i32> %r, <8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v16i8 +; CHECK: usdot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v16i8(<2 x i32> %r, 
<8 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v16i8 +; CHECK: sudot v0.2s, v1.8b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3 + ret <2 x i32> %vusdot1.i +} + +define <4 x i32> @usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.16b + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @usdot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usdot_laneq.v4i32.v16i8 +; CHECK: usdot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudot_laneq.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: sudot_laneq.v4i32.v16i8 +; CHECK: sudot v0.4s, v1.16b, v2.4b[0] + %0 = bitcast <16 x i8> %b to <4 x i32> + %shuffle = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +declare <4 x i32> @llvm.aarch64.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <2 x i32> @llvm.aarch64.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 +declare <4 x i32> @llvm.aarch64.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 + Index: llvm/test/CodeGen/ARM/arm-matmul.ll =================================================================== --- /dev/null +++ llvm/test/CodeGen/ARM/arm-matmul.ll @@ -0,0 +1,83 @@ +; RUN: llc -mtriple=arm-none-linux-gnu -mattr=+neon,+i8mm -float-abi=hard < %s -o -| FileCheck %s + +define <4 x i32> @smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; 
CHECK-LABEL: smmla.v4i32.v16i8 +; CHECK: vsmmla.s8 q0, q1, q2 + %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: ummla.v4i32.v16i8 +; CHECK: vummla.u8 q0, q1, q2 + %vmmla1.i = tail call <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vmmla1.i +} + +define <4 x i32> @usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) { +entry: +; CHECK-LABEL: usmmla.v4i32.v16i8 +; CHECK: vusmmla.s8 q0, q1, q2 + %vusmmla1.i = tail call <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %b) #3 + ret <4 x i32> %vusmmla1.i +} + +define <2 x i32> @usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot.v2i32.v8i8 +; CHECK: vusdot.s8 d0, d1, d2 + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) #3 + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @usdot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdot_lane.v2i32.v8i8 +; CHECK: vusdot.s8 d0, d1, d2[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %1) #3 + ret <2 x i32> %vusdot1.i +} + +define <2 x i32> @sudot_lane.v2i32.v8i8(<2 x i32> %r, <8 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudot_lane.v2i32.v8i8 +; CHECK: vsudot.u8 d0, d1, d2[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <2 x i32> zeroinitializer + %1 = bitcast <2 x i32> %shuffle to <8 x i8> + %vusdot1.i = tail call <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32> %r, <8 x i8> %1, <8 x i8> %a) #3 + ret <2 x i32> %vusdot1.i +} + +define <4 x i32> @usdotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: usdotq_lane.v4i32.v16i8 +; CHECK: vusdot.s8 q0, q1, d4[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <16 x i8> %1) #3 + ret <4 x i32> %vusdot1.i +} + +define <4 x i32> @sudotq_lane.v4i32.v16i8(<4 x i32> %r, <16 x i8> %a, <8 x i8> %b) { +entry: +; CHECK-LABEL: sudotq_lane.v4i32.v16i8 +; CHECK: vsudot.u8 q0, q1, d4[0] + %0 = bitcast <8 x i8> %b to <2 x i32> + %shuffle = shufflevector <2 x i32> %0, <2 x i32> undef, <4 x i32> zeroinitializer + %1 = bitcast <4 x i32> %shuffle to <16 x i8> + %vusdot1.i = tail call <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32> %r, <16 x i8> %1, <16 x i8> %a) #3 + ret <4 x i32> %vusdot1.i +} + +declare <4 x i32> @llvm.arm.neon.smmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.ummla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.usmmla.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 +declare <2 x i32> @llvm.arm.neon.usdot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) #2 +declare <4 x i32> @llvm.arm.neon.usdot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) #2 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp-diagnostics.s 
=================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp-diagnostics.s @@ -0,0 +1,86 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f32mm,+f64mm 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +// Invalid element size + +fmmla z0.h, z1.h, z2.h +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + +// Mis-matched element size + +fmmla z0.d, z1.s, z2.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +fmmla z0.s, z1.d, z2.s +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +fmmla z0.s, z1.s, z2.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus immediate) + +// Immediate too high (>224) +ld1rob { z0.b }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #256] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. + +// Immediate too low (<-256) +ld1rob { z0.b }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #-288] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. + +// Immediate not a multiple of 32 +ld1rob { z0.b }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1roh { z0.h }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1row { z0.s }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. +ld1rod { z0.d }, p1/z, [x2, #16] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: index must be a multiple of 32 in range [-256, 224]. 
+ +// Predicate register too high +ld1rob { z0.b }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1roh { z0.h }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1row { z0.s }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1rod { z0.d }, p8/z, [x2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus scalar) + +// Shift amount not matched to data width +ld1rob { z0.b }, p1/z, [x2, x3, lsl #1] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 without shift +ld1roh { z0.h }, p1/z, [x2, x3, lsl #0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #1' +ld1row { z0.s }, p1/z, [x2, x3, lsl #3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #2' +ld1rod { z0.d }, p1/z, [x2, x3, lsl #2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: register must be x0..x30 with required shift 'lsl #3' + +// Predicate register too high +ld1rob { z0.b }, p8/z, [x2, x3, lsl #0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1roh { z0.h }, p8/z, [x2, x3, lsl #1] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1row { z0.s }, p8/z, [x2, x3, lsl #2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) +ld1rod { z0.d }, p8/z, [x2, x3, lsl #3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid restricted predicate register, expected p0..p7 (without element suffix) Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp32.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp32.s @@ -0,0 +1,17 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f32mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm,+f64mm < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f32mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+f32mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f32mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +fmmla z0.s, z1.s, z2.s +// CHECK-INST: fmmla z0.s, z1.s, z2.s +// CHECK-ENCODING: [0x20,0xe4,0xa2,0x64] +// CHECK-ERROR: instruction requires: f32mm +// CHECK-UNKNOWN: 20 e4 a2 64 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-fp64.s @@ -0,0 +1,185 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+f64mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm,+f32mm < %s 2>&1 \ +// RUN: |
FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f64mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+f64mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+f64mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + +// --------------------------------------------------------------------------// +// FMMLA (SVE) + +fmmla z0.d, z1.d, z2.d +// CHECK-INST: fmmla z0.d, z1.d, z2.d +// CHECK-ENCODING: [0x20,0xe4,0xe2,0x64] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 e4 e2 64 + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus immediate) + +// With maximum immediate (224) + +ld1rob { z0.b }, p1/z, [x2, #224] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0x27,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 27 a4 + +ld1roh { z0.h }, p1/z, [x2, #224] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0xa7,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a7 a4 + +ld1row { z0.s }, p1/z, [x2, #224] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0x27,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 27 a5 + +ld1rod { z0.d }, p1/z, [x2, #224] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, #224] +// CHECK-ENCODING: [0x40,0x24,0xa7,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a7 a5 + +// With minimum immediate (-256) + +ld1rob { z0.b }, p1/z, [x2, #-256] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0x28,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 28 a4 + +ld1roh { z0.h }, p1/z, [x2, #-256] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0xa8,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a8 a4 + +ld1row { z0.s }, p1/z, [x2, #-256] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0x28,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 28 a5 + +ld1rod { z0.d }, p1/z, [x2, #-256] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, #-256] +// CHECK-ENCODING: [0x40,0x24,0xa8,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a8 a5 + +// Aliases with a plain (non-list) first operand, and omitted offset. 
+ +ld1rob z0.b, p1/z, [x2] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0x20,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 20 a4 + +ld1roh z0.h, p1/z, [x2] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0xa0,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a0 a4 + +ld1row z0.s, p1/z, [x2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0x20,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 20 a5 + +ld1rod z0.d, p1/z, [x2] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2] +// CHECK-ENCODING: [0x40,0x24,0xa0,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 24 a0 a5 + + +// --------------------------------------------------------------------------// +// LD1RO (SVE, scalar plus scalar) + +ld1rob { z0.b }, p1/z, [x2, x3, lsl #0] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, x3] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a4 + +ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a4 + +ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a5 + +ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a5 + +// Aliases with a plain (non-list) first operand, and omitted shift for the +// byte variant. 
+ +ld1rob z0.b, p1/z, [x2, x3] +// CHECK-INST: ld1rob { z0.b }, p1/z, [x2, x3] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a4 + +ld1roh z0.h, p1/z, [x2, x3, lsl #1] +// CHECK-INST: ld1roh { z0.h }, p1/z, [x2, x3, lsl #1] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa4] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a4 + +ld1row z0.s, p1/z, [x2, x3, lsl #2] +// CHECK-INST: ld1row { z0.s }, p1/z, [x2, x3, lsl #2] +// CHECK-ENCODING: [0x40,0x04,0x23,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 23 a5 + +ld1rod z0.d, p1/z, [x2, x3, lsl #3] +// CHECK-INST: ld1rod { z0.d }, p1/z, [x2, x3, lsl #3] +// CHECK-ENCODING: [0x40,0x04,0xa3,0xa5] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 40 04 a3 a5 + + +// --------------------------------------------------------------------------// +// ZIP1, ZIP2 (SVE, 128-bit element) + +zip1 z0.q, z1.q, z2.q +// CHECK-INST: zip1 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x00,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 00 a2 05 + +zip2 z0.q, z1.q, z2.q +// CHECK-INST: zip2 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x04,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 04 a2 05 + + +// --------------------------------------------------------------------------// +// TRN1, TRN2 (SVE, 128-bit element) + +trn1 z0.q, z1.q, z2.q +// CHECK-INST: trn1 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x18,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 18 a2 05 + +trn2 z0.q, z1.q, z2.q +// CHECK-INST: trn2 z0.q, z1.q, z2.q +// CHECK-ENCODING: [0x20,0x1c,0xa2,0x05] +// CHECK-ERROR: instruction requires: f64mm +// CHECK-UNKNOWN: 20 1c a2 05 Index: llvm/test/MC/AArch64/SVE/matrix-multiply-int8-diagnostics.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-int8-diagnostics.s @@ -0,0 +1,78 @@ +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm 2>&1 < %s | FileCheck %s + +// --------------------------------------------------------------------------// +// SMMLA, UMMLA, USMMLA (SVE) + +// Invalid element size + +ummla z0.h, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +ummla z0.s, z1.h, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +ummla z0.s, z1.b, z2.d +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +ummla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +smmla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +usmmla z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx + + +// --------------------------------------------------------------------------// +// USDOT (SVE, vectors) + +// Invalid element size + +usdot z0.d, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.s, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.b, z2.h +// CHECK: 
[[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +usdot z0.s, z1.b, z2.b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx + + +// --------------------------------------------------------------------------// +// USDOT, SUDOT (SVE, indexed) + +// Invalid element size + +usdot z0.h, z1.b, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +sudot z0.s, z1.h, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid element width +usdot z0.s, z1.b, z2.s[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Invalid restricted register for indexed vector. +usdot z0.s, z1.b, z9.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot z0.s, z1.b, z9.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: Invalid restricted vector register, expected z0.b..z7.b + +// Invalid element index +usdot z0.s, z1.b, z2.b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot z0.s, z1.b, z2.b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. + +// Negative tests for instructions that are incompatible with movprfx + +movprfx z0.d, p0/z, z7.d +usdot z0.s, z1.b, z2.b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx +movprfx z0.d, p0/z, z7.d +sudot z0.s, z1.b, z2.b[3] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: instruction is unpredictable when following a predicated movprfx, suggest using unpredicated movprfx Index: llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/SVE/matrix-multiply-int8.s @@ -0,0 +1,118 @@ +// RUN: llvm-mc -triple=aarch64 -show-encoding -mattr=+sve,+i8mm < %s \ +// RUN: | FileCheck %s --check-prefixes=CHECK-ENCODING,CHECK-INST +// RUN: not llvm-mc -triple=aarch64 -show-encoding -mattr=+sve < %s 2>&1 \ +// RUN: | FileCheck %s --check-prefix=CHECK-ERROR +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+i8mm < %s \ +// RUN: | llvm-objdump -d --mattr=+sve,+i8mm - | FileCheck %s --check-prefix=CHECK-INST +// RUN: llvm-mc -triple=aarch64 -filetype=obj -mattr=+sve,+i8mm < %s \ +// RUN: | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-UNKNOWN + + +// --------------------------------------------------------------------------// +// SMMLA, UMMLA, USMMLA (SVE) + +ummla z0.s, z1.b, z2.b +// CHECK-INST: ummla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0xc2,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 c2 45 + +smmla z0.s, z1.b, z2.b +// CHECK-INST: smmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x02,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 02 45 + +usmmla z0.s, z1.b, z2.b +// CHECK-INST: usmmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x82,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 82 45 + + +// Test compatibility with MOVPRFX instruction. 
+ +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +ummla z0.s, z1.b, z2.b +// CHECK-INST: ummla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0xc2,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 c2 45 + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +smmla z0.s, z1.b, z2.b +// CHECK-INST: smmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x02,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 02 45 + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usmmla z0.s, z1.b, z2.b +// CHECK-INST: usmmla z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x98,0x82,0x45] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 98 82 45 + + +// --------------------------------------------------------------------------// +// USDOT (SVE, vectors) + +usdot z0.s, z1.b, z2.b +// CHECK-INST: usdot z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x78,0x82,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 78 82 44 + +// Test compatibility with MOVPRFX instruction. + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usdot z0.s, z1.b, z2.b +// CHECK-INST: usdot z0.s, z1.b, z2.b +// CHECK-ENCODING: [0x20,0x78,0x82,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 78 82 44 + + +// --------------------------------------------------------------------------// +// USDOT, SUDOT (SVE, indexed) + +usdot z0.s, z1.b, z2.b[0] +// CHECK-INST: usdot z0.s, z1.b, z2.b[0] +// CHECK-ENCODING: [0x20,0x18,0xa2,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 18 a2 44 + +sudot z0.s, z1.b, z2.b[3] +// CHECK-INST: sudot z0.s, z1.b, z2.b[3] +// CHECK-ENCODING: [0x20,0x1c,0xba,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 1c ba 44 + +// Test compatibility with MOVPRFX instruction. + +movprfx z0, z7 +// CHECK-INST: movprfx z0, z7 +// CHECK-ENCODING: [0xe0,0xbc,0x20,0x04] +// CHECK-UNKNOWN: e0 bc 20 04 + +usdot z0.s, z1.b, z2.b[0] +// CHECK-INST: usdot z0.s, z1.b, z2.b[0] +// CHECK-ENCODING: [0x20,0x18,0xa2,0x44] +// CHECK-ERROR: instruction requires: i8mm +// CHECK-UNKNOWN: 20 18 a2 44 Index: llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/armv8.6a-simd-matmul-error.s @@ -0,0 +1,34 @@ +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s + +// No interesting edge cases for [US]MMLA, except for the fact that the data +// types are fixed (no 64-bit version), and USMMLA exists, but SUMMLA does not. +smmla v1.2s, v16.8b, v31.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +summla v1.4s, v16.16b, v31.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: unrecognized instruction mnemonic, did you mean: smmla, ummla, usmmla? + +// USDOT (vector) has two valid data type combinations, others are rejected. 
+usdot v3.4s, v15.8b, v30.8b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +usdot v3.2s, v15.16b, v30.16b +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction + +// For USDOT and SUDOT (indexed), the index is in range [0,3] (regardless of data types) +usdot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +usdot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.2s, v1.8b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. +sudot v31.4s, v1.16b, v2.4b[4] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: vector lane must be an integer in range [0, 3]. + +// The arrangement specifiers of the first two operands must match. +usdot v31.4s, v1.8b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +usdot v31.2s, v1.16b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot v31.4s, v1.8b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +sudot v31.2s, v1.16b, v2.4b[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction Index: llvm/test/MC/AArch64/armv8.6a-simd-matmul.s =================================================================== --- /dev/null +++ llvm/test/MC/AArch64/armv8.6a-simd-matmul.s @@ -0,0 +1,43 @@ +// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+i8mm < %s | FileCheck %s +// RUN: llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a < %s | FileCheck %s +// RUN: not llvm-mc -triple aarch64 -show-encoding -mattr=+v8.6a-i8mm < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +smmla v1.4s, v16.16b, v31.16b +ummla v1.4s, v16.16b, v31.16b +usmmla v1.4s, v16.16b, v31.16b +// CHECK: smmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x4e] +// CHECK: ummla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xa6,0x9f,0x6e] +// CHECK: usmmla v1.4s, v16.16b, v31.16b // encoding: [0x01,0xae,0x9f,0x4e] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: smmla v1.4s, v16.16b, v31.16b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: ummla v1.4s, v16.16b, v31.16b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usmmla v1.4s, v16.16b, v31.16b + +usdot v3.2s, v15.8b, v30.8b +usdot v3.4s, v15.16b, v30.16b +// CHECK: usdot v3.2s, v15.8b, v30.8b // encoding: [0xe3,0x9d,0x9e,0x0e] +// CHECK: usdot v3.4s, v15.16b, v30.16b // encoding: [0xe3,0x9d,0x9e,0x4e] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v3.2s, v15.8b, v30.8b +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v3.4s, v15.16b, v30.16b + +usdot v31.2s, v1.8b, v2.4b[3] +usdot v31.4s, v1.16b, v2.4b[3] +// CHECK: usdot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x0f] +// CHECK: usdot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0xa2,0x4f] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v31.2s, v1.8b, v2.4b[3] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: usdot v31.4s, v1.16b, v2.4b[3] + +sudot v31.2s, v1.8b, v2.4b[3] +sudot v31.4s, v1.16b, v2.4b[3] +// CHECK: sudot v31.2s, v1.8b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x0f] +// CHECK: sudot v31.4s, v1.16b, v2.4b[3] // encoding: [0x3f,0xf8,0x22,0x4f] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: sudot v31.2s, v1.8b, v2.4b[3] +// NOMATMUL: instruction requires: i8mm +// NOMATMUL-NEXT: sudot v31.4s, v1.16b, v2.4b[3] Index:
llvm/test/MC/ARM/armv8.6a-matmul-error.s =================================================================== --- /dev/null +++ llvm/test/MC/ARM/armv8.6a-matmul-error.s @@ -0,0 +1,113 @@ +// RUN: not llvm-mc -triple armv8a -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s +// RUN: not llvm-mc -triple thumbv8a -show-encoding -mattr=+i8mm < %s 2>&1 | FileCheck %s + + +// VSMMLA, VUMMLA, VUSMMLA + +// Data type specifier must match instruction + +vsmmla.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vsmmla.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vummla.s8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vummla.s8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vusmmla.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusmmla.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + + +// Incorrect register type + +vsmmla.s8 d0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vsmmla.s8 d0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +vummla.u8 q0, d1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vummla.u8 q0, d1, q2 +// CHECK-NEXT: {{^ \^}} + +vusmmla.s8 q0, q1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusmmla.s8 q0, q1, d2 +// CHECK-NEXT: {{^ \^}} + + +// VUSDOT (vector) + +// Data type specifier must match instruction + +vusdot.u8 q0, q1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusdot.u8 q0, q1, q2 +// CHECK-NEXT: {{^ \^}} + +// Mis-matched register types + +vusdot.s8 q0, d1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] +vusdot.s8 d0, q1, d2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] +vusdot.s8 d0, d1, q2 +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: operand must be a register in range [d0, d31] + + +// VUSDOT, VSUDOT (by scalar) + +// Data type specifier must match instruction + +vusdot.u8 d0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vusdot.u8 d0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vsudot.s8 d0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +// CHECK-NEXT: vsudot.s8 d0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +// Incorrect register types + +vusdot.s8 q0, d1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d31] +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusdot.s8 q0, d1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vusdot.s8 d0, q1, d2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d31] +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: operand must be a register in range [q0, q15] +// CHECK-NEXT: vusdot.s8 d0, q1, d2[0] +// CHECK-NEXT: {{^ \^}} + +vusdot.s8 q0, q1, q2[0] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: 
invalid instruction, any one of the following would fix this: +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK: [[@LINE-3]]:{{[0-9]+}}: note: operand must be a register in range [d0, d15] +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK-NEXT: {{^ \^}} +// CHECK: [[@LINE-6]]:{{[0-9]+}}: note: too many operands for instruction +// CHECK-NEXT: vusdot.s8 q0, q1, q2[0] +// CHECK-NEXT: {{^ \^}} + +// Out of range lane index + +vusdot.s8 d0, d1, d2[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction +vsudot.u8 q0, q1, d2[2] +// CHECK: [[@LINE-1]]:{{[0-9]+}}: error: invalid operand for instruction Index: llvm/test/MC/ARM/armv8.6a-matmul.s =================================================================== --- /dev/null +++ llvm/test/MC/ARM/armv8.6a-matmul.s @@ -0,0 +1,49 @@ +// RUN: llvm-mc -triple armv8a -show-encoding -mattr=+i8mm < %s | FileCheck %s --check-prefix=ARM +// RUN: llvm-mc -triple thumbv8a -show-encoding -mattr=+i8mm < %s | FileCheck %s --check-prefix=THUMB +// RUN: not llvm-mc -triple armv8a -show-encoding -mattr=v8.5a < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL +// RUN: not llvm-mc -triple thumbv8a -show-encoding -mattr=v8.5a < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +vsmmla.s8 q0, q1, q2 +// ARM: vsmmla.s8 q0, q1, q2 @ encoding: [0x44,0x0c,0x22,0xfc] +// THUMB: vsmmla.s8 q0, q1, q2 @ encoding: [0x22,0xfc,0x44,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vummla.u8 q0, q1, q2 +// ARM: vummla.u8 q0, q1, q2 @ encoding: [0x54,0x0c,0x22,0xfc] +// THUMB: vummla.u8 q0, q1, q2 @ encoding: [0x22,0xfc,0x54,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusmmla.s8 q0, q1, q2 +// ARM: vusmmla.s8 q0, q1, q2 @ encoding: [0x44,0x0c,0xa2,0xfc] +// THUMB: vusmmla.s8 q0, q1, q2 @ encoding: [0xa2,0xfc,0x44,0x0c] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 d0, d1, d2 +// ARM: vusdot.s8 d0, d1, d2 @ encoding: [0x02,0x0d,0xa1,0xfc] +// THUMB: vusdot.s8 d0, d1, d2 @ encoding: [0xa1,0xfc,0x02,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 q0, q1, q2 +// ARM: vusdot.s8 q0, q1, q2 @ encoding: [0x44,0x0d,0xa2,0xfc] +// THUMB: vusdot.s8 q0, q1, q2 @ encoding: [0xa2,0xfc,0x44,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 d0, d1, d2[0] +// ARM: vusdot.s8 d0, d1, d2[0] @ encoding: [0x02,0x0d,0x81,0xfe] +// THUMB: vusdot.s8 d0, d1, d2[0] @ encoding: [0x81,0xfe,0x02,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vsudot.u8 d0, d1, d2[1] +// ARM: vsudot.u8 d0, d1, d2[1] @ encoding: [0x32,0x0d,0x81,0xfe] +// THUMB: vsudot.u8 d0, d1, d2[1] @ encoding: [0x81,0xfe,0x32,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vusdot.s8 q0, q1, d2[0] +// ARM: vusdot.s8 q0, q1, d2[0] @ encoding: [0x42,0x0d,0x82,0xfe] +// THUMB: vusdot.s8 q0, q1, d2[0] @ encoding: [0x82,0xfe,0x42,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply + +vsudot.u8 q0, q1, d2[1] +// ARM: vsudot.u8 q0, q1, d2[1] @ encoding: [0x72,0x0d,0x82,0xfe] +// THUMB: vsudot.u8 q0, q1, d2[1] @ encoding: [0x82,0xfe,0x72,0x0d] +// NOMATMUL: [[@LINE-3]]:{{[0-9]+}}: error: instruction requires: 8-bit integer matrix multiply Index: 
llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/AArch64/armv8.6a-simd-matmul.txt @@ -0,0 +1,34 @@ +# RUN: llvm-mc -triple=aarch64 -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple=aarch64 -mattr=+v8.6a -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=aarch64 -mattr=+v8.5a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOI8MM + +[0x01,0xa6,0x9f,0x4e] +[0x01,0xa6,0x9f,0x6e] +[0x01,0xae,0x9f,0x4e] +# CHECK: smmla v1.4s, v16.16b, v31.16b +# CHECK: ummla v1.4s, v16.16b, v31.16b +# CHECK: usmmla v1.4s, v16.16b, v31.16b +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-6]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xe3,0x9d,0x9e,0x0e] +[0xe3,0x9d,0x9e,0x4e] +# CHECK: usdot v3.2s, v15.8b, v30.8b +# CHECK: usdot v3.4s, v15.16b, v30.16b +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0xa2,0x0f] +[0x3f,0xf8,0xa2,0x4f] +# CHECK: usdot v31.2s, v1.8b, v2.4b[3] +# CHECK: usdot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x3f,0xf8,0x22,0x0f] +[0x3f,0xf8,0x22,0x4f] +# CHECK: sudot v31.2s, v1.8b, v2.4b[3] +# CHECK: sudot v31.4s, v1.16b, v2.4b[3] +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding +# NOI8MM: [[@LINE-4]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-arm.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-arm.txt @@ -0,0 +1,38 @@ +# RUN: llvm-mc -triple=armv8 -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=armv8 -mattr=+v8.4a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +[0x44,0x0c,0x22,0xfc] +# CHECK: vsmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x54,0x0c,0x22,0xfc] +# CHECK: vummla.u8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x44,0x0c,0xa2,0xfc] +# CHECK: vusmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x02,0x0d,0xa1,0xfc] +# CHECK: vusdot.s8 d0, d1, d2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x44,0x0d,0xa2,0xfc] +# CHECK: vusdot.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x02,0x0d,0x81,0xfe] +# CHECK: vusdot.s8 d0, d1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x32,0x0d,0x81,0xfe] +# CHECK: vsudot.u8 d0, d1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x42,0x0d,0x82,0xfe] +# CHECK: vusdot.s8 q0, q1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x72,0x0d,0x82,0xfe] +# CHECK: vsudot.u8 q0, q1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-thumb.txt =================================================================== --- /dev/null +++ llvm/test/MC/Disassembler/ARM/armv8.6a-matmul-thumb.txt @@ -0,0 +1,38 @@ +# 
RUN: llvm-mc -triple=thumbv8a -mattr=+i8mm -disassemble < %s | FileCheck %s +# RUN: not llvm-mc -triple=thumbv8a -mattr=+v8.4a -disassemble < %s 2>&1 | FileCheck %s --check-prefix=NOMATMUL + +[0x22,0xfc,0x44,0x0c] +# CHECK: vsmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x22,0xfc,0x54,0x0c] +# CHECK: vummla.u8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa2,0xfc,0x44,0x0c] +# CHECK: vusmmla.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa1,0xfc,0x02,0x0d] +# CHECK: vusdot.s8 d0, d1, d2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0xa2,0xfc,0x44,0x0d] +# CHECK: vusdot.s8 q0, q1, q2 +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x81,0xfe,0x02,0x0d] +# CHECK: vusdot.s8 d0, d1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x81,0xfe,0x32,0x0d] +# CHECK: vsudot.u8 d0, d1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x82,0xfe,0x42,0x0d] +# CHECK: vusdot.s8 q0, q1, d2[0] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding + +[0x82,0xfe,0x72,0x0d] +# CHECK: vsudot.u8 q0, q1, d2[1] +# NOMATMUL: :[[@LINE-2]]:{{[0-9]+}}: warning: invalid instruction encoding Index: llvm/unittests/Support/TargetParserTest.cpp =================================================================== --- llvm/unittests/Support/TargetParserTest.cpp +++ llvm/unittests/Support/TargetParserTest.cpp @@ -636,6 +636,7 @@ {"maverick", "maverick", nullptr, nullptr}, {"xscale", "noxscale", nullptr, nullptr}, {"sb", "nosb", "+sb", "-sb"}, + {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, {"mve", "nomve", "+mve", "-mve"}, {"mve.fp", "nomve.fp", "+mve.fp", "-mve.fp"}}; @@ -1230,7 +1231,10 @@ {"tme", "notme", "+tme", "-tme"}, {"ssbs", "nossbs", "+ssbs", "-ssbs"}, {"sb", "nosb", "+sb", "-sb"}, - {"predres", "nopredres", "+predres", "-predres"} + {"predres", "nopredres", "+predres", "-predres"}, + {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, + {"f32mm", "nof32mm", "+f32mm", "-f32mm"}, + {"f64mm", "nof64mm", "+f64mm", "-f64mm"}, }; for (unsigned i = 0; i < array_lengthof(ArchExt); i++) {
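
For reference, a minimal C-level sketch of how the new i8mm NEON operations exercised by the tests above are expected to be used from source. This is illustrative only: the intrinsic spellings (vusdot_s32, vusdot_lane_s32, vmmlaq_s32, vusmmlaq_s32) are the ACLE names the arm_neon.td additions should generate, and the exact signatures ought to be confirmed against the generated arm_neon.h; a plausible compiler invocation is clang -O2 -march=armv8.6-a, or an earlier base architecture with +i8mm appended.

#include <arm_neon.h>

#ifdef __ARM_FEATURE_MATMUL_INT8
/* Mixed-sign dot product: each 32-bit lane of the accumulator gains the dot
   product of four unsigned bytes from a with four signed bytes from b. */
int32x2_t dot_us(int32x2_t acc, uint8x8_t a, int8x8_t b) {
  return vusdot_s32(acc, a, b);
}

/* Indexed form: the group of four signed bytes selected from b by the
   constant lane index is reused for every lane of the accumulator. */
int32x2_t dot_us_lane0(int32x2_t acc, uint8x8_t a, int8x8_t b) {
  return vusdot_lane_s32(acc, a, b, 0);
}

/* Signed 2x8 by 8x2 matrix multiply-accumulate; the 2x2 int32 result is
   held in one 128-bit vector. */
int32x4_t mmla_ss(int32x4_t acc, int8x16_t a, int8x16_t b) {
  return vmmlaq_s32(acc, a, b);
}

/* Unsigned-by-signed variant of the matrix multiply-accumulate. */
int32x4_t mmla_us(int32x4_t acc, uint8x16_t a, int8x16_t b) {
  return vusmmlaq_s32(acc, a, b);
}
#endif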