diff --git a/clang/test/Misc/target-invalid-cpu-note.c b/clang/test/Misc/target-invalid-cpu-note.c --- a/clang/test/Misc/target-invalid-cpu-note.c +++ b/clang/test/Misc/target-invalid-cpu-note.c @@ -5,11 +5,11 @@ // RUN: not %clang_cc1 -triple arm64--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix AARCH64 // AARCH64: error: unknown target CPU 'not-a-cpu' -// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-r82, cortex-x1, cortex-x2, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-m1, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel{{$}} +// AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-r82, cortex-x1, cortex-x2, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-m1, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1{{$}} // RUN: not %clang_cc1 -triple arm64--- -tune-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix TUNE_AARCH64 // TUNE_AARCH64: error: unknown target CPU 'not-a-cpu' -// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, 
cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-r82, cortex-x1, cortex-x2, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-m1, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel{{$}} +// TUNE_AARCH64-NEXT: note: valid target CPU values are: cortex-a34, cortex-a35, cortex-a53, cortex-a55, cortex-a510, cortex-a57, cortex-a65, cortex-a65ae, cortex-a72, cortex-a73, cortex-a75, cortex-a76, cortex-a76ae, cortex-a77, cortex-a78, cortex-a78c, cortex-a710, cortex-r82, cortex-x1, cortex-x2, neoverse-e1, neoverse-n1, neoverse-n2, neoverse-512tvb, neoverse-v1, cyclone, apple-a7, apple-a8, apple-a9, apple-a10, apple-a11, apple-a12, apple-a13, apple-a14, apple-m1, apple-s4, apple-s5, exynos-m3, exynos-m4, exynos-m5, falkor, saphira, kryo, thunderx2t99, thunderx3t110, thunderx, thunderxt88, thunderxt81, thunderxt83, tsv110, a64fx, carmel, ampere1{{$}} // RUN: not %clang_cc1 -triple i386--- -target-cpu not-a-cpu -fsyntax-only %s 2>&1 | FileCheck %s --check-prefix X86 // X86: error: unknown target CPU 'not-a-cpu' diff --git a/llvm/include/llvm/Support/AArch64TargetParser.def b/llvm/include/llvm/Support/AArch64TargetParser.def --- a/llvm/include/llvm/Support/AArch64TargetParser.def +++ b/llvm/include/llvm/Support/AArch64TargetParser.def @@ -284,6 +284,10 @@ (AArch64::AEK_FP16 | AArch64::AEK_SVE)) AARCH64_CPU_NAME("carmel", ARMV8_2A, FK_CRYPTO_NEON_FP_ARMV8, false, AArch64::AEK_FP16) +AARCH64_CPU_NAME("ampere1", ARMV8_6A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_BF16 | AArch64::AEK_DOTPROD | AArch64::AEK_FP16 | + AArch64::AEK_I8MM | AArch64::AEK_MTE | AArch64::AEK_RAS | + AArch64::AEK_RCPC | AArch64::AEK_SB | AArch64::AEK_SSBS)) 
// Invalid CPU AARCH64_CPU_NAME("invalid", INVALID, FK_INVALID, true, AArch64::AEK_INVALID) #undef AARCH64_CPU_NAME diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td --- a/llvm/lib/Target/AArch64/AArch64.td +++ b/llvm/lib/Target/AArch64/AArch64.td @@ -555,6 +555,7 @@ include "AArch64InstrInfo.td" include "AArch64SchedPredicates.td" include "AArch64SchedPredExynos.td" +include "AArch64SchedPredAmpere.td" include "AArch64Combine.td" def AArch64InstrInfo : InstrInfo; @@ -624,6 +625,7 @@ include "AArch64SchedA64FX.td" include "AArch64SchedThunderX3T110.td" include "AArch64SchedTSV110.td" +include "AArch64SchedAmpere1.td" def TuneA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors">; @@ -937,6 +939,16 @@ FeatureFuseAES, FeaturePostRAScheduler]>; +def TuneAmpere1 : SubtargetFeature<"ampere1", "ARMProcFamily", "Ampere1", + "Ampere Computing Ampere-1 processors", [ + FeaturePostRAScheduler, + FeatureFuseAES, + FeatureLSLFast, + FeatureAggressiveFMA, + FeatureArithmeticBccFusion, + FeatureCmpBccFusion, + FeatureFuseAddress, + FeatureFuseLiterals]>; def ProcessorFeatures { list A53 = [HasV8_0aOps, FeatureCRC, FeatureCrypto, @@ -1038,6 +1050,7 @@ list TSV110 = [HasV8_2aOps, FeatureCrypto, FeatureFPARMv8, FeatureNEON, FeaturePerfMon, FeatureSPE, FeatureFullFP16, FeatureFP16FML, FeatureDotProd]; + list Ampere1 = [HasV8_6aOps, FeatureNEON, FeaturePerfMon]; // ETE and TRBE are future architecture extensions. We temporarily enable them // by default for users targeting generic AArch64. 
The extensions do not @@ -1173,6 +1186,10 @@ def : ProcessorModel<"carmel", NoSchedModel, ProcessorFeatures.Carmel, [TuneCarmel]>; +// Ampere Computing +def : ProcessorModel<"ampere1", Ampere1Model, ProcessorFeatures.Ampere1, + [TuneAmpere1]>; + //===----------------------------------------------------------------------===// // Assembly parser //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedAmpere1.td @@ -0,0 +1,1138 @@ +//=- AArch64SchedAmpere1.td - Ampere-1 scheduling def -----*- tablegen -*-=// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for the Ampere Computing Ampere-1 to +// support instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// + +// The Ampere-1 core is an out-of-order micro-architecture. The front +// end has branch prediction, with a 10-cycle recovery time from a +// mispredicted branch. Instructions coming out of the front end are +// decoded into internal micro-ops (uops). 
+ +def Ampere1Model : SchedMachineModel { + let IssueWidth = 4; // 4-way decode and dispatch + let MicroOpBufferSize = 174; // micro-op re-order buffer size + let LoadLatency = 4; // Optimistic load latency + let MispredictPenalty = 10; // Branch mispredict penalty + let LoopMicroOpBufferSize = 32; // Instruction queue size + let CompleteModel = 1; + + list UnsupportedFeatures = !listconcat(SVEUnsupported.F, + SMEUnsupported.F); +} + +let SchedModel = Ampere1Model in { + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available on Ampere-1. +// Ampere-1 has 12 pipelines that 8 independent schedulers (4 integer, 2 FP, +// and 2 memory) issue into. The integer and FP schedulers can each issue +// one uop per cycle, while the memory schedulers can each issue one load +// and one store address calculation per cycle. + +def Ampere1UnitA : ProcResource<2>; // integer single-cycle, branch, and flags r/w +def Ampere1UnitB : ProcResource<2>; // integer single-cycle, and complex shifts +def Ampere1UnitBS : ProcResource<1>; // integer multi-cycle +def Ampere1UnitL : ProcResource<2>; // load +def Ampere1UnitS : ProcResource<2>; // store address calculation +def Ampere1UnitX : ProcResource<1>; // FP and vector operations, and flag write +def Ampere1UnitY : ProcResource<1>; // FP and vector operations, and crypto +def Ampere1UnitZ : ProcResource<1>; // FP store data and FP-to-integer moves + +def Ampere1UnitAB : ProcResGroup<[Ampere1UnitA, Ampere1UnitB]>; +def Ampere1UnitXY : ProcResGroup<[Ampere1UnitX, Ampere1UnitY]>; + +//===----------------------------------------------------------------------===// +// Define customized scheduler read/write types specific to the Ampere-1. 
+ +def Ampere1Write_1cyc_1A : SchedWriteRes<[Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2A : SchedWriteRes<[Ampere1UnitA, Ampere1UnitA]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_1cyc_1B : SchedWriteRes<[Ampere1UnitB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1AB : SchedWriteRes<[Ampere1UnitAB]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_1S : SchedWriteRes<[Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 1; +} + +def Ampere1Write_1cyc_2S : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS]> { + let Latency = 1; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_2AB : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitAB]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1B_1A : SchedWriteRes<[Ampere1UnitB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1A : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitA]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1AB_2S : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1AB_1S_1Z : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitS, + Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 3; +} + +def Ampere1Write_2cyc_1B_1S : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_2cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let 
Latency = 2; + let NumMicroOps = 1; +} + +def Ampere1Write_2cyc_1S_1Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 2; + let NumMicroOps = 2; +} + +def Ampere1Write_3cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 3; + let NumMicroOps = 1; +} + +def Ampere1Write_3cyc_1B_1S_1AB : SchedWriteRes<[Ampere1UnitB, Ampere1UnitS, + Ampere1UnitAB]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_1S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 3; + let NumMicroOps = 3; +} + +def Ampere1Write_3cyc_2S_2Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 3; + let NumMicroOps = 4; +} + +def Ampere1Write_4cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Y : SchedWriteRes<[Ampere1UnitY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_1Z : SchedWriteRes<[Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 1; +} + +def Ampere1Write_4cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 4; + let NumMicroOps = 2; +} + +def Ampere1Write_4cyc_1XY_1S_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitS, Ampere1UnitZ]> { + let Latency = 4; + let NumMicroOps = 3; +} + +def Ampere1Write_4cyc_3S_3Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let 
Latency = 4; + let NumMicroOps = 6; +} + +def Ampere1Write_5cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1X : SchedWriteRes<[Ampere1UnitX]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_1L : SchedWriteRes<[Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1L_1BS : SchedWriteRes<[Ampere1UnitL, Ampere1UnitBS]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 1; +} + +def Ampere1Write_5cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 5; + let NumMicroOps = 2; +} + +def Ampere1Write_5cyc_4S_4Z : SchedWriteRes<[Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 8; +} + +def Ampere1Write_5cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 5; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_2XY_2S_2Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 6; +} + +def Ampere1Write_6cyc_3XY_3S_3Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 9; +} + +def Ampere1Write_6cyc_1AB_1L : SchedWriteRes<[Ampere1UnitAB, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency 
= 6; + let NumMicroOps = 1; +} + +def Ampere1Write_6cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_6cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_3L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 3; +} + +def Ampere1Write_6cyc_4L : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL]> { + let Latency = 6; + let NumMicroOps = 4; +} + +def Ampere1Write_6cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 6; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 7; + let NumMicroOps = 1; +} + +def Ampere1Write_7cyc_1BS_1XY : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_1L_1XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_2L_2XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 4; +} + +def Ampere1Write_7cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 7; + let NumMicroOps = 2; +} + +def Ampere1Write_7cyc_4XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 7; + let NumMicroOps = 12; +} + +def Ampere1Write_8cyc_1BS_1A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_1BS_2A : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitA, + Ampere1UnitA]> { + let Latency = 8; + let NumMicroOps = 3; +} + +def Ampere1Write_8cyc_2XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; 
+ let NumMicroOps = 2; +} + +def Ampere1Write_8cyc_4XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 4; +} + +def Ampere1Write_8cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 6; +} + +def Ampere1Write_8cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 8; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 6; +} + +def Ampere1Write_9cyc_4L_4XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 8; +} + +def Ampere1Write_9cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 3; +} + +def Ampere1Write_9cyc_2L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 9; + let NumMicroOps = 5; +} + +def Ampere1Write_9cyc_6XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 14; +} + +def Ampere1Write_9cyc_8XY_4S_4Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitS, Ampere1UnitS, + Ampere1UnitZ, Ampere1UnitZ, + Ampere1UnitZ, Ampere1UnitZ]> { + let Latency = 9; + let NumMicroOps = 16; +} + +def Ampere1Write_10cyc_2XY : 
SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1XY_1Z : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_1X_1Z : SchedWriteRes<[Ampere1UnitX, Ampere1UnitZ]> { + let Latency = 10; + let NumMicroOps = 2; +} + +def Ampere1Write_10cyc_3L_3XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 6; +} + +def Ampere1Write_10cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_10cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 10; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1BS_1L : SchedWriteRes<[Ampere1UnitBS, Ampere1UnitL]> { + let Latency = 11; + let NumMicroOps = 2; +} + +def Ampere1Write_11cyc_1A_1BS_1X : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitX]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_1A_1BS_1XY : SchedWriteRes<[Ampere1UnitA, Ampere1UnitBS, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 3; +} + +def Ampere1Write_11cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 11; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_4L_8XY : SchedWriteRes<[Ampere1UnitL, Ampere1UnitL, + Ampere1UnitL, Ampere1UnitL, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 12; +} + +def Ampere1Write_12cyc_3XY : SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 3; +} + +def Ampere1Write_12cyc_4XY : 
SchedWriteRes<[Ampere1UnitXY, Ampere1UnitXY, + Ampere1UnitXY, Ampere1UnitXY]> { + let Latency = 12; + let NumMicroOps = 4; +} + +def Ampere1Write_18cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 18; + let NumMicroOps = 1; +} + +def Ampere1Write_19cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 19; + let NumMicroOps = 1; +} + +def Ampere1Write_25cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 25; + let NumMicroOps = 1; +} + +def Ampere1Write_32cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 32; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1BS : SchedWriteRes<[Ampere1UnitBS]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_34cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 34; + let NumMicroOps = 1; +} + +def Ampere1Write_39cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 39; + let NumMicroOps = 1; +} + +def Ampere1Write_62cyc_1XY : SchedWriteRes<[Ampere1UnitXY]> { + let Latency = 62; + let NumMicroOps = 1; +} + +// For basic arithmetic, we have more flexibility for short shifts (LSL shift <= 4), +// which are a single uop, and for extended registers, which have full flexibility +// across Unit A or B for both uops. +def Ampere1Write_Arith : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +def Ampere1Write_ArithFlagsetting : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Map the target-defined scheduler read/write resources and latencies for Ampere-1. +// This provides a coarse model, which is then specialised below. 
+ +def : WriteRes; // MOVN, MOVZ +def : WriteRes; // ALU +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Shifted-Reg +def : WriteRes { + let Latency = 2; + let NumMicroOps = 2; +} // ALU of Extended-Reg +def : WriteRes; // EXTR shifts a reg pair +def : WriteRes; // Shift/Scale +def : WriteRes { + let Latency = 18; +} // 32-bit Divide +def : WriteRes { + let Latency = 34; +} // 64-bit Divide +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes { + let Latency = 3; +} // 32-bit Multiply +def : WriteRes; +def : WriteRes; +def : WriteRes { + let Latency = 4; +} // Load from base addr plus immediate offset +def : WriteRes { + let Latency = 1; +} // Store to base addr plus immediate offset +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store a register pair. +def : WriteRes; +def : WriteRes { + let Latency = 5; + let NumMicroOps = 2; +} // Load from a register index (maybe scaled). +def : WriteRes { + let Latency = 1; + let NumMicroOps = 2; +} // Store to a register index (maybe scaled). +def : WriteRes { + let Latency = 2; +} // General floating-point ops. +def : WriteRes { + let Latency = 5; +} // Floating-point compare. +def : WriteRes { + let Latency = 6; +} // Float conversion. +def : WriteRes { +} // Float-int register copy. +def : WriteRes { + let Latency = 2; +} // Float-int register copy. +def : WriteRes { + let Latency = 5; +} // Floating-point multiply. +def : WriteRes { + let Latency = 34; +} // Floating-point division. +def : WriteRes { + let Latency = 3; +} // 64bit Vector D ops. +def : WriteRes { + let Latency = 3; +} // 128bit Vector Q ops. +def : WriteRes { + let Latency = 5; +} // Vector loads. +def : WriteRes { + let Latency = 2; +} // Vector stores. 
+ +def : WriteRes { let Unsupported = 1; } + +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } +def : WriteRes { let Latency = 1; } + +def : WriteRes { + let Latency = 4; +} // The second register of a load-pair: LDP,LDPSW,LDNP,LDXP,LDAXP + +// Forwarding logic. +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; +def : ReadAdvance; + +//===----------------------------------------------------------------------===// +// Specialising the scheduling model further for Ampere-1. + +def : InstRW<[Ampere1Write_1cyc_1AB], (instrs COPY)>; + +// Branch instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs Bcc, BL, RET)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>; +def : InstRW<[Ampere1Write_1cyc_2A], (instrs BLR)>; + +// Cryptography instructions +// -- AES encryption/decryption +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AES[DE]")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^AESI?MC")>; +// -- Polynomial multiplication +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^PMUL", "^PMULL")>; +// -- SHA-256 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA256(H|H2)")>; +// -- SHA-256 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA256SU[01]")>; +// -- SHA-3 instructions +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^BCAX", "^EOR3", "^RAX1", "^XAR")>; +// -- SHA-512 hash +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA512(H|H2)")>; +// -- SHA-512 schedule update +def : InstRW<[Ampere1Write_4cyc_1Y], (instregex "^SHA512SU[01]")>; +// -- SHA1 choose/majority/parity +def : InstRW<[Ampere1Write_4cyc_1X], (instregex "^SHA1[CMP]")>; +// -- SHA1 hash/schedule update +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1SU[01]")>; +def : InstRW<[Ampere1Write_2cyc_1Y], (instregex "^SHA1H")>; + +// FP and vector 
load instructions +// -- Load 1-element structure to one/all lanes +// ---- all lanes +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// ---- one lane +def : InstRW<[Ampere1Write_7cyc_1L_1XY], + (instregex "^LD1i(8|16|32|64)")>; +// -- Load 1-element structure to one/all lanes, 1D size +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Rv1d")>; +// -- Load 1-element structures to 1 register +def : InstRW<[Ampere1Write_5cyc_1L], + (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 2 registers +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 3 registers +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 1-element structures to 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Load 2-element structure to all lanes of 2 registers, 1D size +def : InstRW<[Ampere1Write_5cyc_2L], + (instregex "^LD2Rv1d")>; +// -- Load 2-element structure to all lanes of 2 registers, other sizes +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 2-element structure to one lane of 2 registers +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2i(8|16|32|64)")>; +// -- Load 2-element structures to 2 registers, 16B/8H/4S/2D size +def : InstRW<[Ampere1Write_7cyc_2L_2XY], + (instregex "^LD2Twov(16b|8h|4s|2d)")>; +// -- Load 2-element structures to 2 registers, 8B/4H/2S size +def : InstRW<[Ampere1Write_9cyc_2L_3XY], + (instregex "^LD2Twov(8b|4h|2s)")>; +// -- Load 3-element structure to all lanes of 3 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_3L], + (instregex "^LD3Rv1d")>; +// -- Load 3-element structure to all lanes of 3 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 3-element 
structure to one lane of 3 registers +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3i(8|16|32|64)")>; +// -- Load 3-element structures to 3 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_3L_3XY], + (instregex "^LD3Threev(16b|8h|4s)")>; +// -- Load 3-element structures to 3 registers, 2D size +def : InstRW<[Ampere1Write_8cyc_3L_3XY], + (instregex "^LD3Threev2d")>; +// -- Load 3-element structures to 3 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_10cyc_3L_3XY], + (instregex "^LD3Threev(8b|4h|2s)")>; +// -- Load 4-element structure to all lanes of 4 registers, 1D size +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4Rv1d")>; +// -- Load 4-element structure to all lanes of 4 registers, other sizes +def : InstRW<[Ampere1Write_8cyc_4L_4XY], + (instregex "^LD4Rv(8b|4h|2s|16b|8h|4s|2d)")>; +// -- Load 4-element structure to one lane of 4 registers +def : InstRW<[Ampere1Write_6cyc_4L], + (instregex "^LD4i(8|16|32|64)")>; +// -- Load 4-element structures to 4 registers, 2D size +def : InstRW<[Ampere1Write_9cyc_4L_4XY], + (instregex "^LD4Fourv2d")>; +// -- Load 4-element structures to 4 registers, 2S size +def : InstRW<[Ampere1Write_12cyc_4L_8XY], + (instregex "^LD4Fourv2s")>; +// -- Load 4-element structures to 4 registers, other sizes +def : InstRW<[Ampere1Write_11cyc_4L_8XY], + (instregex "^LD4Fourv(8b|4h|16b|8h|4s)")>; +// -- Load pair, Q-form +def : InstRW<[Ampere1Write_5cyc_2L], (instregex "LDN?PQ")>; +// -- Load pair, S/D-form +def : InstRW<[Ampere1Write_5cyc_1L_1BS], (instregex "LDN?P(S|D)")>; +// -- Load register +def : InstRW<[Ampere1Write_5cyc_1L], (instregex "LDU?R[BHSDQ]i")>; +// -- Load register, sign-extended register +def : InstRW<[Ampere1Write_6cyc_1AB_1L], (instregex "LDR[BHSDQ]ro(W|X)")>; + +// FP and vector store instructions +// -- Store 1-element structure from one lane of 1 register +def : InstRW<[Ampere1Write_4cyc_1XY_1S_1Z], + (instregex "^ST1i(8|16|32|64)")>; +// -- Store 1-element structures from 1 register 
+def : InstRW<[Ampere1Write_2cyc_1S_1Z], + (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 2 registers +def : InstRW<[Ampere1Write_3cyc_2S_2Z], + (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 3 registers +def : InstRW<[Ampere1Write_4cyc_3S_3Z], + (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 1-element structures from 4 registers +def : InstRW<[Ampere1Write_5cyc_4S_4Z], + (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 2-element structure from one lane of 2 registers +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2i(8|16|32|64)")>; +// -- Store 2-element structures from 2 registers, 16B/8H/4S/2D sizes +def : InstRW<[Ampere1Write_5cyc_2XY_2S_2Z], + (instregex "^ST2Twov(16b|8h|4s|2d)")>; +// -- Store 2-element structures from 2 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_6cyc_2XY_2S_2Z], + (instregex "^ST2Twov(8b|4h|2s)")>; +// -- Store 3-element structure from one lane of 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3i(8|16|32|64)")>; +// -- Store 3-element structures from 3 registers +def : InstRW<[Ampere1Write_6cyc_3XY_3S_3Z], + (instregex "^ST3Threev(8b|4h|2s|1d|16b|8h|4s|2d)")>; +// -- Store 4-element structure from one lane of 4 registers +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4i(8|16|32|64)")>; +// -- Store 4-element structures from 4 registers, 16B/8H/4S sizes +def : InstRW<[Ampere1Write_9cyc_8XY_4S_4Z], + (instregex "^ST4Fourv(16b|8h|4s)")>; +// -- Store 4-element structures from 4 registers, 2D sizes +def : InstRW<[Ampere1Write_7cyc_4XY_4S_4Z], + (instregex "^ST4Fourv2d")>; +// -- Store 4-element structures from 4 registers, 8B/4H/2S sizes +def : InstRW<[Ampere1Write_9cyc_6XY_4S_4Z], + (instregex "^ST4Fourv(8b|4h|2s)")>; +// -- Store pair, Q-form +def : InstRW<[Ampere1Write_3cyc_2S_2Z], (instregex "^STN?PQ")>; +// -- Store pair, S/D-form +def : 
InstRW<[Ampere1Write_3cyc_1S_2Z], (instregex "^STN?P[SD]")>; +// -- Store register +def : InstRW<[Ampere1Write_2cyc_1S_1Z], (instregex "^STU?R[BHSDQ](ui|i)")>; +// -- Store register, sign-extended register offset +def : InstRW<[Ampere1Write_2cyc_1AB_1S_1Z], (instregex "^STR[BHSDQ]ro[XW]")>; + +// FP data processing, bfloat16 format +def : InstRW<[Ampere1Write_5cyc_1XY], (instrs BFCVT)>; +def : InstRW<[Ampere1Write_7cyc_2XY], (instrs BFCVTN, BFCVTN2)>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^BFDOTv", "^BF16DOT")>; +def : InstRW<[Ampere1Write_4cyc_2XY], (instrs BFMMLA)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^BFMLAL")>; + +// FP data processing, scalar/vector, half precision +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(ABD|ABS)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)16")>; +def : InstRW<[Ampere1Write_4cyc_1X], + (instregex "^FCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1X], + (instregex "^FCCMPE?H")>; +def : InstRW<[Ampere1Write_10cyc_1A_1BS_1XY], + (instregex "^FCSELH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^[SU]CVTFv.[fi]16")>; +def : InstRW<[Ampere1Write_25cyc_1XY], (instregex "^FDIVv.[if]16", "FDIVH")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if]16")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv4[if]16")>; +def : InstRW<[Ampere1Write_12cyc_3XY], (instregex "^F(MAX|MIN)(NM)?Vv8[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FMULX?v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instrs FMULX16)>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FN?M(ADD|SUB)[H]rrr")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex 
"^FML[AS]v.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRECPXv.[if]16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^F(RECP|RSQRT)S16")>; +def : InstRW<[Ampere1Write_4cyc_1XY], (instregex "^FRINT[AIMNPXZ]v.[if]16")>; +def : InstRW<[Ampere1Write_39cyc_1XY], (instregex "^FSQRTv.f16", "^FSQRTHr")>; + +// FP data processing, scalar/vector, single/double precision +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(ABD|ABS)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(ADD|ADDP|CADD|NEG|NMUL|SUB)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)v.[fi](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], + (instregex "^F(AC|CM)(EQ|GE|GT|LE|LT)(32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1X], + (instregex "^FCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1X], + (instregex "^FCCMPE?(S|D)")>; +def : InstRW<[Ampere1Write_11cyc_1A_1BS_1XY], + (instregex "^FCSEL(S|D)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[AMNPZ][SU]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^[SU]CVTFv.[fi](32|64)")>; +def : InstRW<[Ampere1Write_34cyc_1XY], (instregex "^FDIVv.[if](64)", "FDIVD")>; +def : InstRW<[Ampere1Write_19cyc_1XY], (instregex "^FDIVv.[if](32)", "FDIVS")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^F(MAX|MIN)(NM)?P?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_10cyc_2XY], (instregex "^F(MAX|MIN)(NM)?Vv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FMULX?v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instrs FMULX32, FMULX64)>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FN?M(ADD|SUB)[SD]rrr")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FML[AS]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPXv.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^F(RECP|RSQRT)S(32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex 
"^FRINT[AIMNPXZ]v.[if](32|64)")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FRINT(32|64)")>; +def : InstRW<[Ampere1Write_62cyc_1XY], (instregex "^FSQRTv.f64", "^FSQRTDr")>; +def : InstRW<[Ampere1Write_32cyc_1XY], (instregex "^FSQRTv.f32", "^FSQRTSr")>; + +// FP miscellaneous instructions +def : InstRW<[Ampere1Write_10cyc_1XY_1Z], (instregex "^FCVT[AMNPZ][SU][SU][XW][HSD]r")>; +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FCVT[HSD]Hr")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVT[HSD][SD]r")>; +def : InstRW<[Ampere1Write_6cyc_1XY], (instregex "^FCVTLv")>; +def : InstRW<[Ampere1Write_8cyc_2XY], (instregex "^FCVT(N|XN)v")>; +def : InstRW<[Ampere1Write_10cyc_1X_1Z], (instrs FJCVTZS)>; +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^FMOV[HSD][WX]r")>; +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^FMOVDXHighr")>; +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOV[HSD][ri]")>; +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "^FMOVXDHighr")>; +def : InstRW<[Ampere1Write_4cyc_1Z], (instregex "^FMOV[WX][HSD]r")>; + +// Integer arithmetic and logical instructions +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "ADC(W|X)r", "SBC(W|X)r")>; +def : InstRW<[Ampere1Write_Arith], + (instregex "(ADD|AND|BIC|EON|EOR|ORN|ORR|SUB)(W|X)r")>; +def : InstRW<[Ampere1Write_ArithFlagsetting], + (instregex "(ADD|AND|BIC|SUB)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(ADC|SBC)S(W|X)r")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs RMIF)>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CCMN|CCMP)(X|W)")>; +def : InstRW<[Ampere1Write_1cyc_1A], + (instregex "(CSEL|CSINC|CSINV|CSNEG)(X|W)")>; +def : InstRW<[Ampere1Write_18cyc_1BS], (instrs SDIVWr, UDIVWr)>; +def : InstRW<[Ampere1Write_34cyc_1BS], (instrs SDIVXr, UDIVXr)>; +def : InstRW<[Ampere1Write_3cyc_1BS], + (instregex "(S|U)MULHr")>; +def : InstRW<[Ampere1Write_4cyc_1BS], + (instregex "(S|U)?M(ADD|SUB)L?r")>; + +// Integer load instructions +def : 
InstRW<[Ampere1Write_4cyc_2L], + (instregex "(LDNP|LDP|LDPSW)(X|W)")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(B|D|H|Q|S)ui")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDR(D|Q|W|X)l")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTR(B|H|W|X)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDTRS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_4cyc_1L], + (instregex "LDURS(BW|BX|HW|HX|W)i")>; +def : InstRW<[Ampere1Write_5cyc_1AB_1L], + (instregex "LDR(HH|SHW|SHX|W|X)ro(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1L], + (instrs PRFMl, PRFMui, PRFUMi)>; +def : InstRW<[Ampere1Write_2cyc_1AB_1L], + (instrs PRFMroW, PRFMroX)>; + +// Integer miscellaneous instructions +def : InstRW<[Ampere1Write_1cyc_1A], (instrs ADR, ADRP)>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "EXTR(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "(S|U)?BFM(W|X)")>; +def : InstRW<[Ampere1Write_3cyc_1BS], (instregex "^CRC32C?[BHWX]")>; +def : InstRW<[Ampere1Write_1cyc_1B], (instregex "CLS(W|X)")>; +def : InstRW<[Ampere1Write_1cyc_1A], (instrs SETF8, SETF16)>; +def : InstRW<[Ampere1Write_1cyc_1AB], + (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(RBIT|REV|REV16)(W|X)r", "REV32Xr")>; +def : InstRW<[Ampere1Write_1cyc_1B], + (instregex "(ASR|LSL|LSR|ROR)V(W|X)r")>; + +// Integer store instructions +def : InstRW<[Ampere1Write_1cyc_2S], (instregex "STNP(X|W)i")>; +def : InstRW<[Ampere1Write_2cyc_1B_1S], + (instrs STPWi, STPXi)>; +def : InstRW<[Ampere1Write_3cyc_1B_1S_1AB], + (instregex "STP(W|X)(pre|post)")>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instrs STTRBi, STTRHi, STTRWi, STTRXi)>; +def : InstRW<[Ampere1Write_1cyc_1S], + (instregex "STUR(BB|HH|X|W)i", + "STR(X|W)ui", + "STUR(BB|HH|X|W)i")>; +def : InstRW<[Ampere1Write_1cyc_2S], (instrs STRWroX, STRXroX)>; +def : InstRW<[Ampere1Write_2cyc_1AB_2S],
(instrs STRWroW, STRXroW)>; + +// Pointer authentication +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs AUTIAZ, AUTIBZ, AUTIASP, AUTIBSP, AUTIA1716, AUTIB1716)>; +def : InstRW<[Ampere1Write_8cyc_1BS_1A], + (instregex "BRA(A|AZ|B|BZ)", "RETA(A|B)", "ERETA(A|B)")>; +def : InstRW<[Ampere1Write_8cyc_1BS_2A], + (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ)>; +//def : InstRW<[Ampere1Write_7cyc_1BS], +// (instrs PACIAZ, PACIBZ, PACIASP, PACIBSP, PACIA1716, PACIB1716)>; +def : InstRW<[Ampere1Write_11cyc_1BS_1L], (instregex "^LDRA(A|B)")>; +def : InstRW<[Ampere1Write_7cyc_1BS], (instrs XPACD, XPACI)>; + +// Vector integer instructions +// -- absolute difference +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SABAv", "^SABALv", "^SABDv", "^SABDLv", + "^UABAv", "^UABALv", "^UABDv", "^UABDLv")>; +// -- arithmetic +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ABSv", "^(ADD|SUB)v", "^SADDLv", "^SADDW", "SHADD", + "SHSUB", "^SRHADD", "^URHADD", "SSUBL", "SSUBW", + "^UADDLv", "^UADDW", "UHADD", "UHSUB", "USUBL", "USUBW")>; +// -- arithmetic, horizontal, 16B +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^ADDVv16i8v", "^SADDLVv16i8v", "^UADDLVv16i8v")>; +def : InstRW<[Ampere1Write_12cyc_4XY], + (instregex "^[SU](MIN|MAX)Vv16i8v")>; +// -- arithmetic, horizontal: add 8B/4H/2S, min/max 4H/4S (TODO confirm) +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v")>; +def : InstRW<[Ampere1Write_6cyc_2XY], + (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v")>; +// -- arithmetic, horizontal: add 8H/4S, min/max 8B/8H (TODO confirm) +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU]?ADDL?V(v8i16|v4i32)v")>; +def : InstRW<[Ampere1Write_9cyc_3XY], + (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v")>; +// -- arithmetic, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(ADD|SUB)HNv.*")>; +def : InstRW<[Ampere1Write_5cyc_2XY], (instregex "(RADD|RSUB)HNv.*")>; +// -- arithmetic, pairwise +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^ADDPv", "^SADALP", "^UADALP", "^SADDLPv", "^UADDLPv")>; +// --
arithmetic, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQADD", "^SQSUB", "^SUQADD", "^UQADD", "^UQSUB", "^USQADD")>; +// -- bit count +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^(CLS|CLZ|CNT)v")>; +// -- compare +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^CMEQv", "^CMGEv", "^CMGTv", "^CMLEv", "^CMLTv", + "^CMHIv", "^CMHSv")>; +// -- compare non-zero +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^CMTSTv")>; +// -- dot product +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex "^(S|SU|U|US)DOTv")>; +// -- fp reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^FRECPEv", "^FRSQRTEv")>; +// -- integer reciprocal estimate +def : InstRW<[Ampere1Write_5cyc_1XY], (instregex "^URECPEv", "^URSQRTEv")>; +// -- logical +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>; +// -- logical, narrowing +def : InstRW<[Ampere1Write_5cyc_2XY], + (instregex "RSHRNv", + "SHRNv", "SQSHRNv", "SQSHRUNv", + "UQXTNv")>; +// -- matrix multiply +def : InstRW<[Ampere1Write_6cyc_2XY], + (instrs SMMLA, UMMLA, USMMLA)>; +// -- max/min +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXv", "^SMINv", "^UMAXv", "^UMINv")>; +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SMAXPv", "^SMINPv", "^UMAXPv", "^UMINPv")>; +// -- move immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^MOVIv", "^MVNIv")>; +// -- multiply +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MULv", "SMULLv", "UMULLv", "SQDMUL(H|L)v", "SQRDMULHv")>; +// -- multiply accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "MLAv", "MLSv", "(S|U|SQD)(MLAL|MLSL)v", "SQRDML(A|S)Hv")>; +// -- negation, saturating +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^SQABS", "^SQNEG")>; +// -- reverse bits/bytes +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^RBITv", "^REV16v", "^REV32v", "^REV64v")>; +// -- shift +def : InstRW<[Ampere1Write_3cyc_1XY], (instregex 
"^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>; +// -- shift and accumulate +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "SRSRAv", "SSRAv", "URSRAv", "USRAv")>; +// -- shift, saturating +def : InstRW<[Ampere1Write_3cyc_1XY], + (instregex "^SQRSHLv", "^SQRSHRNv", "^SQRSHRUNv", "^SQSHL", "^SQSHLU", + "^SQXTNv", "^SQXTUNv", "^UQSHRNv", "UQRSHRNv", "^UQRSHL", + "^UQSHL")>; + +// Vector miscellaneous instructions +// -- duplicate element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^DUPv.+lane")>; +// -- duplicate from GPR +def : InstRW<[Ampere1Write_5cyc_1BS], (instregex "^DUPv.+gpr")>; +// -- extract narrow +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^XTNv")>; +// -- insert/extract element +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^EXTv", "^INSv.+lane")>; +// -- move FP immediate +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^FMOVv")>; +// -- move element to GPR +def : InstRW<[Ampere1Write_6cyc_1XY_1Z], (instregex "(S|U)MOVv")>; +// -- move from GPR to any element +def : InstRW<[Ampere1Write_7cyc_1BS_1XY], (instregex "^INSv.+gpr")>; +// -- table lookup +def : InstRW<[Ampere1Write_2cyc_1XY], + (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>; +def : InstRW<[Ampere1Write_4cyc_2XY], + (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>; +def : InstRW<[Ampere1Write_6cyc_3XY], + (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>; +def : InstRW<[Ampere1Write_8cyc_4XY], + (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>; +// -- transpose +def : InstRW<[Ampere1Write_2cyc_1XY], + (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>; +// -- zip/unzip +def : InstRW<[Ampere1Write_2cyc_1XY], (instregex "^ZIP1v", "^ZIP2v")>; + +} // SchedModel = Ampere1Model diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AArch64/AArch64SchedPredAmpere.td @@ -0,0 +1,25 @@ +//===- AArch64SchedPredAmpere.td - 
AArch64 Sched Preds -----*- tablegen -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file defines scheduling predicate definitions that are used by the +// AArch64 Ampere Computing processors. +// +//===----------------------------------------------------------------------===// + +// Auxiliary predicates. + +// Check for a shift amount of 0 (any shift type), or an LSL shift by 1 to 4 +def AmpereCheapLSL : MCSchedPredicate< + CheckAny<[CheckShiftBy0, + CheckAll< + [CheckShiftLSL, + CheckAny< + [CheckShiftBy1, + CheckShiftBy2, + CheckShiftBy3, + CheckShiftBy4]>]>]>>; diff --git a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td --- a/llvm/lib/Target/AArch64/AArch64SchedPredicates.td +++ b/llvm/lib/Target/AArch64/AArch64SchedPredicates.td @@ -53,7 +53,7 @@ } // Check for shifting in arithmetic and logic instructions. -foreach I = {0-3, 8} in { +foreach I = {0-4, 8} in { let FunctionMapper = "AArch64_AM::getShiftValue" in def CheckShiftBy#I : CheckImmOperand<3, I>; } diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h --- a/llvm/lib/Target/AArch64/AArch64Subtarget.h +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h @@ -79,7 +79,8 @@ ThunderXT83, ThunderXT88, ThunderX3T110, - TSV110 + TSV110, + Ampere1 }; protected: diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -219,6 +219,12 @@ // FIXME: remove this to enable 64-bit SLP if performance looks good.
MinVectorRegisterBitWidth = 128; break; + case Ampere1: + CacheLineSize = 64; + PrefFunctionLogAlignment = 6; + PrefLoopLogAlignment = 6; + MaxInterleaveFactor = 4; + break; } } diff --git a/llvm/test/CodeGen/AArch64/cpus.ll b/llvm/test/CodeGen/AArch64/cpus.ll --- a/llvm/test/CodeGen/AArch64/cpus.ll +++ b/llvm/test/CodeGen/AArch64/cpus.ll @@ -34,6 +34,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=tsv110 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=apple-latest 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=a64fx 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=ampere1 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target diff --git a/llvm/test/CodeGen/AArch64/neon-dot-product.ll b/llvm/test/CodeGen/AArch64/neon-dot-product.ll --- a/llvm/test/CodeGen/AArch64/neon-dot-product.ll +++ b/llvm/test/CodeGen/AArch64/neon-dot-product.ll @@ -5,6 +5,7 @@ ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-e1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n1 < %s | FileCheck %s ; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=neoverse-n2 < %s | FileCheck %s +; RUN: llc -mtriple aarch64-none-linux-gnu -mcpu=ampere1 < %s | FileCheck %s declare <2 x i32> @llvm.aarch64.neon.udot.v2i32.v8i8(<2 x i32>, <8 x i8>, <8 x i8>) declare <4 x i32> @llvm.aarch64.neon.udot.v4i32.v16i8(<4 x i32>, <16 x i8>, <16 x i8>) diff --git a/llvm/test/CodeGen/AArch64/remat.ll b/llvm/test/CodeGen/AArch64/remat.ll --- a/llvm/test/CodeGen/AArch64/remat.ll +++ b/llvm/test/CodeGen/AArch64/remat.ll @@ -24,6 +24,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=tsv110 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx3t110 -o - 
%s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=ampere1 -o - %s | FileCheck %s %X = type { i64, i64, i64 } declare void @f(%X*) diff --git a/llvm/test/MC/AArch64/armv8.2a-dotprod.s b/llvm/test/MC/AArch64/armv8.2a-dotprod.s --- a/llvm/test/MC/AArch64/armv8.2a-dotprod.s +++ b/llvm/test/MC/AArch64/armv8.2a-dotprod.s @@ -13,6 +13,7 @@ // RUN: llvm-mc -triple aarch64 -mcpu=tsv110 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mcpu=cortex-r82 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: llvm-mc -triple aarch64 -mattr=+v8r -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD +// RUN: llvm-mc -triple aarch64 -mcpu=ampere1 -show-encoding < %s | FileCheck %s --check-prefix=CHECK-DOTPROD // RUN: not llvm-mc -triple aarch64 -mattr=+v8.2a -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s @@ -36,6 +37,8 @@ // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s // RUN: not llvm-mc -triple aarch64 -mcpu=neoverse-n2 -mattr=-dotprod -show-encoding < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s +// RUN: not llvm-mc -triple aarch64 -mcpu=ampere1 -mattr=-dotprod -show-encoding < %s 2> %t +// RUN: FileCheck --check-prefix=CHECK-NO-DOTPROD < %t %s udot v0.2s, v1.8b, v2.8b sdot v0.2s, v1.8b, v2.8b diff --git a/llvm/test/MC/AArch64/armv8.3a-rcpc.s b/llvm/test/MC/AArch64/armv8.3a-rcpc.s --- a/llvm/test/MC/AArch64/armv8.3a-rcpc.s +++ b/llvm/test/MC/AArch64/armv8.3a-rcpc.s @@ -6,6 +6,7 @@ // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=neoverse-e1 < %s 2>&1 | FileCheck %s // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=neoverse-n1 < %s 2>&1 | FileCheck %s // RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=neoverse-n2 < %s 2>&1 | FileCheck %s +// RUN: llvm-mc -triple aarch64-none-linux-gnu -show-encoding -mcpu=ampere1 < %s 2>&1 | FileCheck %s // RUN: llvm-mc -triple 
aarch64-none-linux-gnu -show-encoding -mattr=+v8.2a -mattr=+rcpc < %s 2>&1 | FileCheck %s // RUN: not llvm-mc -triple aarch64-none-linux-gnu -mattr=+v8.2a < %s 2> %t // RUN: FileCheck --check-prefix=CHECK-REQ %s < %t diff --git a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt --- a/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt +++ b/llvm/test/MC/Disassembler/AArch64/armv8.3a-rcpc.txt @@ -12,6 +12,7 @@ # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-e1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n1 --disassemble < %s | FileCheck %s # RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=neoverse-n2 --disassemble < %s | FileCheck %s +# RUN: llvm-mc -triple aarch64-none-linux-gnu -mcpu=ampere1 --disassemble < %s | FileCheck %s # CHECK: ldaprb w0, [x0] # CHECK: ldaprh w0, [x0] diff --git a/llvm/unittests/Support/TargetParserTest.cpp b/llvm/unittests/Support/TargetParserTest.cpp --- a/llvm/unittests/Support/TargetParserTest.cpp +++ b/llvm/unittests/Support/TargetParserTest.cpp @@ -1169,6 +1169,16 @@ AArch64::AEK_SVE2 | AArch64::AEK_SVE2BITPERM | AArch64::AEK_BF16 | AArch64::AEK_I8MM, "8.5-A"), + ARMCPUTestParams("ampere1", "armv8.6-a", "crypto-neon-fp-armv8", + AArch64::AEK_CRC | AArch64::AEK_CRYPTO | + AArch64::AEK_FP | AArch64::AEK_SIMD | + AArch64::AEK_FP16 | AArch64::AEK_RAS | + AArch64::AEK_LSE | + AArch64::AEK_DOTPROD | AArch64::AEK_RCPC | + AArch64::AEK_RDM | AArch64::AEK_MTE | + AArch64::AEK_SSBS | AArch64::AEK_SB | + AArch64::AEK_BF16 | AArch64::AEK_I8MM, + "8.6-A"), ARMCPUTestParams( "neoverse-512tvb", "armv8.4-a", "crypto-neon-fp-armv8", AArch64::AEK_RAS | AArch64::AEK_SVE | AArch64::AEK_SSBS | @@ -1232,7 +1242,7 @@ AArch64::AEK_LSE | AArch64::AEK_RDM, "8.2-A"))); -static constexpr unsigned NumAArch64CPUArchs = 52; +static constexpr unsigned NumAArch64CPUArchs = 53; TEST(TargetParserTest, testAArch64CPUArchList) { SmallVector List;