Index: include/llvm/Support/AArch64TargetParser.def =================================================================== --- include/llvm/Support/AArch64TargetParser.def +++ include/llvm/Support/AArch64TargetParser.def @@ -70,6 +70,9 @@ (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO)) AARCH64_CPU_NAME("vulcan", AK_ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false, (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO)) +AARCH64_CPU_NAME("thunderx", AK_ARMV8_1A, FK_CRYPTO_NEON_FP_ARMV8, false, + (AArch64::AEK_SIMD | AArch64::AEK_CRC | AArch64::AEK_CRYPTO)) + // Invalid CPU AARCH64_CPU_NAME("invalid", AK_INVALID, FK_INVALID, true, AArch64::AEK_INVALID) #undef AARCH64_CPU_NAME Index: lib/Target/AArch64/AArch64.td =================================================================== --- lib/Target/AArch64/AArch64.td +++ lib/Target/AArch64/AArch64.td @@ -146,6 +146,7 @@ include "AArch64SchedM1.td" include "AArch64SchedKryo.td" include "AArch64SchedVulcan.td" +include "AArch64SchedThunderX.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", [ @@ -254,6 +255,18 @@ FeaturePredictableSelectIsExpensive, HasV8_1aOps]>; +def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX", + "ThunderX ARM processors", [ + FeatureCRC, + FeatureCrypto, + FeatureFPARMv8, + FeatureMergeNarrowLd, + FeatureNEON, + FeaturePerfMon, + FeaturePostRAScheduler, + HasV8_1aOps]>; + + def : ProcessorModel<"generic", NoSchedModel, [ FeatureCRC, FeatureFPARMv8, @@ -274,6 +287,7 @@ def : ProcessorModel<"exynos-m2", ExynosM1Model, [ProcExynosM1]>; def : ProcessorModel<"kryo", KryoModel, [ProcKryo]>; def : ProcessorModel<"vulcan", VulcanModel, [ProcVulcan]>; +def : ProcessorModel<"thunderx", ThunderXModel, [ProcThunderX]>; //===----------------------------------------------------------------------===// // Assembly parser Index: lib/Target/AArch64/AArch64SchedThunderX.td =================================================================== --- /dev/null +++ lib/Target/AArch64/AArch64SchedThunderX.td @@ -0,0 +1,289 @@ +//==- AArch64SchedThunderX.td - ThunderX Scheduling Definitions -*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the itinerary class data for the Cavium ThunderX processors. +// +//===----------------------------------------------------------------------===// + +// ===---------------------------------------------------------------------===// +// The following definitions describe the simpler per-operand machine model. +// This works with MachineScheduler. See MCSchedModel.h for details. + +// ThunderX machine model for scheduling and other instruction cost heuristics. +def ThunderXModel : SchedMachineModel { + let MicroOpBufferSize = 0; // Explicitly set to zero since ThunderX is in-order. + let IssueWidth = 2; // 2 micro-ops are dispatched per cycle. + let LoadLatency = 3; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 7; + let CompleteModel = 1; + let LoopMicroOpBufferSize = 8; +} + + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. 
def ThunderXUnitALU : ProcResource<2> { let BufferSize = 0; } // Int ALU
def ThunderXUnitMAC : ProcResource<1> { let BufferSize = 0; } // Int MAC
def ThunderXUnitDiv : ProcResource<1> { let BufferSize = 0; } // Int Division
def ThunderXUnitLdSt : ProcResource<1> { let BufferSize = 0; } // Load/Store
def ThunderXUnitB : ProcResource<1> { let BufferSize = 0; } // Branch
def ThunderXUnitFPALU : ProcResource<1> { let BufferSize = 0; } // FP ALU
def ThunderXUnitFPMDS : ProcResource<1> { let BufferSize = 0; } // FP Mult/Div/Sqrt


//===----------------------------------------------------------------------===//
// Subtarget-specific SchedWrite types which both map the ProcResources and
// set the latency.

let SchedModel = ThunderXModel in {

// ALU - Despite having a full latency of 4, most of the ALU instructions can
//       forward a cycle earlier and then two cycles earlier in the case of a
//       shift-only instruction. These latencies will be incorrect when the
//       result cannot be forwarded, but modeling isn't rocket surgery.
def : WriteRes<WriteImm, [ThunderXUnitALU]> { let Latency = 3; }
def : WriteRes<WriteI, [ThunderXUnitALU]> { let Latency = 3; }
def : WriteRes<WriteISReg, [ThunderXUnitALU]> { let Latency = 3; }
def : WriteRes<WriteIEReg, [ThunderXUnitALU]> { let Latency = 3; }
def : WriteRes<WriteIS, [ThunderXUnitALU]> { let Latency = 2; }
def : WriteRes<WriteExtr, [ThunderXUnitALU]> { let Latency = 3; }

// MAC
def : WriteRes<WriteIM32, [ThunderXUnitMAC]> { let Latency = 4; }
def : WriteRes<WriteIM64, [ThunderXUnitMAC]> { let Latency = 4; }

// Div
def : WriteRes<WriteID32, [ThunderXUnitDiv]> { let Latency = 15; }
def : WriteRes<WriteID64, [ThunderXUnitDiv]> { let Latency = 15; }

// Load
def : WriteRes<WriteLD, [ThunderXUnitLdSt]> { let Latency = 3; }
def : WriteRes<WriteLDIdx, [ThunderXUnitLdSt]> { let Latency = 3; }
def : WriteRes<WriteLDHi, [ThunderXUnitLdSt]> { let Latency = 3; }

// Vector Load - Vector loads take 1-5 cycles to issue. For the WriteVecLd
//               below, choosing the median of 3 which makes the latency 6.
//               May model this more carefully in the future. The remaining
//               ThunderXWriteVLD# types represent the 1-5 cycle issues explicitly.
def : WriteRes<WriteVLD, [ThunderXUnitLdSt]> { let Latency = 6;
                                               let ResourceCycles = [3]; }
def ThunderXWriteVLD1 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 4; }
def ThunderXWriteVLD2 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 12;
                                                            let ResourceCycles = [8]; }
def ThunderXWriteVLD3 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 14;
                                                            let ResourceCycles = [10]; }
def ThunderXWriteVLD4 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 16;
                                                            let ResourceCycles = [12]; }
def ThunderXWriteVLD5 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 16;
                                                            let ResourceCycles = [12]; }

// Pre/Post Indexing - Performed as part of address generation which is already
//                     accounted for in the WriteST* latencies below
def : WriteRes<WriteAdr, []> { let Latency = 0; }

// Store
def : WriteRes<WriteST, [ThunderXUnitLdSt]> { let Latency = 1; }
def : WriteRes<WriteSTP, [ThunderXUnitLdSt]> { let Latency = 1; }
def : WriteRes<WriteSTIdx, [ThunderXUnitLdSt]> { let Latency = 1; }
def : WriteRes<WriteSTX, [ThunderXUnitLdSt]> { let Latency = 1; }

// Vector Store - Similar to vector loads, can take 1-3 cycles to issue.
def : WriteRes<WriteVST, [ThunderXUnitLdSt]> { let Latency = 5;
                                               let ResourceCycles = [2]; }
def ThunderXWriteVST1 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 4; }
def ThunderXWriteVST2 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 5;
                                                            let ResourceCycles = [2]; }
def ThunderXWriteVST3 : SchedWriteRes<[ThunderXUnitLdSt]> { let Latency = 6;
                                                            let ResourceCycles = [3]; }

def : WriteRes<WriteAtomic, []> { let Unsupported = 1; }

// Branch
def : WriteRes<WriteBr, [ThunderXUnitB]>;
def : WriteRes<WriteBrReg, [ThunderXUnitB]>;
def : WriteRes<WriteSys, [ThunderXUnitB]>;
def : WriteRes<WriteBarrier, [ThunderXUnitB]>;
def : WriteRes<WriteHint, [ThunderXUnitB]>;

// FP ALU
def : WriteRes<WriteF, [ThunderXUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCmp, [ThunderXUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCvt, [ThunderXUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFCopy, [ThunderXUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteFImm, [ThunderXUnitFPALU]> { let Latency = 6; }
def : WriteRes<WriteV, [ThunderXUnitFPALU]> { let Latency = 6; }

// FP Mul, Div, Sqrt
def : WriteRes<WriteFMul, [ThunderXUnitFPMDS]> { let Latency = 6; }
def : WriteRes<WriteFDiv, [ThunderXUnitFPMDS]> { let Latency = 33;
                                                 let ResourceCycles = [29]; }
def ThunderXWriteFMAC : SchedWriteRes<[ThunderXUnitFPMDS]> { let Latency = 10; }
def ThunderXWriteFDivSP : SchedWriteRes<[ThunderXUnitFPMDS]> { let Latency = 18;
                                                               let ResourceCycles = [14]; }
def ThunderXWriteFDivDP : SchedWriteRes<[ThunderXUnitFPMDS]> { let Latency = 33;
                                                               let ResourceCycles = [29]; }
def ThunderXWriteFSqrtSP : SchedWriteRes<[ThunderXUnitFPMDS]> { let Latency = 17;
                                                                let ResourceCycles = [13]; }
def ThunderXWriteFSqrtDP : SchedWriteRes<[ThunderXUnitFPMDS]> { let Latency = 32;
                                                                let ResourceCycles = [28]; }

//===----------------------------------------------------------------------===//
// Subtarget-specific SchedRead types.

// No forwarding for these reads.
def : ReadAdvance<ReadExtrHi, 0>;
def : ReadAdvance<ReadAdrBase, 0>;
def : ReadAdvance<ReadVLD, 0>;

// ALU - Most operands in the ALU pipes are not needed for two cycles. Shiftable
//       operands are needed one cycle later if and only if they are to be
//       shifted. Otherwise, they too are needed two cycles later. This same
//       ReadAdvance applies to Extended registers as well, even though there is
//       a separate SchedPredicate for them.
def : ReadAdvance<ReadI, 2, [WriteImm,WriteI,
                             WriteISReg, WriteIEReg,WriteIS,
                             WriteID32,WriteID64,
                             WriteIM32,WriteIM64]>;
def ThunderXReadShifted : SchedReadAdvance<1, [WriteImm,WriteI,
                                               WriteISReg, WriteIEReg,WriteIS,
                                               WriteID32,WriteID64,
                                               WriteIM32,WriteIM64]>;
def ThunderXReadNotShifted : SchedReadAdvance<2, [WriteImm,WriteI,
                                                  WriteISReg, WriteIEReg,WriteIS,
                                                  WriteID32,WriteID64,
                                                  WriteIM32,WriteIM64]>;
def ThunderXReadISReg : SchedReadVariant<[
  SchedVar<RegShiftedPred, [ThunderXReadShifted]>,
  SchedVar<NoSchedPred, [ThunderXReadNotShifted]>]>;
def : SchedAlias<ReadISReg, ThunderXReadISReg>;

def ThunderXReadIEReg : SchedReadVariant<[
  SchedVar<RegExtendedPred, [ThunderXReadShifted]>,
  SchedVar<NoSchedPred, [ThunderXReadNotShifted]>]>;
def : SchedAlias<ReadIEReg, ThunderXReadIEReg>;

// MAC - Operands are generally needed one cycle later in the MAC pipe.
//       Accumulator operands are needed two cycles later.
def : ReadAdvance<ReadIM, 1>;
def : ReadAdvance<ReadIMA, 2>;

// Div
def : ReadAdvance<ReadID, 1>;

//===----------------------------------------------------------------------===//
// Subtarget-specific InstRWs.
+ +//--- +// Miscellaneous +//--- +def : InstRW<[WriteI], (instrs COPY)>; + +//--- +// Vector Loads +//--- +def : InstRW<[ThunderXWriteVLD1], (instregex "LD1i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVLD1], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD1], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD2], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD3], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD4], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD1, WriteAdr], (instregex "LD1i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVLD1, WriteAdr], (instregex "LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD1, WriteAdr], (instregex "LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD3, WriteAdr], (instregex "LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD4, WriteAdr], (instregex "LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[ThunderXWriteVLD1], (instregex "LD2i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVLD1], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD2], (instregex "LD2Twov(8b|4h|2s)$")>; +def : InstRW<[ThunderXWriteVLD4], (instregex "LD2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD1, WriteAdr], (instregex "LD2i(8|16|32|64)(_POST)?$")>; +def : InstRW<[ThunderXWriteVLD1, WriteAdr], (instregex "LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)(_POST)?$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD2Twov(8b|4h|2s)(_POST)?$")>; +def : InstRW<[ThunderXWriteVLD4, WriteAdr], (instregex "LD2Twov(16b|8h|4s|2d)(_POST)?$")>; + +def : InstRW<[ThunderXWriteVLD2], (instregex "LD3i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVLD2], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD4], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[ThunderXWriteVLD3], (instregex "LD3Threev(2d)$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD3i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD4, WriteAdr], (instregex "LD3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[ThunderXWriteVLD3, WriteAdr], (instregex "LD3Threev(2d)_POST$")>; + +def : InstRW<[ThunderXWriteVLD2], (instregex "LD4i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVLD2], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVLD5], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[ThunderXWriteVLD4], (instregex "LD4Fourv(2d)$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD4i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVLD2, WriteAdr], (instregex "LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVLD5, WriteAdr], (instregex "LD4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[ThunderXWriteVLD4, WriteAdr], (instregex "LD4Fourv(2d)_POST$")>; + +//--- +// Vector Stores +//--- +def : InstRW<[ThunderXWriteVST1], (instregex "ST1i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVST1], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVST1], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVST2], (instregex 
"ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVST2], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVST1, WriteAdr], (instregex "ST1i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVST1, WriteAdr], (instregex "ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVST1, WriteAdr], (instregex "ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>; + +def : InstRW<[ThunderXWriteVST1], (instregex "ST2i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVST1], (instregex "ST2Twov(8b|4h|2s)$")>; +def : InstRW<[ThunderXWriteVST2], (instregex "ST2Twov(16b|8h|4s|2d)$")>; +def : InstRW<[ThunderXWriteVST1, WriteAdr], (instregex "ST2i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVST1, WriteAdr], (instregex "ST2Twov(8b|4h|2s)_POST$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST2Twov(16b|8h|4s|2d)_POST$")>; + +def : InstRW<[ThunderXWriteVST2], (instregex "ST3i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVST3], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[ThunderXWriteVST2], (instregex "ST3Threev(2d)$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST3i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVST3, WriteAdr], (instregex "ST3Threev(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST3Threev(2d)_POST$")>; + +def : InstRW<[ThunderXWriteVST2], (instregex "ST4i(8|16|32|64)$")>; +def : InstRW<[ThunderXWriteVST3], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)$")>; +def : InstRW<[ThunderXWriteVST2], (instregex "ST4Fourv(2d)$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST4i(8|16|32|64)_POST$")>; +def : InstRW<[ThunderXWriteVST3, WriteAdr], (instregex "ST4Fourv(8b|4h|2s|1d|16b|8h|4s)_POST$")>; +def : InstRW<[ThunderXWriteVST2, WriteAdr], (instregex "ST4Fourv(2d)_POST$")>; + +//--- +// Floating Point MAC, DIV, SQRT +//--- +def : InstRW<[ThunderXWriteFMAC], (instregex "^FN?M(ADD|SUB).*")>; +def : InstRW<[ThunderXWriteFMAC], (instregex "^FML(A|S).*")>; +def : InstRW<[ThunderXWriteFDivSP], (instrs FDIVSrr)>; +def : InstRW<[ThunderXWriteFDivDP], (instrs FDIVDrr)>; +def : InstRW<[ThunderXWriteFDivSP], (instregex "^FDIVv.*32$")>; +def : InstRW<[ThunderXWriteFDivDP], (instregex "^FDIVv.*64$")>; +def : InstRW<[ThunderXWriteFSqrtSP], (instregex "^.*SQRT.*32$")>; +def : InstRW<[ThunderXWriteFSqrtDP], (instregex "^.*SQRT.*64$")>; + +} Index: lib/Target/AArch64/AArch64Subtarget.h =================================================================== --- lib/Target/AArch64/AArch64Subtarget.h +++ lib/Target/AArch64/AArch64Subtarget.h @@ -44,7 +44,8 @@ Cyclone, ExynosM1, Kryo, - Vulcan + Vulcan, + ThunderX }; protected: Index: lib/Target/AArch64/AArch64Subtarget.cpp =================================================================== --- lib/Target/AArch64/AArch64Subtarget.cpp +++ lib/Target/AArch64/AArch64Subtarget.cpp @@ -74,6 +74,15 @@ MinPrefetchStride = 1024; MaxPrefetchIterationsAhead = 11; break; + case ThunderX: + CacheLineSize = 128; + PrefetchDistance = 256; + PrefFunctionAlignment = 5; + PrefLoopAlignment = 5; + MinPrefetchStride = 8; + MaxPrefetchIterationsAhead = 7; + MergeNarrowLoads = true; + break; case Vulcan: MaxInterleaveFactor = 4; break; Index: test/CodeGen/AArch64/cpus.ll 
=================================================================== --- test/CodeGen/AArch64/cpus.ll +++ test/CodeGen/AArch64/cpus.ll @@ -11,6 +11,7 @@ ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=exynos-m2 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=kryo 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=vulcan 2>&1 | FileCheck %s +; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx 2>&1 | FileCheck %s ; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=invalidcpu 2>&1 | FileCheck %s --check-prefix=INVALID ; CHECK-NOT: {{.*}} is not a recognized processor for this target Index: test/CodeGen/AArch64/remat.ll =================================================================== --- test/CodeGen/AArch64/remat.ll +++ test/CodeGen/AArch64/remat.ll @@ -7,6 +7,7 @@ ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=exynos-m2 -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=kryo -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=vulcan -o - %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnuabi -mcpu=thunderx -o - %s | FileCheck %s ; RUN: llc -mtriple=aarch64-linux-gnuabi -mattr=+custom-cheap-as-move -o - %s | FileCheck %s %X = type { i64, i64, i64 }
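Quick local check (not part of the patch): the new CPU name can be smoke-tested in the same style as the cpus.ll RUN lines above. Only the -mcpu=thunderx RUN line and the CHECK-NOT pattern come from the existing test; the IR function below is an arbitrary, hypothetical example.

; RUN: llc < %s -mtriple=arm64-unknown-unknown -mcpu=thunderx 2>&1 | FileCheck %s
; CHECK-NOT: {{.*}} is not a recognized processor for this target

; Trivial function, enough to confirm that -mcpu=thunderx is accepted and that
; codegen runs with the ThunderX subtarget, without depending on any
; particular instruction sequence.
define i32 @add(i32 %a, i32 %b) {
entry:
  %sum = add nsw i32 %a, %b
  ret i32 %sum
}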