Index: lib/Target/AArch64/AArch64.td =================================================================== --- lib/Target/AArch64/AArch64.td +++ lib/Target/AArch64/AArch64.td @@ -90,6 +90,7 @@ include "AArch64SchedA53.td" include "AArch64SchedA57.td" include "AArch64SchedCyclone.td" +include "AArch64SchedM1.td" def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35", "Cortex-A35 ARM processors", @@ -144,8 +145,7 @@ // FIXME: Cortex-A72 is currently modelled as an Cortex-A57. def : ProcessorModel<"cortex-a72", CortexA57Model, [ProcA57]>; def : ProcessorModel<"cyclone", CycloneModel, [ProcCyclone]>; -// FIXME: Exynos-M1 is currently modelled without a specific SchedModel. -def : ProcessorModel<"exynos-m1", NoSchedModel, [ProcExynosM1]>; +def : ProcessorModel<"exynos-m1", ExynosM1Model, [ProcExynosM1]>; //===----------------------------------------------------------------------===// // Assembly parser Index: lib/Target/AArch64/AArch64SchedM1.td =================================================================== --- /dev/null +++ lib/Target/AArch64/AArch64SchedM1.td @@ -0,0 +1,313 @@ +//=- AArch64SchedM1.td - Samsung Exynos-M1 Scheduling Defs ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file defines the machine model for Samsung Exynos-M1 to support +// instruction scheduling and other instruction cost heuristics. +// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// The Exynos-M1 is a traditional superscalar microprocessor with a +// 4-wide in-order stage for decode and dispatch and a wider issue stage. +// The execution units and loads and stores are out-of-order. 
+
+// Machine-wide parameters of the Exynos-M1 scheduling model.
+def ExynosM1Model : SchedMachineModel {
+  let IssueWidth            =  4; // At most 4 uops per cycle.
+  let MinLatency            =  0; // Out-of-order.
+  let MicroOpBufferSize     = 96; // Size of the ROB.
+  let LoopMicroOpBufferSize = 32; // Size of the instruction queue.
+  let LoadLatency           =  4; // Optimistic cases.
+  let MispredictPenalty     = 14; // Minimum penalty.
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and the number available on the
+// Exynos-M1, which has 9 pipelines, each with its own queue and out-of-order
+// dispatch.
+
+def M1UnitA  : ProcResource<2>; // Simple integer
+def M1UnitC  : ProcResource<1>; // Simple and complex integer
+def M1UnitB  : ProcResource<2>; // Branch
+def M1UnitL  : ProcResource<1>; // Load
+def M1UnitS  : ProcResource<1>; // Store
+def M1UnitSD : ProcResource<1>; // Store FP
+def M1PipeF0 : ProcResource<1>; // FP #0
+def M1PipeF1 : ProcResource<1>; // FP #1
+
+// Units that are part of FP pipe #0.
+let Super = M1PipeF0 in {
+  def M1UnitFMAC   : ProcResource<1>; // FP multiplication
+  def M1UnitFCVT   : ProcResource<1>; // FP conversion
+  def M1UnitNAL0   : ProcResource<1>; // Simple vector
+  def M1UnitNMISC  : ProcResource<1>; // Miscellanea
+  def M1UnitNCRYPT : ProcResource<1>; // Cryptographic
+}
+
+// Units that are part of FP pipe #1.
+let Super = M1PipeF1 in {
+  def M1UnitFADD : ProcResource<1>; // Simple FP
+  // FP division & square root (serialized), hence the single-entry buffer.
+  def M1UnitFVAR : ProcResource<1> { let BufferSize = 1; }
+  def M1UnitNAL1 : ProcResource<1>; // Simple vector
+  def M1UnitFST  : ProcResource<1>; // FP store
+}
+
+// This scope stays open; it is closed by the matching
+// "} // SchedModel = ExynosM1Model" further down.
+let SchedModel = ExynosM1Model in {
+
+def M1UnitALU  : ProcResGroup<[M1UnitA, M1UnitC]>;       // All simple integer.
+def M1UnitNALU : ProcResGroup<[M1UnitNAL0, M1UnitNAL1]>; // All simple vector.
+
+//===----------------------------------------------------------------------===//
+// Coarse scheduling model for the Exynos-M1.
+
+// Branch instructions.
+// TODO: Non-conditional direct branches take zero cycles and units.
+// NOTE(review): every `WriteRes` in this coarse table appears without its
+// template arguments (the SchedWrite being described and the list of
+// processor resources it uses). Presumably they were stripped when this
+// patch was extracted; restore them from the original patch before applying.
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+// TODO: Branch and link behaves quite differently.
+
+// Arithmetic and logical integer instructions.
+def : WriteRes { let Latency = 1; }
+// TODO: Shifts over 3 and some extensions take 2 cycles.
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+// Move instructions.
+def : WriteRes { let Latency = 1; }
+
+// Divide and multiply instructions.
+// TODO: Division blocks the divider inside C.
+def : WriteRes { let Latency = 13; }
+def : WriteRes { let Latency = 21; }
+// TODO: Long multiplication takes 5 cycles and also uses the ALU.
+// TODO: Multiplication with accumulation can be advanced.
+def : WriteRes { let Latency = 3; }
+// TODO: 64-bit multiplication has a throughput of 1/2.
+def : WriteRes { let Latency = 4; }
+
+// Miscellaneous instructions.
+def : WriteRes { let Latency = 2; }
+
+// TODO: The latency for the post- or pre-indexed register is 1 cycle.
+def : WriteRes { let Latency = 0; }
+
+// Load instructions.
+def : WriteRes { let Latency = 4; }
+// TODO: Extended addressing also requires the ALU.
+def : WriteRes { let Latency = 5; }
+def : WriteRes { let Latency = 4; }
+
+// Store instructions.
+def : WriteRes { let Latency = 1; }
+// TODO: Extended addressing also requires the ALU.
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+// FP data instructions.
+def : WriteRes { let Latency = 3; }
+// TODO: FCCMP behaves quite differently.
+def : WriteRes { let Latency = 4; }
+// TODO: DP takes longer.
+def : WriteRes { let Latency = 15; }
+// TODO: MACC takes longer.
+def : WriteRes { let Latency = 4; }
+
+// FP miscellaneous instructions.
+// TODO: Conversion between register files behaves quite differently.
+def : WriteRes { let Latency = 3; }
+def : WriteRes { let Latency = 1; }
+// TODO: Copy from FPR to GPR behaves quite differently.
+def : WriteRes { let Latency = 4; }
+
+// FP load instructions.
+// TODO: ASIMD loads behave quite differently.
+// NOTE(review): the `WriteRes` and `ReadAdvance` defs in this section appear
+// without their template arguments; presumably they were stripped when this
+// patch was extracted. Restore them from the original patch before applying.
+def : WriteRes { let Latency = 5; }
+
+// FP store instructions.
+// TODO: ASIMD stores behave quite differently.
+def : WriteRes { let Latency = 1; }
+
+// ASIMD FP instructions.
+// TODO: Other operations behave quite differently.
+def : WriteRes { let Latency = 3; }
+
+// Other miscellaneous instructions.
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+def : WriteRes { let Latency = 1; }
+
+//===----------------------------------------------------------------------===//
+// Fast forwarding.
+
+// TODO: Add FP register forwarding rules.
+
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+// Integer multiply-accumulate.
+// TODO: The forwarding for WriteIM64 actually saves 3 cycles.
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+def : ReadAdvance;
+
+//===----------------------------------------------------------------------===//
+// Finer scheduling model for the Exynos-M1.
+
+// Branch instructions.
+// Branch-and-link write types: each occupies the branch unit plus one (BL)
+// or two (BLR) slots of the simple-integer group.
+def M1WriteBrLnk    : SchedWriteRes<[M1UnitB,
+                                     M1UnitALU]> { let Latency = 1; }
+def M1WriteBrLnkReg : SchedWriteRes<[M1UnitB,
+                                     M1UnitALU,
+                                     M1UnitALU]> { let Latency = 2; }
+
+def : InstRW<[M1WriteBrLnk], (instrs BL)>;
+def : InstRW<[M1WriteBrLnkReg], (instrs BLR)>;
+
+// Arithmetic and logical integer instructions.
+def : InstRW<[WriteI], (instrs COPY)>;
+def : InstRW<[WriteIM64], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous instructions.
+def : InstRW<[WriteExtr], (instrs EXTRWrri, EXTRXrri)>;
+// NOTE(review): unlike most patterns in this file, "BFM" carries no '^'
+// anchor -- presumably so that it also covers the SBFM/UBFM opcodes; confirm.
+def : InstRW<[WriteI], (instregex "BFM")>;
+
+// Load instructions.
+
+// Store instructions.
+
+// FP data instructions.
+// Single-resource write types; the numeric suffix is the latency in cycles.
+let Latency = 3 in {
+  def M1WriteFADD3 : SchedWriteRes<[M1UnitFADD]>;
+  def M1WriteFCVT3 : SchedWriteRes<[M1UnitFCVT]>;
+}
+let Latency = 5 in
+def M1WriteFMAC5 : SchedWriteRes<[M1UnitFMAC]>;
+let Latency = 15 in
+def M1WriteFVAR15 : SchedWriteRes<[M1UnitFVAR]>;
+let Latency = 23 in
+def M1WriteFVAR23 : SchedWriteRes<[M1UnitFVAR]>;
+let Latency = 2 in
+def M1WriteNMISC2 : SchedWriteRes<[M1UnitNMISC]>;
+
+// Per-opcode overrides for the scalar FP data instructions.
+def : InstRW<[M1WriteFADD3],
+             (instregex "^F(ADD|SUB)[DS]rr")>;
+def : InstRW<[M1WriteFVAR15],
+             (instrs FDIVSrr)>;
+def : InstRW<[M1WriteFVAR23],
+             (instrs FDIVDrr)>;
+def : InstRW<[M1WriteNMISC2],
+             (instregex "^F(MAX|MIN).+rr")>;
+def : InstRW<[M1WriteFMAC5],
+             (instregex "^FN?M(ADD|SUB)[DS]rrr")>;
+def : InstRW<[M1WriteFCVT3],
+             (instregex "^FRINT.+r")>;
+def : InstRW<[M1WriteFVAR15],
+             (instrs FSQRTSr)>;
+def : InstRW<[M1WriteFVAR23],
+             (instrs FSQRTDr)>;
+
+// FP miscellaneous instructions.
+// Multi-resource write types used by the conversions below.
+let Latency = 8 in
+def M1WriteASIMDE : SchedWriteRes<[M1UnitFCVT, M1UnitFST]>;
+let Latency = 13 in
+def M1WriteASIMDF : SchedWriteRes<[M1UnitFCVT, M1UnitFST, M1UnitL]>;
+
+def : InstRW<[M1WriteASIMDF],
+             (instregex "^[FSU]CVT[AMNPZ][SU](_Int)?[SU]?[XW]?[DS]?[rds]i?")>;
+def : InstRW<[M1WriteASIMDE],
+             (instregex "^[SU]CVTF")>;
+
+// FP load instructions.
+
+// FP store instructions.
+
+// ASIMD instructions.
+// Single-resource write types; the numeric suffix is the latency in cycles
+// (M1WriteNAL13 is unit NAL1 with latency 3).
+let Latency = 1 in
+def M1WriteNALU1 : SchedWriteRes<[M1UnitNALU]>;
+let Latency = 3 in
+def M1WriteNAL13 : SchedWriteRes<[M1UnitNAL1]>;
+let Latency = 1 in
+def M1WriteNMISC1 : SchedWriteRes<[M1UnitNMISC]>;
+let Latency = 3 in
+def M1WriteNMISC3 : SchedWriteRes<[M1UnitNMISC]>;
+let Latency = 4 in
+def M1WriteNMISC4 : SchedWriteRes<[M1UnitNMISC]>;
+
+// Per-opcode overrides for the ASIMD integer instructions.
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU]ABA")>;
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU]?ADDL?V")>;
+def : InstRW<[M1WriteNMISC1],
+             (instregex "^[SU](MIN|MAX)V")>;
+def : InstRW<[M1WriteNMISC4],
+             (instregex "^(MUL|SQR?DMULH)v")>;
+def : InstRW<[M1WriteNMISC4],
+             (instregex "^ML[AS]v")>;
+def : InstRW<[M1WriteNMISC4],
+             (instregex "^(S|U|SQD)ML[AS]L")>;
+def : InstRW<[M1WriteNMISC4],
+             (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU]ADALP")>;
+def : InstRW<[M1WriteNAL13],
+             (instregex "^(S|SR|U|UR)SRA")>;
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU]?(Q|R){1,2}SHR")>;
+// NOTE(review): names such as SQSHLv... match both this pattern and the
+// "[QR]{1,2}SHL" pattern two entries below; both map to M1WriteNMISC3.
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU]QSHLU?")>;
+def : InstRW<[M1WriteNALU1],
+             (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNMISC3],
+             (instregex "^[SU][QR]{1,2}SHL(b|d|h|s|v)")>;
+
+// ASIMD FP instructions.
+// Write type for pairwise FP add: two slots of the simple-vector group plus
+// the simple-FP unit.
+def M1WriteASIMDA : SchedWriteRes<[M1UnitNALU,
+                                   M1UnitNALU,
+                                   M1UnitFADD]> { let Latency = 9; }
+// FP multiplication unit, latency 4.
+def M1WriteFMAC4  : SchedWriteRes<[M1UnitFMAC]> { let Latency = 4; }
+
+// Per-opcode overrides for the ASIMD FP instructions. Every pattern is
+// anchored with '^' so that it can only match from the start of an opcode
+// name and never as a substring of some other opcode.
+def : InstRW<[M1WriteNMISC3], (instregex "^(FABD|FADD|FSUB)v")>;
+def : InstRW<[M1WriteASIMDA], (instregex "^FADDP")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^(FACGE|FACGT|FCMEQ|FCMGE|FCMGT|FCMLE|FCMLT)v")>;
+// NOTE(review): the 'V' in "[FVSU]" looks unintended (no other pattern in
+// this file refers to a VCVT-style name); confirm before removing it.
+def : InstRW<[M1WriteFCVT3],  (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?v")>;
+def : InstRW<[M1WriteFVAR15], (instregex "^FDIVv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "^FDIVv2f64")>;
+def : InstRW<[M1WriteFVAR15], (instregex "^FSQRTv.f32")>;
+def : InstRW<[M1WriteFVAR23], (instregex "^FSQRTv2f64")>;
+def : InstRW<[M1WriteNMISC1], (instregex "^(FMAX|FMIN)(NM)?V?v")>;
+def : InstRW<[M1WriteNMISC2], (instregex "^(FMAX|FMIN)(NM)?Pv")>;
+def : InstRW<[M1WriteFMAC4],  (instregex "^FMULX?v")>;
+def : InstRW<[M1WriteFMAC5],  (instregex "^FML[AS]v")>;
+def : InstRW<[M1WriteFCVT3],  (instregex "^FRINT[AIMNPXZ]v")>;
+
+// ASIMD miscellaneous instructions.
+// Multi-resource write types used by the lane moves below.
+let Latency = 5 in
+def M1WriteASIMDB : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
+let Latency = 6 in
+def M1WriteASIMDC : SchedWriteRes<[M1UnitNALU, M1UnitFST]>;
+let Latency = 10 in
+def M1WriteASIMDD : SchedWriteRes<[M1UnitNALU, M1UnitFST, M1UnitL]>;
+// Single-resource write types; the numeric suffix is the latency in cycles
+// (M1WriteNAL11 is unit NAL1 with latency 1).
+let Latency = 4 in
+def M1WriteFCVT4 : SchedWriteRes<[M1UnitFCVT]>;
+let Latency = 2 in
+def M1WriteNALU2 : SchedWriteRes<[M1UnitNALU]>;
+let Latency = 1 in
+def M1WriteNAL11 : SchedWriteRes<[M1UnitNAL1]>;
+
+// Per-opcode overrides for the ASIMD miscellaneous instructions.
+def : InstRW<[M1WriteNAL11],
+             (instregex "^(BIF|BIT|BSL)v16i8")>;
+def : InstRW<[M1WriteNALU1],
+             (instregex "^CPY")>;
+def : InstRW<[M1WriteASIMDB],
+             (instregex "^DUPv.+gpr")>;
+def : InstRW<[M1WriteNAL13],
+             (instregex "^[SU]QXTU?N")>;
+def : InstRW<[M1WriteFCVT4],
+             (instregex "^[FU](RECP|RSQRT)(E|X)v")>;
+def : InstRW<[M1WriteFMAC5],
+             (instregex "^F(RECP|RSQRT)Sv")>;
+// Table lookups: one M1WriteNAL11 per table register.
+def : InstRW<[M1WriteNAL11],
+             (instregex "^TB[LX]v(8|16)i8One")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 2>],
+             (instregex "^TB[LX]v(8|16)i8Two")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 3>],
+             (instregex "^TB[LX]v(8|16)i8Three")>;
+def : InstRW<[WriteSequence<[M1WriteNAL11], 4>],
+             (instregex "^TB[LX]v(8|16)i8Four")>;
+def : InstRW<[M1WriteASIMDD],
+             (instregex "^[SU]MOVv")>;
+def : InstRW<[M1WriteASIMDC],
+             (instregex "^INSv")>;
+def : InstRW<[M1WriteNALU2],
+             (instregex "^UZP(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[M1WriteNALU1],
+             (instregex "^ZIP(1|2)(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD load instructions.
+
+// ASIMD store instructions.
+
+// Cryptography instructions.
+// Write types on the cryptographic unit; the numeric suffix is the latency
+// in cycles.
+let Latency = 1 in
+def M1WriteNCRYPT1 : SchedWriteRes<[M1UnitNCRYPT]>;
+let Latency = 5 in
+def M1WriteNCRYPT5 : SchedWriteRes<[M1UnitNCRYPT]>;
+
+def : InstRW<[M1WriteNCRYPT1],
+             (instregex "^AES")>;
+def : InstRW<[M1WriteNCRYPT1],
+             (instregex "^PMUL")>;
+def : InstRW<[M1WriteNCRYPT1],
+             (instregex "^SHA1(H|SU)")>;
+def : InstRW<[M1WriteNCRYPT5],
+             (instregex "^SHA1[CMP]")>;
+def : InstRW<[M1WriteNCRYPT1],
+             (instregex "^SHA256SU0")>;
+def : InstRW<[M1WriteNCRYPT5],
+             (instregex "^SHA256(H|SU1)")>;
+
+// CRC instructions.
+// Write type on the complex-integer unit, latency 2.
+let Latency = 2 in
+def M1WriteC2 : SchedWriteRes<[M1UnitC]>;
+
+def : InstRW<[M1WriteC2],
+             (instregex "^CRC32")>;
+
+} // SchedModel = ExynosM1Model