Index: lib/Target/RISCV/RISCV.td
===================================================================
--- lib/Target/RISCV/RISCV.td
+++ lib/Target/RISCV/RISCV.td
@@ -55,19 +55,37 @@
def RV64 : HwMode<"+64bit">;
def RV32 : HwMode<"-64bit">;
+// Use the MachineScheduler for instruction scheduling for the subtarget.
+def FeatureUseMISched : SubtargetFeature<"use-misched", "UseMISched", "true",
+                                         "Use the MachineScheduler">;
+
+// Disable the post-register-allocation scheduler
+// (pipeline: pre-RA scheduling -> RA -> post-RA scheduling).
+def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
+    "DisablePostRAScheduler", "true",
+    "Don't schedule instructions after register allocation">;
+
//===----------------------------------------------------------------------===//
// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
include "RISCVRegisterInfo.td"
+include "RISCVRegisterBanks.td"
include "RISCVCallingConv.td"
include "RISCVInstrInfo.td"
+
+//===----------------------------------------------------------------------===//
+// RISCV schedules.
+//===----------------------------------------------------------------------===//
+//
+include "RISCVSchedule.td"
+
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
//===----------------------------------------------------------------------===// -def : ProcessorModel<"generic-rv32", NoSchedModel, []>; +def : ProcessorModel<"generic-rv32", GRV32Model, [FeatureUseMISched, + FeatureNoPostRASched]>; def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>; Index: lib/Target/RISCV/RISCVISelLowering.h =================================================================== --- lib/Target/RISCV/RISCVISelLowering.h +++ lib/Target/RISCV/RISCVISelLowering.h @@ -16,6 +16,7 @@ #define LLVM_LIB_TARGET_RISCV_RISCVISELLOWERING_H #include "RISCV.h" +#include "llvm/CodeGen/CallingConvLower.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLowering.h" @@ -51,6 +52,13 @@ EmitInstrWithCustomInserter(MachineInstr &MI, MachineBasicBlock *BB) const override; + CCAssignFn *CCAssignFnForCall(const DataLayout &DL, bool IsFixed = false, + Type *OrigTy = nullptr) const; + CCAssignFn *CCAssignFnForReturn(const DataLayout &DL, bool IsFixed = false, + Type *OrigTy = nullptr) const; + + Sched::Preference getSchedulingPreference(SDNode *N) const override; + private: void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, Index: lib/Target/RISCV/RISCVISelLowering.cpp =================================================================== --- lib/Target/RISCV/RISCVISelLowering.cpp +++ lib/Target/RISCV/RISCVISelLowering.cpp @@ -475,20 +475,11 @@ return false; } -// Implements the RISC-V calling convention. Returns true upon failure. -static bool CC_RISCV(const DataLayout &DL, unsigned ValNo, MVT ValVT, MVT LocVT, - CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, - CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { - unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); - assert(XLen == 32 || XLen == 64); - MVT XLenVT = XLen == 32 ? 
MVT::i32 : MVT::i64; - assert(ValVT == XLenVT && "Unexpected ValVT"); - assert(LocVT == XLenVT && "Unexpected LocVT"); - - // Any return value split in to more than two values can't be returned - // directly. - if (IsRet && ValNo > 1) - return true; +static bool CC_RISCVFn(unsigned XLen, MVT XLenVT, unsigned ValNo, MVT ValVT, + MVT LocVT, CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State, + bool IsFixed = false, Type *OrigTy = nullptr) { + const DataLayout &DL = State.getMachineFunction().getDataLayout(); // If this is a variadic argument, the RISC-V calling convention requires // that it is assigned an 'even' or 'aligned' register if it has 8-byte @@ -573,6 +564,94 @@ return false; } +static bool CC_RISCV32Fn(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State) { + unsigned XLen = 32; + MVT XLenVT = MVT::i32; + + return CC_RISCVFn(XLen, XLenVT, ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +static bool CC_RISCV64Fn(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State) { + unsigned XLen = 64; + MVT XLenVT = MVT::i64; + + return CC_RISCVFn(XLen, XLenVT, ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +static bool RetCC_RISCV32Fn(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned XLen = 32; + MVT XLenVT = MVT::i32; + + if (ValNo > 1) + return true; + + return CC_RISCVFn(XLen, XLenVT, ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +static bool RetCC_RISCV64Fn(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, + ISD::ArgFlagsTy ArgFlags, CCState &State) { + unsigned XLen = 64; + MVT XLenVT = MVT::i64; + + if (ValNo > 1) + return true; + + return CC_RISCVFn(XLen, XLenVT, ValNo, ValVT, LocVT, LocInfo, ArgFlags, + State); +} + +CCAssignFn *RISCVTargetLowering::CCAssignFnForCall(const DataLayout &DL, + bool 
IsFixed, + Type *OrigTy) const { + assert(IsFixed && "IsFixed support not yet implemented"); + assert(OrigTy && "OrigTy support not yet implemented"); + if (DL.getLargestLegalIntTypeSizeInBits() == 32) + return CC_RISCV32Fn; + + return CC_RISCV64Fn; +} + +CCAssignFn *RISCVTargetLowering::CCAssignFnForReturn(const DataLayout &DL, + bool IsFixed, + Type *OrigTy) const { + assert(IsFixed && "IsFixed support not yet implemented"); + assert(OrigTy && "OrigTy support not yet implemented"); + if (DL.getLargestLegalIntTypeSizeInBits() == 32) + return RetCC_RISCV32Fn; + + return RetCC_RISCV64Fn; +} + +// Implements the RISC-V calling convention. Returns true upon failure. +static bool CC_RISCV(unsigned ValNo, MVT ValVT, MVT LocVT, + CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags, + CCState &State, bool IsFixed, bool IsRet, Type *OrigTy) { + const DataLayout &DL = State.getMachineFunction().getDataLayout(); + unsigned XLen = DL.getLargestLegalIntTypeSizeInBits(); + assert(XLen == 32 || XLen == 64); + MVT XLenVT = XLen == 32 ? MVT::i32 : MVT::i64; + assert(ValVT == XLenVT && "Unexpected ValVT"); + assert(LocVT == XLenVT && "Unexpected LocVT"); + + // Any return value split in to more than two values can't be returned + // directly. 
+ if (IsRet && ValNo > 1) + return true; + + return CC_RISCVFn(XLen, XLenVT, ValNo, ValVT, LocVT, LocInfo, ArgFlags, State, + IsFixed, OrigTy); +} + void RISCVTargetLowering::analyzeInputArgs( MachineFunction &MF, CCState &CCInfo, const SmallVectorImpl &Ins, bool IsRet) const { @@ -589,8 +668,8 @@ else if (Ins[i].isOrigArg()) ArgTy = FType->getParamType(Ins[i].getOrigArgIndex()); - if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) { + if (CC_RISCV(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, + /*IsFixed=*/true, IsRet, ArgTy)) { DEBUG(dbgs() << "InputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << '\n'); llvm_unreachable(nullptr); @@ -609,8 +688,8 @@ ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; Type *OrigTy = CLI ? CLI->getArgs()[Outs[i].OrigArgIndex].Ty : nullptr; - if (CC_RISCV(MF.getDataLayout(), i, ArgVT, ArgVT, CCValAssign::Full, - ArgFlags, CCInfo, Outs[i].IsFixed, IsRet, OrigTy)) { + if (CC_RISCV(i, ArgVT, ArgVT, CCValAssign::Full, ArgFlags, CCInfo, + Outs[i].IsFixed, IsRet, OrigTy)) { DEBUG(dbgs() << "OutputArg #" << i << " has unhandled type " << EVT(ArgVT).getEVTString() << "\n"); llvm_unreachable(nullptr); @@ -987,8 +1066,8 @@ for (unsigned i = 0, e = Outs.size(); i != e; ++i) { MVT VT = Outs[i].VT; ISD::ArgFlagsTy ArgFlags = Outs[i].Flags; - if (CC_RISCV(MF.getDataLayout(), i, VT, VT, CCValAssign::Full, ArgFlags, - CCInfo, /*IsFixed=*/true, /*IsRet=*/true, nullptr)) + if (CC_RISCV(i, VT, VT, CCValAssign::Full, ArgFlags, CCInfo, + /*IsFixed=*/true, /*IsRet=*/true, nullptr)) return false; } return true; @@ -1069,3 +1148,31 @@ return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT); } + +Sched::Preference RISCVTargetLowering::getSchedulingPreference( + SDNode *N) const { + unsigned NumVals = N->getNumValues(); + if (!NumVals) + return Sched::RegPressure; + + for (unsigned i = 0; i != NumVals; ++i) { + EVT VT = N->getValueType(i); + if (VT == 
MVT::Glue || VT == MVT::Other) + continue; + if (VT.isFloatingPoint() || VT.isVector()) + return Sched::ILP; + } + + if (!N->isMachineOpcode()) + return Sched::RegPressure; + + // Load are scheduled for latency even if there instruction itinerary + // is not available. + const TargetInstrInfo *TII = Subtarget.getInstrInfo(); + const MCInstrDesc &MCID = TII->get(N->getMachineOpcode()); + + if (MCID.getNumDefs() == 0) + return Sched::RegPressure; + + return Sched::RegPressure; +} Index: lib/Target/RISCV/RISCVSchedule.td =================================================================== --- lib/Target/RISCV/RISCVSchedule.td +++ lib/Target/RISCV/RISCVSchedule.td @@ -0,0 +1,95 @@ +//===-- RISCVSchedule.td - RISCV Scheduling Definitions ----*- tablegen -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +//===----------------------------------------------------------------------===// + +//===----------------------------------------------------------------------===// +// Sched definitions for integer pipeline instructions +// +// Basic ALU operation. +def WriteALU : SchedWrite; +def ReadALU : SchedRead; + +// Basic ALU with shifts. +def WriteALUsi : SchedWrite; // Shift by immediate. +def WriteALUsr : SchedWrite; // Shift by register. +def WriteALUSsr : SchedWrite; // Shift by register (flag setting). +def ReadALUsr : SchedRead; // Some operands are read later. + +// Compares. +def WriteCMP : SchedWrite; +def WriteCMPsi : SchedWrite; +def WriteCMPsr : SchedWrite; + +// Multiplys. +def WriteMUL16 : SchedWrite; // 16-bit multiply. +def WriteMUL32 : SchedWrite; // 32-bit multiply. +def WriteMUL64Lo : SchedWrite; // 64-bit result. Low reg. +def WriteMUL64Hi : SchedWrite; // 64-bit result. High reg. +def ReadMUL : SchedRead; + +// Multiply-accumulates. 
+def WriteMAC16 : SchedWrite; // 16-bit mac. +def WriteMAC32 : SchedWrite; // 32-bit mac. +def WriteMAC64Lo : SchedWrite; // 64-bit mac. Low reg. +def WriteMAC64Hi : SchedWrite; // 64-bit mac. High reg. +def ReadMAC : SchedRead; + +// Divisions. +def WriteDIV : SchedWrite; + +// Loads/Stores. +def WriteLd : SchedWrite; +def WritePreLd : SchedWrite; +def WriteST : SchedWrite; + +// Branches. +def WriteBr : SchedWrite; +def WriteBrL : SchedWrite; +def WriteBrTbl : SchedWrite; + +// Noop. +def WriteNoop : SchedWrite; + +//===----------------------------------------------------------------------===// +// Sched definitions for floating-point and neon instructions +// +// Floating point conversions +def WriteFPCVT : SchedWrite; +def WriteFPMOV : SchedWrite; // FP -> GPR and vice-versa + +// ALU operations (32/64-bit) +def WriteFPALU32 : SchedWrite; +def WriteFPALU64 : SchedWrite; + +// Multiplication +def WriteFPMUL32 : SchedWrite; +def WriteFPMUL64 : SchedWrite; +def ReadFPMUL : SchedRead; // multiplier read +def ReadFPMAC : SchedRead; // accumulator read + +// Multiply-accumulate +def WriteFPMAC32 : SchedWrite; +def WriteFPMAC64 : SchedWrite; + +// Division +def WriteFPDIV32 : SchedWrite; +def WriteFPDIV64 : SchedWrite; + +// Square-root +def WriteFPSQRT32 : SchedWrite; +def WriteFPSQRT64 : SchedWrite; + +// Vector load and stores +def WriteVLD1 : SchedWrite; +def WriteVST1 : SchedWrite; + +//===----------------------------------------------------------------------===// +// Processor instruction itineraries. + +include "RISCVScheduleGRV32.td" Index: lib/Target/RISCV/RISCVScheduleGRV32.td =================================================================== --- lib/Target/RISCV/RISCVScheduleGRV32.td +++ lib/Target/RISCV/RISCVScheduleGRV32.td @@ -0,0 +1,477 @@ +//=- RISCVScheduleGRV32.td - RISCV Generic RV32 Scheduling ---*- tablegen -*-=// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. 
See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the itinerary class data for the RISCV Generic RV32
+// processors.
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// This section contains legacy support for itineraries. This is
+// required until SD and PostRA schedulers are replaced by MachineScheduler.
+
+//
+// FIXME: These itineraries are placeholders. Replace them with values derived
+// from the technical reference manual of a concrete RV32 implementation once
+// scheduling data for such a core is available.
+//
+// Functional units
+def GRV32_Issue0 : FuncUnit; // Issue 0
+def GRV32_Issue1 : FuncUnit; // Issue 1
+def GRV32_Branch : FuncUnit; // Branch
+def GRV32_ALU0 : FuncUnit; // ALU / MUL pipeline 0
+def GRV32_ALU1 : FuncUnit; // ALU pipeline 1
+def GRV32_AGU : FuncUnit; // Address generation unit for ld / st
+def GRV32_WPipe : FuncUnit; // Write pipeline (placeholder; TODO confirm role)
+def GRV32_DRegsVFP : FuncUnit; // FP register set, VFP side
+
+// Bypasses
+def GRV32_LdBypass : Bypass;
+
+def GRV32Itineraries : ProcessorItineraries<
+  [GRV32_Issue0, GRV32_Issue1, GRV32_Branch, GRV32_ALU0, GRV32_ALU1, GRV32_AGU,
+   GRV32_WPipe, GRV32_DRegsVFP],
+  [GRV32_LdBypass], [
+]>;
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler and will eventually replace itineraries.
+
+class GRV32WriteLMOpsListType<list<SchedWrite> writes> {
+  list<SchedWrite> Writes = writes;
+  SchedMachineModel SchedModel = ?;
+}
+
+// Generic RV32 machine model for scheduling and other instruction cost heuristics.
+def GRV32Model : SchedMachineModel {
+  let IssueWidth = 2; // 2 micro-ops are dispatched per cycle.
+  let MicroOpBufferSize = 56; // Based on available renamed registers.
+ let LoadLatency = 2; // Optimistic load latency assuming bypass. + // This is overriden by OperandCycles if the + // Itineraries are queried instead. + let MispredictPenalty = 8; // Based on estimate of pipeline depth. + + let Itineraries = GRV32Itineraries; + + // FIXME: Many vector operations were never given an itinerary. We + // haven't mapped these to the new model either. + let CompleteModel = 0; +} + +//===----------------------------------------------------------------------===// +// Define each kind of processor resource and number available. +// +// The AGU unit has BufferSize=1 so that the latency between operations +// that use it are considered to stall other operations. +// +// The FP unit has BufferSize=0 so that it is a hard dispatch +// hazard. No instruction may be dispatched while the unit is reserved. + +let SchedModel = GRV32Model in { + +def GRV32UnitALU : ProcResource<2>; +def GRV32UnitMul : ProcResource<1> { let Super = GRV32UnitALU; } +def GRV32UnitAGU : ProcResource<1> { let BufferSize = 1; } +def GRV32UnitLS : ProcResource<1>; +def GRV32UnitFP : ProcResource<1> { let BufferSize = 0; } +def GRV32UnitB : ProcResource<1>; + +//===----------------------------------------------------------------------===// +// Define scheduler read/write types with their resources and latency on GRV32. + +// Consume an issue slot, but no processor resources. This is useful when all +// other writes associated with the operand have NumMicroOps = 0. +def GRV32WriteIssue : SchedWriteRes<[]> { let Latency = 0; } + +// Write an integer register. +def GRV32WriteI : SchedWriteRes<[GRV32UnitALU]>; +// Write an integer shifted-by register +def GRV32WriteIsr : SchedWriteRes<[GRV32UnitALU]> { let Latency = 2; } + +// Basic ALU. +def GRV32WriteALU : SchedWriteRes<[GRV32UnitALU]>; +// ALU with operand shifted by immediate. +def : WriteRes { let Latency = 2; } +// ALU with operand shifted by register. 
+def GRV32WriteALUsr : SchedWriteRes<[GRV32UnitALU]> { let Latency = 3; } + +// Multiplication +def GRV32WriteM : SchedWriteRes<[GRV32UnitMul, GRV32UnitMul]> { let Latency = 4; } +def GRV32WriteMHi : SchedWriteRes<[GRV32UnitMul]> { let Latency = 5; + let NumMicroOps = 0; } +def GRV32WriteM16 : SchedWriteRes<[GRV32UnitMul]> { let Latency = 3; } +def GRV32WriteM16Hi : SchedWriteRes<[GRV32UnitMul]> { let Latency = 4; + let NumMicroOps = 0; } +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : ReadAdvance; +def : ReadAdvance; + +// Floating-point +// Only one FP or AGU instruction may issue per cycle. We model this +// by having FP instructions consume the AGU resource. +def GRV32WriteF : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 4; } +def GRV32WriteFMov : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 1; } +def GRV32WriteFMulS : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 5; } +def GRV32WriteFMulD : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 6; } +def GRV32WriteFMAS : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 8; } + +def GRV32WriteFMAD : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 9; } +def GRV32WriteFDivS : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 15; } +def GRV32WriteFDivD : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 25; } +def GRV32WriteFSqrtS : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 17; } +def GRV32WriteFSqrtD : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 32; } + +// NEON has an odd mix of latencies. Simply name the write types by latency. 
+def GRV32WriteV1 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 1; } +def GRV32WriteV2 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 2; } +def GRV32WriteV3 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 3; } +def GRV32WriteV4 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 4; } +def GRV32WriteV5 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 5; } +def GRV32WriteV6 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 6; } +def GRV32WriteV7 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 7; } +def GRV32WriteV9 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 9; } +def GRV32WriteV10 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { let Latency = 10; } + +def : WriteRes; +def : WriteRes; + +// Reserve GRV32UnitFP for 2 consecutive cycles. +def GRV32Write2V4 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { + let Latency = 4; + let ResourceCycles = [2]; +} +def GRV32Write2V7 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { + let Latency = 7; + let ResourceCycles = [2]; +} +def GRV32Write2V9 : SchedWriteRes<[GRV32UnitFP, GRV32UnitAGU]> { + let Latency = 9; + let ResourceCycles = [2]; +} + +// Branches don't have a def operand but still consume resources. +def GRV32WriteB : SchedWriteRes<[GRV32UnitB]>; + +// Address generation. +def GRV32WriteAdr : SchedWriteRes<[GRV32UnitAGU]> { let NumMicroOps = 0; } + +// Load Integer. +def GRV32WriteL : SchedWriteRes<[GRV32UnitLS]> { let Latency = 3; } +def : SchedAlias; +// Load the upper 32-bits using the same micro-op. +def GRV32WriteLHi : SchedWriteRes<[]> { let Latency = 3; + let NumMicroOps = 0; } +// Offset shifted by register. +def GRV32WriteLsi : SchedWriteRes<[GRV32UnitLS]> { let Latency = 4; } +// Load (and zero extend) a byte. +def GRV32WriteLb : SchedWriteRes<[GRV32UnitLS]> { let Latency = 4; } +def GRV32WriteLbsi : SchedWriteRes<[GRV32UnitLS]> { let Latency = 5; } + +// Load or Store Float, aligned. 
+def GRV32WriteLSfp : SchedWriteRes<[GRV32UnitLS, GRV32UnitFP]> { let Latency = 1; } + +// Store Integer. +def GRV32WriteS : SchedWriteRes<[GRV32UnitLS]>; + +//===----------------------------------------------------------------------===// +// Define resources dynamically for load multiple variants. + +// Define helpers for extra latency without consuming resources. +def GRV32WriteCycle1 : SchedWriteRes<[]> { let Latency = 1; let NumMicroOps = 0; } +foreach NumCycles = 2-8 in { +def GRV32WriteCycle#NumCycles : WriteSequence<[GRV32WriteCycle1], NumCycles>; +} // foreach NumCycles + +// Define address generation sequences and predicates for 8 flavors of LDMs. +foreach NumAddr = 1-8 in { + +// Define GRV32WriteAdr1-8 as a sequence of GRV32WriteAdr with additive +// latency for instructions that generate multiple loads or stores. +def GRV32WriteAdr#NumAddr : WriteSequence<[GRV32WriteAdr], NumAddr>; + +// Define a predicate to select the LDM based on number of memory addresses. +def GRV32LMAdr#NumAddr#Pred : + SchedPredicate<"(TII->getNumLDMAddresses(*MI)+1)/2 == "#NumAddr>; + +} // foreach NumAddr + +// Fall-back for unknown LDMs. +def GRV32LMUnknownPred : SchedPredicate<"TII->getNumLDMAddresses(*MI) == 0">; + +// LDM/VLDM/VLDn address generation latency & resources. +// Dynamically select the GRV32WriteAdrN sequence using a predicate. +def GRV32WriteLMAdr : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDM/VLDM/VSTM, assume 2 32-bit registers. + SchedVar]>; + +// Define LDM Resources. +// These take no issue resource, so they can be combined with other +// writes like WriteB. +// GRV32WriteLMLo takes a single LS resource and 2 cycles. +def GRV32WriteLMLo : SchedWriteRes<[GRV32UnitLS]> { let Latency = 2; + let NumMicroOps = 0; } +// Assuming aligned access, the upper half of each pair is free with +// the same latency. 
+def GRV32WriteLMHi : SchedWriteRes<[]> { let Latency = 2; + let NumMicroOps = 0; } +// Each GRV32WriteL#N variant adds N cycles of latency without consuming +// additional resources. +foreach NumAddr = 1-8 in { +def GRV32WriteL#NumAddr : WriteSequence< + [GRV32WriteLMLo, !cast("GRV32WriteCycle"#NumAddr)]>; +def GRV32WriteL#NumAddr#Hi : WriteSequence< + [GRV32WriteLMHi, !cast("GRV32WriteCycle"#NumAddr)]>; +} + +//===----------------------------------------------------------------------===// +// LDM: Load multiple into 32-bit integer registers. + +def GRV32WriteLMOpsList : GRV32WriteLMOpsListType< + [GRV32WriteL1, GRV32WriteL1Hi, + GRV32WriteL2, GRV32WriteL2Hi, + GRV32WriteL3, GRV32WriteL3Hi, + GRV32WriteL4, GRV32WriteL4Hi, + GRV32WriteL5, GRV32WriteL5Hi, + GRV32WriteL6, GRV32WriteL6Hi, + GRV32WriteL7, GRV32WriteL7Hi, + GRV32WriteL8, GRV32WriteL8Hi]>; + +// GRV32WriteLM variants expand into a pair of writes for each 64-bit +// value loaded. When the number of registers is odd, the last +// GRV32WriteLnHi is naturally ignored because the instruction has no +// following def operands. These variants take no issue resource, so +// they may need to be part of a WriteSequence that includes GRV32WriteIssue. +def GRV32WriteLM : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDMs, define the maximum number of writes, but only + // make the first two consume resources. + SchedVar]> { + let Variadic = 1; +} + +//===----------------------------------------------------------------------===// +// VFP Load/Store Multiple Variants, and NEON VLDn/VSTn support. + +// GRV32WriteLfpOp is the same as GRV32WriteLSfp but takes no issue resources +// so can be used in WriteSequences for in single-issue instructions that +// encapsulate multiple loads. 
+def GRV32WriteLfpOp : SchedWriteRes<[GRV32UnitLS, GRV32UnitFP]> { + let Latency = 1; + let NumMicroOps = 0; +} + +foreach NumAddr = 1-8 in { + +// Helper for GRV32WriteLfp1-8: A sequence of fp loads with no micro-ops. +def GRV32WriteLfp#NumAddr#Seq : WriteSequence<[GRV32WriteLfpOp], NumAddr>; + +// GRV32WriteLfp1-8 definitions are statically expanded into a sequence of +// GRV32WriteLfpOps with additive latency that takes a single issue slot. +// Used directly to describe NEON VLDn. +def GRV32WriteLfp#NumAddr : WriteSequence< + [GRV32WriteIssue, !cast("GRV32WriteLfp"#NumAddr#Seq)]>; + +// GRV32WriteLfp1-8Mov adds a cycle of latency and FP resource for +// permuting loaded values. +def GRV32WriteLfp#NumAddr#Mov : WriteSequence< + [GRV32WriteF, !cast("GRV32WriteLfp"#NumAddr#Seq)]>; + +} // foreach NumAddr + +// Define VLDM/VSTM PreRA resources. +// GRV32WriteLMfpPreRA are dynamically expanded into the correct +// GRV32WriteLfp1-8 sequence based on a predicate. This supports the +// preRA VLDM variants in which all 64-bit loads are written to the +// same tuple of either single or double precision registers. +def GRV32WriteLMfpPreRA : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown VLDM/VSTM PreRA, assume 2xS registers. + SchedVar]>; + +// Define VLDM/VSTM PostRA Resources. +// GRV32WriteLMfpLo takes a LS and FP resource and one issue slot but no latency. +def GRV32WriteLMfpLo : SchedWriteRes<[GRV32UnitLS, GRV32UnitFP]> { let Latency = 0; } + +foreach NumAddr = 1-8 in { + +// Each GRV32WriteL#N variant adds N cycles of latency without consuming +// additional resources. +def GRV32WriteLMfp#NumAddr : WriteSequence< + [GRV32WriteLMfpLo, !cast("GRV32WriteCycle"#NumAddr)]>; + +// Assuming aligned access, the upper half of each pair is free with +// the same latency. 
+def GRV32WriteLMfp#NumAddr#Hi : WriteSequence< + [GRV32WriteLMHi, !cast("GRV32WriteCycle"#NumAddr)]>; + +} // foreach NumAddr + +// VLDM PostRA Variants. These variants expand GRV32WriteLMfpPostRA into a +// pair of writes for each 64-bit data loaded. When the number of +// registers is odd, the last WriteLMfpnHi is naturally ignored because +// the instruction has no following def operands. + +def GRV32WriteLMfpPostRAOpsList : GRV32WriteLMOpsListType< + [GRV32WriteLMfp1, GRV32WriteLMfp2, // 0-1 + GRV32WriteLMfp3, GRV32WriteLMfp4, // 2-3 + GRV32WriteLMfp5, GRV32WriteLMfp6, // 4-5 + GRV32WriteLMfp7, GRV32WriteLMfp8, // 6-7 + GRV32WriteLMfp1Hi, // 8-8 + GRV32WriteLMfp2Hi, GRV32WriteLMfp2Hi, // 9-10 + GRV32WriteLMfp3Hi, GRV32WriteLMfp3Hi, // 11-12 + GRV32WriteLMfp4Hi, GRV32WriteLMfp4Hi, // 13-14 + GRV32WriteLMfp5Hi, GRV32WriteLMfp5Hi, // 15-16 + GRV32WriteLMfp6Hi, GRV32WriteLMfp6Hi, // 17-18 + GRV32WriteLMfp7Hi, GRV32WriteLMfp7Hi, // 19-20 + GRV32WriteLMfp8Hi, GRV32WriteLMfp8Hi]>; // 21-22 + +def GRV32WriteLMfpPostRA : SchedWriteVariant<[ + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + SchedVar, + // For unknown LDMs, define the maximum number of writes, but only + // make the first two consume resources. We are optimizing for the case + // where the operands are DPRs, and this determines the first eight + // types. The remaining eight types are filled to cover the case + // where the operands are SPRs. + SchedVar]> { + let Variadic = 1; +} + +// Distinguish between our multiple MI-level forms of the same +// VLDM/VSTM instructions. +def GRV32PreRA : SchedPredicate< + "TargetRegisterInfo::isVirtualRegister(MI->getOperand(0).getReg())">; +def GRV32PostRA : SchedPredicate< + "TargetRegisterInfo::isPhysicalRegister(MI->getOperand(0).getReg())">; + +// VLDM represents all destination registers as a single register +// tuple, unlike LDM. So the number of write operands is not variadic. 
+def GRV32WriteLMfp : SchedWriteVariant<[ + SchedVar, + SchedVar]>; + +//===----------------------------------------------------------------------===// +// Resources for other (non-LDM/VLDM) Variants. + +// These mov immediate writers are unconditionally expanded with +// additive latency. +def GRV32WriteI2 : WriteSequence<[GRV32WriteI, GRV32WriteI]>; +def GRV32WriteI2pc : WriteSequence<[GRV32WriteI, GRV32WriteI, WriteALU]>; +def GRV32WriteI2ld : WriteSequence<[GRV32WriteI, GRV32WriteI, GRV32WriteL]>; + +// Some ALU operations can read loaded integer values one cycle early. +def GRV32ReadALU : SchedReadAdvance<1, + [GRV32WriteL, GRV32WriteLHi, GRV32WriteLsi, GRV32WriteLb, GRV32WriteLbsi, + GRV32WriteL1, GRV32WriteL2, GRV32WriteL3, GRV32WriteL4, + GRV32WriteL5, GRV32WriteL6, GRV32WriteL7, GRV32WriteL8, + GRV32WriteL1Hi, GRV32WriteL2Hi, GRV32WriteL3Hi, GRV32WriteL4Hi, + GRV32WriteL5Hi, GRV32WriteL6Hi, GRV32WriteL7Hi, GRV32WriteL8Hi]>; + +// Read types for operands that are unconditionally read in cycle N +// after the instruction issues, decreases producer latency by N-1. +def GRV32Read2 : SchedReadAdvance<1>; +def GRV32Read3 : SchedReadAdvance<2>; +def GRV32Read4 : SchedReadAdvance<3>; + +// Map SchedRWs that are identical for cortexa9 to existing resources. +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +// ===---------------------------------------------------------------------===// +// Floating-point. Map target defined SchedReadWrite to processor specific ones +// +def : WriteRes { let Latency = 4; } +def : SchedAlias; + +def : SchedAlias; +def : SchedAlias; + +def : SchedAlias; +def : SchedAlias; + +def : SchedAlias; +def : SchedAlias; + +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +def : ReadAdvance; +def : ReadAdvance; + +// ===---------------------------------------------------------------------===// +// Subtarget-specific overrides. 
Map opcodes to list of SchedReadWrite types. +// +def : SchedAlias; +def : SchedAlias; +def : SchedAlias; + +def : WriteRes { let Latency = 0; } + +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes; +def : WriteRes { let Latency = 0; let NumMicroOps = 0; } +} // SchedModel = GRV32Model Index: lib/Target/RISCV/RISCVSubtarget.h =================================================================== --- lib/Target/RISCV/RISCVSubtarget.h +++ lib/Target/RISCV/RISCVSubtarget.h @@ -20,12 +20,12 @@ #include "llvm/CodeGen/SelectionDAGTargetInfo.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" #include "llvm/IR/DataLayout.h" -#include "llvm/Target/TargetMachine.h" #define GET_SUBTARGETINFO_HEADER #include "RISCVGenSubtargetInfo.inc" namespace llvm { +class RISCVTargetMachine; class StringRef; class RISCVSubtarget : public RISCVGenSubtargetInfo { @@ -52,7 +52,7 @@ public: // Initializes the data members to match that of the specified triple. RISCVSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, const TargetMachine &TM); + const std::string &FS, const RISCVTargetMachine &TM); // Parses features string setting specified subtarget options. The // definition of this function is auto-generated by tblgen. 
@@ -71,6 +71,11 @@ const SelectionDAGTargetInfo *getSelectionDAGInfo() const override { return &TSInfo; } + const CallLowering *getCallLowering() const override; + const InstructionSelector *getInstructionSelector() const override; + const LegalizerInfo *getLegalizerInfo() const override; + const RegisterBankInfo *getRegBankInfo() const override; + bool hasStdExtM() const { return HasStdExtM; } bool hasStdExtA() const { return HasStdExtA; } bool hasStdExtF() const { return HasStdExtF; } @@ -79,6 +84,30 @@ bool is64Bit() const { return HasRV64; } MVT getXLenVT() const { return XLenVT; } unsigned getXLen() const { return XLen; } + + bool useMachineScheduler() const { return UseMISched; } + bool disablePostRAScheduler() const { return DisablePostRAScheduler; } + + /// Returns true if machine scheduler should be enabled. + bool enableMachineScheduler() const override; + + /// True for some subtargets at > -O0. + bool enablePostRAScheduler() const override; + +protected: + /// UseMISched - True if MachineScheduler should be used for this subtarget. + bool UseMISched = false; + + /// DisablePostRAScheduler - True if scheduling after register allocation + /// should be disabled for this subtarget. + bool DisablePostRAScheduler = false; + +private: + /// GlobalISel related APIs. 
+ std::unique_ptr CallLoweringInfo; + std::unique_ptr InstSelector; + std::unique_ptr Legalizer; + std::unique_ptr RegBankInfo; }; } // End llvm namespace Index: lib/Target/RISCV/RISCVSubtarget.cpp =================================================================== --- lib/Target/RISCV/RISCVSubtarget.cpp +++ lib/Target/RISCV/RISCVSubtarget.cpp @@ -11,9 +11,14 @@ // //===----------------------------------------------------------------------===// +#include "RISCVCallLowering.h" +#include "RISCVLegalizerInfo.h" +#include "RISCVRegisterBankInfo.h" #include "RISCVSubtarget.h" +#include "RISCVTargetMachine.h" #include "RISCV.h" #include "RISCVFrameLowering.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" #include "llvm/Support/TargetRegistry.h" using namespace llvm; @@ -42,7 +47,44 @@ } RISCVSubtarget::RISCVSubtarget(const Triple &TT, const std::string &CPU, - const std::string &FS, const TargetMachine &TM) + const std::string &FS, + const RISCVTargetMachine &TM) : RISCVGenSubtargetInfo(TT, CPU, FS), FrameLowering(initializeSubtargetDependencies(CPU, FS, TT.isArch64Bit())), - InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) {} + InstrInfo(), RegInfo(getHwMode()), TLInfo(TM, *this) { + CallLoweringInfo.reset(new RISCVCallLowering(*getTargetLowering())); + Legalizer.reset(new RISCVLegalizerInfo(*this, TM)); + + auto *RBI = new RISCVRegisterBankInfo(*getRegisterInfo()); + InstSelector.reset(createRISCVInstructionSelector( + *static_cast(&TM), *this, *RBI)); + + RegBankInfo.reset(RBI); +} + +const CallLowering *RISCVSubtarget::getCallLowering() const { + return CallLoweringInfo.get(); +} + +const InstructionSelector *RISCVSubtarget::getInstructionSelector() const { + return InstSelector.get(); +} + +const LegalizerInfo *RISCVSubtarget::getLegalizerInfo() const { + return Legalizer.get(); +} + +const RegisterBankInfo *RISCVSubtarget::getRegBankInfo() const { + return RegBankInfo.get(); +} + +bool RISCVSubtarget::enableMachineScheduler() const { + // Enable the 
MachineScheduler before register allocation for subtargets + // with the use-misched feature. + return useMachineScheduler(); +} + +// This overrides the PostRAScheduler bit in the SchedModel for any CPU. +bool RISCVSubtarget::enablePostRAScheduler() const { + return !disablePostRAScheduler(); +} Index: lib/Target/RISCV/RISCVTargetMachine.cpp =================================================================== --- lib/Target/RISCV/RISCVTargetMachine.cpp +++ lib/Target/RISCV/RISCVTargetMachine.cpp @@ -14,6 +14,14 @@ #include "RISCV.h" #include "RISCVTargetMachine.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/CodeGen/GlobalISel/CallLowering.h" +#include "llvm/CodeGen/GlobalISel/IRTranslator.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelect.h" +#include "llvm/CodeGen/GlobalISel/InstructionSelector.h" +#include "llvm/CodeGen/GlobalISel/Legalizer.h" +#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h" +#include "llvm/CodeGen/GlobalISel/RegBankSelect.h" +#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h" #include "llvm/CodeGen/Passes.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/TargetPassConfig.h" @@ -26,6 +34,9 @@ extern "C" void LLVMInitializeRISCVTarget() { RegisterTargetMachine X(getTheRISCV32Target()); RegisterTargetMachine Y(getTheRISCV64Target()); + + PassRegistry &Registry = *PassRegistry::getPassRegistry(); + initializeGlobalISel(Registry); } static std::string computeDataLayout(const Triple &TT) { @@ -68,13 +79,24 @@ class RISCVPassConfig : public TargetPassConfig { public: RISCVPassConfig(RISCVTargetMachine &TM, PassManagerBase &PM) - : TargetPassConfig(TM, PM) {} + : TargetPassConfig(TM, PM) { + if (TM.getOptLevel() != CodeGenOpt::None) { + RISCVGenSubtargetInfo STI(TM.getTargetTriple(), TM.getTargetCPU(), + TM.getTargetFeatureString()); + if (STI.hasFeature(RISCV::FeatureUseMISched)) + substitutePass(&PostRASchedulerID, &PostMachineSchedulerID); + } + } RISCVTargetMachine &getRISCVTargetMachine() const { return 
getTM(); } + bool addGlobalInstructionSelect() override; bool addInstSelector() override; + bool addIRTranslator() override; + bool addLegalizeMachineIR() override; + bool addRegBankSelect() override; void addPreEmitPass() override; }; } @@ -83,10 +105,30 @@ return new RISCVPassConfig(*this, PM); } +bool RISCVPassConfig::addGlobalInstructionSelect() { + addPass(new InstructionSelect()); + return false; +} + bool RISCVPassConfig::addInstSelector() { addPass(createRISCVISelDag(getRISCVTargetMachine())); return false; } +bool RISCVPassConfig::addIRTranslator() { + addPass(new IRTranslator()); + return false; +} + +bool RISCVPassConfig::addLegalizeMachineIR() { + addPass(new Legalizer()); + return false; +} + +bool RISCVPassConfig::addRegBankSelect() { + addPass(new RegBankSelect()); + return false; +} + void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }