Index: llvm/trunk/lib/Target/ARM/ARM.td
===================================================================
--- llvm/trunk/lib/Target/ARM/ARM.td
+++ llvm/trunk/lib/Target/ARM/ARM.td
@@ -144,6 +144,17 @@
                                         "true",
                                         "Use NEON for single precision FP">;
 
+// On some processors, VLDn instructions that access unaligned data take one
+// extra cycle. Take that into account when computing operand latencies.
+def FeatureCheckVLDnAlign : SubtargetFeature<"vldn-align", "CheckVLDnAlign",
+                                             "true",
+                                             "Check for VLDn unaligned access">;
+
+// Some processors have a nonpipelined VFP coprocessor.
+def FeatureNonpipelinedVFP : SubtargetFeature<"nonpipelined-vfp",
+                                              "NonpipelinedVFP", "true",
+                                              "VFP instructions are not pipelined">;
+
 // Some processors have FP multiply-accumulate instructions that don't
 // play nicely with other VFP / NEON instructions, and it's generally better
 // to just not use them.
@@ -552,6 +563,7 @@
 def : ProcessorModel<"cortex-a8",   CortexA8Model,  [ARMv7a, ProcA8,
                                                      FeatureHasRetAddrStack,
+                                                     FeatureNonpipelinedVFP,
                                                      FeatureTrustZone,
                                                      FeatureSlowFPBrcc,
                                                      FeatureHasSlowFPVMLx,
@@ -567,6 +579,7 @@
                                                      FeatureAvoidPartialCPSR,
                                                      FeaturePreferVMOVSR,
                                                      FeatureNEONForFPMovs,
+                                                     FeatureCheckVLDnAlign,
                                                      FeatureMP]>;
 
 // FIXME: A12 has currently the same Schedule model as A9
@@ -589,6 +602,7 @@
                                                      FeatureT2XtPk,
                                                      FeatureVFP4,
                                                      FeatureMP,
+                                                     FeatureCheckVLDnAlign,
                                                      FeatureHWDiv,
                                                      FeatureHWDivARM,
                                                      FeatureAvoidPartialCPSR,
@@ -612,6 +626,7 @@
 // division features.
 def : ProcessorModel<"krait",       CortexA9Model,  [ARMv7a, ProcKrait,
                                                      FeatureHasRetAddrStack,
+                                                     FeatureCheckVLDnAlign,
                                                      FeatureVMLxForwarding,
                                                      FeatureT2XtPk,
                                                      FeatureFP16,
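An editorial note on the two new features, before the code that consumes them: FeatureCheckVLDnAlign gates a latency adjustment for VLDn instructions, applied in the ARMBaseInstrInfo.cpp hunks below. The sketch condenses the idea; adjustVLDnLatencyForAlignment is a hypothetical name, not a helper in the patch, and the real checks live in the getOperandLatency paths.

static int adjustVLDnLatencyForAlignment(int Latency, unsigned DefAlign,
                                         bool CheckVLDnAlign) {
  // Hypothetical helper (not in the patch). Per the feature description in
  // ARM.td, a VLDn access that is less than 64-bit aligned costs one extra
  // cycle on cores that set CheckVLDnAlign.
  if (CheckVLDnAlign && DefAlign < 8)
    return Latency + 1;
  return Latency;
}

Since both additions are ordinary subtarget features, they can also be toggled independently of -mcpu for testing, e.g. with llc -mattr=+vldn-align or -mattr=+nonpipelined-vfp.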
Index: llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ llvm/trunk/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -3024,6 +3024,45 @@
   return Size / 4;
 }
 
+static unsigned getNumMicroOpsSingleIssuePlusExtras(unsigned Opc,
+                                                    unsigned NumRegs) {
+  unsigned UOps = 1 + NumRegs; // 1 for address computation.
+  switch (Opc) {
+  default:
+    break;
+  case ARM::VLDMDIA_UPD:
+  case ARM::VLDMDDB_UPD:
+  case ARM::VLDMSIA_UPD:
+  case ARM::VLDMSDB_UPD:
+  case ARM::VSTMDIA_UPD:
+  case ARM::VSTMDDB_UPD:
+  case ARM::VSTMSIA_UPD:
+  case ARM::VSTMSDB_UPD:
+  case ARM::LDMIA_UPD:
+  case ARM::LDMDA_UPD:
+  case ARM::LDMDB_UPD:
+  case ARM::LDMIB_UPD:
+  case ARM::STMIA_UPD:
+  case ARM::STMDA_UPD:
+  case ARM::STMDB_UPD:
+  case ARM::STMIB_UPD:
+  case ARM::tLDMIA_UPD:
+  case ARM::tSTMIA_UPD:
+  case ARM::t2LDMIA_UPD:
+  case ARM::t2LDMDB_UPD:
+  case ARM::t2STMIA_UPD:
+  case ARM::t2STMDB_UPD:
+    ++UOps; // One for base register writeback.
+    break;
+  case ARM::LDMIA_RET:
+  case ARM::tPOP_RET:
+  case ARM::t2LDMIA_RET:
+    UOps += 2; // One for base reg wb, one for write to pc.
+    break;
+  }
+  return UOps;
+}
+
 unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
                                           const MachineInstr *MI) const {
@@ -3107,65 +3146,35 @@
   case ARM::t2STMIA_UPD:
   case ARM::t2STMDB_UPD: {
     unsigned NumRegs = MI->getNumOperands() - Desc.getNumOperands() + 1;
-    if (Subtarget.isSwift()) {
-      int UOps = 1 + NumRegs; // One for address computation, one for each ld / st.
-      switch (Opc) {
-      default: break;
-      case ARM::VLDMDIA_UPD:
-      case ARM::VLDMDDB_UPD:
-      case ARM::VLDMSIA_UPD:
-      case ARM::VLDMSDB_UPD:
-      case ARM::VSTMDIA_UPD:
-      case ARM::VSTMDDB_UPD:
-      case ARM::VSTMSIA_UPD:
-      case ARM::VSTMSDB_UPD:
-      case ARM::LDMIA_UPD:
-      case ARM::LDMDA_UPD:
-      case ARM::LDMDB_UPD:
-      case ARM::LDMIB_UPD:
-      case ARM::STMIA_UPD:
-      case ARM::STMDA_UPD:
-      case ARM::STMDB_UPD:
-      case ARM::STMIB_UPD:
-      case ARM::tLDMIA_UPD:
-      case ARM::tSTMIA_UPD:
-      case ARM::t2LDMIA_UPD:
-      case ARM::t2LDMDB_UPD:
-      case ARM::t2STMIA_UPD:
-      case ARM::t2STMDB_UPD:
-        ++UOps; // One for base register writeback.
-        break;
-      case ARM::LDMIA_RET:
-      case ARM::tPOP_RET:
-      case ARM::t2LDMIA_RET:
-        UOps += 2; // One for base reg wb, one for write to pc.
-        break;
-      }
-      return UOps;
-    } else if (Subtarget.isCortexA8() || Subtarget.isCortexA7()) {
+    switch (Subtarget.getLdStMultipleTiming()) {
+    case ARMSubtarget::SingleIssuePlusExtras:
+      return getNumMicroOpsSingleIssuePlusExtras(Opc, NumRegs);
+    case ARMSubtarget::SingleIssue:
+      // Assume the worst.
+      return NumRegs;
+    case ARMSubtarget::DoubleIssue: {
       if (NumRegs < 4)
         return 2;
       // 4 registers would be issued: 2, 2.
       // 5 registers would be issued: 2, 2, 1.
-      int A8UOps = (NumRegs / 2);
+      unsigned UOps = (NumRegs / 2);
       if (NumRegs % 2)
-        ++A8UOps;
-      return A8UOps;
-    } else if (Subtarget.isLikeA9()) {
-      int A9UOps = (NumRegs / 2);
+        ++UOps;
+      return UOps;
+    }
+    case ARMSubtarget::DoubleIssueCheckUnalignedAccess: {
+      unsigned UOps = (NumRegs / 2);
       // If there are odd number of registers or if it's not 64-bit aligned,
       // then it takes an extra AGU (Address Generation Unit) cycle.
-      if ((NumRegs % 2) ||
-          !MI->hasOneMemOperand() ||
+      if ((NumRegs % 2) || !MI->hasOneMemOperand() ||
           (*MI->memoperands_begin())->getAlignment() < 8)
-        ++A9UOps;
-      return A9UOps;
-    } else {
-      // Assume the worst.
-      return NumRegs;
+        ++UOps;
+      return UOps;
+    }
+    }
   }
   }
+  llvm_unreachable("Didn't find the number of microops");
 }
 
 int
@@ -3542,7 +3551,7 @@
     }
   }
 
-  if (DefAlign < 8 && Subtarget.isLikeA9()) {
+  if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment()) {
     switch (DefMCID->getOpcode()) {
     default: break;
     case ARM::VLD1q8:
@@ -3767,10 +3776,9 @@
   if (!UseNode->isMachineOpcode()) {
     int Latency = ItinData->getOperandCycle(DefMCID.getSchedClass(),
                                             DefIdx);
-    if (Subtarget.isLikeA9() || Subtarget.isSwift())
-      return Latency <= 2 ? 1 : Latency - 1;
-    else
-      return Latency <= 3 ? 1 : Latency - 2;
+    int Adj = Subtarget.getPreISelOperandLatencyAdjustment();
+    int Threshold = 1 + Adj;
+    return Latency <= Threshold ? 1 : Latency - Adj;
   }
 
   const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
@@ -3841,7 +3849,7 @@
     }
   }
 
-  if (DefAlign < 8 && Subtarget.isLikeA9())
+  if (DefAlign < 8 && Subtarget.checkVLDnAccessAlignment())
     switch (DefMCID.getOpcode()) {
     default: break;
     case ARM::VLD1q8:
@@ -4060,9 +4068,8 @@
                                         const MachineInstr *UseMI,
                                         unsigned UseIdx) const {
   unsigned DDomain = DefMI->getDesc().TSFlags & ARMII::DomainMask;
   unsigned UDomain = UseMI->getDesc().TSFlags & ARMII::DomainMask;
-  if (Subtarget.isCortexA8() &&
+  if (Subtarget.nonpipelinedVFP() &&
      (DDomain == ARMII::DomainVFP || UDomain == ARMII::DomainVFP))
-    // CortexA8 VFP instructions are not pipelined.
    return true;

  // Hoist VFP / NEON instructions with 4 or higher latency.
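The getOperandLatency change above folds two hard-coded adjustments into one expression. Here is a standalone check (not part of the patch) that the rewrite preserves both old branches; per the ARMSubtarget.cpp hunk later in this patch, Adj is 1 on Cortex-A9, Cortex-A15, Krait and Swift (matching the set previously selected by isLikeA9() || isSwift()) and 2 everywhere else:

#include <cassert>

// The rewritten expression from getOperandLatency.
static int adjustedLatency(int Latency, int Adj) {
  int Threshold = 1 + Adj;
  return Latency <= Threshold ? 1 : Latency - Adj;
}

int main() {
  for (int Latency = 0; Latency <= 16; ++Latency) {
    // Adj == 1 reproduces the old "Latency <= 2 ? 1 : Latency - 1".
    assert(adjustedLatency(Latency, 1) == (Latency <= 2 ? 1 : Latency - 1));
    // Adj == 2 reproduces the old "Latency <= 3 ? 1 : Latency - 2".
    assert(adjustedLatency(Latency, 2) == (Latency <= 3 ? 1 : Latency - 2));
  }
  return 0;
}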
Index: llvm/trunk/lib/Target/ARM/ARMSubtarget.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.h
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.h
@@ -56,6 +56,22 @@
     ARMv7m, ARMv7em, ARMv8a, ARMv81a, ARMv82a, ARMv8mMainline, ARMv8mBaseline
   };
 
+public:
+  /// What kind of timing do load multiple/store multiple instructions have.
+  enum ARMLdStMultipleTiming {
+    /// Can load/store 2 registers/cycle.
+    DoubleIssue,
+    /// Can load/store 2 registers/cycle, but needs an extra cycle if the access
+    /// is not 64-bit aligned.
+    DoubleIssueCheckUnalignedAccess,
+    /// Can load/store 1 register/cycle.
+    SingleIssue,
+    /// Can load/store 1 register/cycle, but needs an extra cycle for address
+    /// computation and potentially also for register writeback.
+    SingleIssuePlusExtras,
+  };
+
+protected:
   /// ARMProcFamily - ARM processor family: Cortex-A8, Cortex-A9, and others.
   ARMProcFamilyEnum ARMProcFamily;
@@ -236,6 +252,12 @@
   /// If true, VMOVRS, VMOVSR and VMOVS will be converted from VFP to NEON.
   bool UseNEONForFPMovs;
 
+  /// If true, VLDn instructions take an extra cycle for unaligned accesses.
+  bool CheckVLDnAlign;
+
+  /// If true, VFP instructions are not pipelined.
+  bool NonpipelinedVFP;
+
   /// StrictAlign - If true, the subtarget disallows unaligned memory
   /// accesses for some types. For details, see
   /// ARMTargetLowering::allowsMisalignedMemoryAccesses().
@@ -268,6 +290,16 @@
   /// CPUString - String name of used CPU.
   std::string CPUString;
 
+  unsigned MaxInterleaveFactor;
+
+  /// What kind of timing do load multiple/store multiple have (double issue,
+  /// single issue etc).
+  ARMLdStMultipleTiming LdStMultipleTiming;
+
+  /// The adjustment that we need to apply to get the operand latency from the
+  /// operand cycle returned by the itinerary data for pre-ISel operands.
+  int PreISelOperandLatencyAdjustment;
+
   /// IsLittle - The target is Little Endian
   bool IsLittle;
@@ -400,6 +432,8 @@
   bool preferVMOVSR() const { return PreferVMOVSR; }
   bool preferISHSTBarriers() const { return PreferISHST; }
   bool useNEONForFPMovs() const { return UseNEONForFPMovs; }
+  bool checkVLDnAccessAlignment() const { return CheckVLDnAlign; }
+  bool nonpipelinedVFP() const { return NonpipelinedVFP; }
   bool prefers32BitThumb() const { return Pref32BitThumb; }
   bool avoidCPSRPartialUpdate() const { return AvoidCPSRPartialUpdate; }
   bool avoidMOVsShifterOperand() const { return AvoidMOVsShifterOperand; }
@@ -538,6 +572,16 @@
   /// function for this subtarget.
   unsigned getStackAlignment() const { return stackAlignment; }
 
+  unsigned getMaxInterleaveFactor() const { return MaxInterleaveFactor; }
+
+  ARMLdStMultipleTiming getLdStMultipleTiming() const {
+    return LdStMultipleTiming;
+  }
+
+  int getPreISelOperandLatencyAdjustment() const {
+    return PreISelOperandLatencyAdjustment;
+  }
+
   /// GVIsIndirectSymbol - true if the GV will be accessed via an indirect
   /// symbol.
   bool GVIsIndirectSymbol(const GlobalValue *GV, Reloc::Model RelocM) const;
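A standalone sketch (again, not code from the patch) of what each ARMLdStMultipleTiming value means for a load/store multiple of NumRegs registers. It mirrors the formulas in ARMBaseInstrInfo::getNumMicroOps; the opcode-specific writeback and write-to-pc cases are abstracted into a WritebackUOps parameter here:

#include <cassert>

// Local mirror of ARMSubtarget::ARMLdStMultipleTiming, for illustration only.
enum LdStMultipleTiming {
  DoubleIssue,                     // 2 registers per cycle
  DoubleIssueCheckUnalignedAccess, // 2 per cycle, +1 cycle if unaligned
  SingleIssue,                     // 1 register per cycle
  SingleIssuePlusExtras            // 1 per cycle, plus bookkeeping uops
};

// Aligned64: the instruction's single memory operand is 64-bit aligned.
// WritebackUOps: 1 for base-register-writeback opcodes, 2 for the
// writeback-plus-write-to-pc opcodes (LDMIA_RET and friends), else 0.
static unsigned expectedUOps(LdStMultipleTiming T, unsigned NumRegs,
                             bool Aligned64, unsigned WritebackUOps) {
  switch (T) {
  case DoubleIssue:
    if (NumRegs < 4)
      return 2;
    return NumRegs / 2 + NumRegs % 2; // e.g. 5 registers issue as 2, 2, 1
  case DoubleIssueCheckUnalignedAccess:
    // An odd register count or an unaligned access takes an extra cycle
    // in the AGU (Address Generation Unit).
    return NumRegs / 2 + ((NumRegs % 2 || !Aligned64) ? 1 : 0);
  case SingleIssue:
    return NumRegs;
  case SingleIssuePlusExtras:
    return 1 + NumRegs + WritebackUOps; // 1 for address computation
  }
  return NumRegs;
}

int main() {
  assert(expectedUOps(DoubleIssue, 5, true, 0) == 3); // issued as 2, 2, 1
  assert(expectedUOps(DoubleIssueCheckUnalignedAccess, 4, false, 0) == 3);
  assert(expectedUOps(SingleIssuePlusExtras, 4, true, 1) == 6);
  return 0;
}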
Index: llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp
+++ llvm/trunk/lib/Target/ARM/ARMSubtarget.cpp
@@ -109,10 +109,13 @@
       Has8MSecExt(false), HasCrypto(false), HasCRC(false), HasRAS(false),
       HasZeroCycleZeroing(false), IsProfitableToUnpredicate(false),
       HasSlowVGETLNi32(false), HasSlowVDUP32(false), PreferVMOVSR(false),
-      PreferISHST(false), UseNEONForFPMovs(false), StrictAlign(false),
-      RestrictIT(false), HasDSP(false), UseNaClTrap(false), GenLongCalls(false),
+      PreferISHST(false), UseNEONForFPMovs(false), CheckVLDnAlign(false),
+      NonpipelinedVFP(false), StrictAlign(false), RestrictIT(false),
+      HasDSP(false), UseNaClTrap(false), GenLongCalls(false),
       UnsafeFPMath(false), UseSjLjEH(false), stackAlignment(4), CPUString(CPU),
-      IsLittle(IsLittle), TargetTriple(TT), Options(TM.Options), TM(TM),
+      MaxInterleaveFactor(1), LdStMultipleTiming(SingleIssue),
+      PreISelOperandLatencyAdjustment(2), IsLittle(IsLittle), TargetTriple(TT),
+      Options(TM.Options), TM(TM),
       FrameLowering(initializeFrameLowering(CPU, FS)),
       // At this point initializeSubtargetDependencies has been called so
       // we can query directly.
@@ -221,6 +224,51 @@
   if ((Bits[ARM::ProcA5] || Bits[ARM::ProcA8]) && // Where this matters
       (Options.UnsafeFPMath || isTargetDarwin()))
     UseNEONForSinglePrecisionFP = true;
+
+  // FIXME: Teach TableGen to deal with these instead of doing it manually here.
+  switch (ARMProcFamily) {
+  case Others:
+  case CortexA5:
+    break;
+  case CortexA7:
+    LdStMultipleTiming = DoubleIssue;
+    break;
+  case CortexA8:
+    LdStMultipleTiming = DoubleIssue;
+    break;
+  case CortexA9:
+    LdStMultipleTiming = DoubleIssueCheckUnalignedAccess;
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  case CortexA12:
+    break;
+  case CortexA15:
+    MaxInterleaveFactor = 2;
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  case CortexA17:
+  case CortexA32:
+  case CortexA35:
+  case CortexA53:
+  case CortexA57:
+  case CortexA72:
+  case CortexA73:
+  case CortexR4:
+  case CortexR4F:
+  case CortexR5:
+  case CortexR7:
+  case CortexM3:
+  case ExynosM1:
+    break;
+  case Krait:
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  case Swift:
+    MaxInterleaveFactor = 2;
+    LdStMultipleTiming = SingleIssuePlusExtras;
+    PreISelOperandLatencyAdjustment = 1;
+    break;
+  }
 }
 
 bool ARMSubtarget::isAPCS_ABI() const {
Index: llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/trunk/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -97,10 +97,7 @@
   }
 
   unsigned getMaxInterleaveFactor(unsigned VF) {
-    // These are out of order CPUs:
-    if (ST->isCortexA15() || ST->isSwift())
-      return 2;
-    return 1;
+    return ST->getMaxInterleaveFactor();
   }
 
   int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
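For reference, the per-family tuning values produced by the switch in ARMSubtarget.cpp, collected into one table (a summary, not code from the patch). Families not listed keep the constructor defaults of MaxInterleaveFactor = 1, SingleIssue timing, and a pre-ISel adjustment of 2:

// Summary of the per-family settings above; illustrative only.
struct Tuning {
  const char *Family;
  unsigned MaxInterleave;
  const char *LdStTiming;
  int PreISelAdjustment;
};

static const Tuning PerFamilyTuning[] = {
    {"Cortex-A7",  1, "DoubleIssue",                     2},
    {"Cortex-A8",  1, "DoubleIssue",                     2},
    {"Cortex-A9",  1, "DoubleIssueCheckUnalignedAccess", 1},
    {"Cortex-A15", 2, "SingleIssue (default)",           1},
    {"Krait",      1, "SingleIssue (default)",           1},
    {"Swift",      2, "SingleIssuePlusExtras",           1},
};

The table also shows why the ARMTargetTransformInfo.h change is behavior-preserving: only Cortex-A15 and Swift report a maximum interleave factor of 2, exactly the two CPUs the old isCortexA15() || isSwift() check singled out.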