diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h --- a/llvm/lib/Target/ARM/ARM.h +++ b/llvm/lib/Target/ARM/ARM.h @@ -58,6 +58,7 @@ Pass *createMVEGatherScatterLoweringPass(); FunctionPass *createARMSLSHardeningPass(); FunctionPass *createARMIndirectThunks(); +FunctionPass *createARMSubregWrite(); void LowerARMMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI, ARMAsmPrinter &AP); @@ -76,7 +77,7 @@ void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); void initializeARMSLSHardeningPass(PassRegistry &); - +void initializeARMSubregWritePass(PassRegistry &); } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARM_H diff --git a/llvm/lib/Target/ARM/ARMSubregWrite.cpp b/llvm/lib/Target/ARM/ARMSubregWrite.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/ARM/ARMSubregWrite.cpp @@ -0,0 +1,742 @@ +//===-------------------------- ARMSubregWrite.cpp ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// ARMSubregWrite pass attempts to mitigate a pipeline stall when an +// FP/ASIMD uop reads a D- or Q-register that has recently been +// written with one or more S-register results. +// ===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMInstrInfo.h" +#include "ARMMachineFunctionInfo.h" +#include "MCTargetDesc/ARMAddressingModes.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineInstrBuilder.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/raw_ostream.h" + +using namespace llvm; + +#define DEBUG_TYPE "arm-subreg-write" +STATISTIC(NumHazardsFound, "Number of S-register forwarding hazards found"); +STATISTIC(NumHazardsNotHandled, + "Number of S-register forwarding hazards not handled"); + +namespace { +class ARMSubregWrite : public MachineFunctionPass { + const TargetInstrInfo *TII; + +public: + static char ID; + explicit ARMSubregWrite() : MachineFunctionPass(ID) { + initializeARMSubregWritePass(*PassRegistry::getPassRegistry()); + } + + bool runOnMachineFunction(MachineFunction &F) override; + + StringRef getPassName() const override { + return "Fix incorrect subregister writes"; + } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + +private: + bool runOnBasicBlock(MachineBasicBlock &MBB, MachineRegisterInfo &MRI); +}; +char ARMSubregWrite::ID = 0; + +} // namespace + +INITIALIZE_PASS(ARMSubregWrite, "arm-subreg-writes-pass", + "ARM subregister writes optimization", false, false) + +bool ARMSubregWrite::runOnMachineFunction(MachineFunction &F) { + LLVM_DEBUG(dbgs() << "***** ARMSubregWrite *****\n"); + bool Changed = false; + TII = F.getSubtarget().getInstrInfo(); + MachineRegisterInfo &MRI = F.getRegInfo(); + + for (auto &MBB : F) { + Changed |= runOnBasicBlock(MBB, MRI); + } + return Changed; +} + +static bool isFPASIMDInstr(const MachineInstr &MI) { + switch (MI.getOpcode()) { + case ARM::VMLAslfd: + case 
ARM::VMLAslfq: + case ARM::VMLSslfd: + case ARM::VMLSslfq: + case ARM::VMULslfd: + case ARM::VMULslfq: + case ARM::NEON_VMAXNMNDf: + case ARM::NEON_VMAXNMNQf: + case ARM::NEON_VMINNMNDf: + case ARM::NEON_VMINNMNQf: + case ARM::VABDfd: + case ARM::VABDfq: + case ARM::VABSfd: + case ARM::VABSfq: + case ARM::VACGEfd: + case ARM::VACGEfq: + case ARM::VACGTfd: + case ARM::VACGTfq: + case ARM::VADDfd: + case ARM::VADDfq: + case ARM::VCEQfd: + case ARM::VCEQfq: + case ARM::VCGEfd: + case ARM::VCGEfq: + case ARM::VCGTfd: + case ARM::VCGTfq: + case ARM::VFMAfd: + case ARM::VFMAfq: + case ARM::VFMSfd: + case ARM::VFMSfq: + case ARM::VMAXfd: + case ARM::VMAXfq: + case ARM::VMINfd: + case ARM::VMINfq: + case ARM::VMLAfd: + case ARM::VMLAfq: + case ARM::VMLSfd: + case ARM::VMLSfq: + case ARM::VMULfd: + case ARM::VMULfq: + case ARM::VNEGfd: + case ARM::VNEGf32q: + case ARM::VRECPEfd: + case ARM::VRECPEfq: + case ARM::VRECPSfd: + case ARM::VRECPSfq: + case ARM::VRSQRTEfd: + case ARM::VRSQRTEfq: + case ARM::VRSQRTSfd: + case ARM::VRSQRTSfq: + case ARM::VSUBfd: + case ARM::VSUBfq: + case ARM::VEXTd32: + case ARM::VEXTq32: + case ARM::VORNd: + case ARM::VORNq: + case ARM::VORRd: + case ARM::VORRq: + case ARM::VANDd: + case ARM::VANDq: + case ARM::VEORd: + case ARM::VEORq: + case ARM::VBICd: + case ARM::VBICq: + case ARM::VBIFd: + case ARM::VBIFq: + case ARM::VBITd: + case ARM::VBITq: + case ARM::VBSLd: + case ARM::VBSLq: + case ARM::VBSPd: + case ARM::VBSPq: + case ARM::VCNTd: + case ARM::VCNTq: + case ARM::VMVNd: + case ARM::VMVNq: + case ARM::VABALsv4i32: + case ARM::VABALuv4i32: + case ARM::VABAsv2i32: + case ARM::VABAsv4i32: + case ARM::VABAuv2i32: + case ARM::VABAuv4i32: + case ARM::VABDLsv4i32: + case ARM::VABDLuv4i32: + case ARM::VABDsv2i32: + case ARM::VABDsv4i32: + case ARM::VABDuv2i32: + case ARM::VABDuv4i32: + case ARM::VABSv2i32: + case ARM::VABSv4i32: + case ARM::VADDHNv2i32: + case ARM::VADDLsv4i32: + case ARM::VADDLuv4i32: + case ARM::VADDWsv4i32: + case ARM::VADDWuv4i32: + case ARM::VADDv2i32: + case ARM::VADDv4i32: + case ARM::VBICiv2i32: + case ARM::VBICiv4i32: + case ARM::VCADDv2f32: + case ARM::VCADDv4f32: + case ARM::VCEQv2i32: + case ARM::VCEQv4i32: + case ARM::VCEQzv2f32: + case ARM::VCEQzv2i32: + case ARM::VCEQzv4f32: + case ARM::VCEQzv4i32: + case ARM::VCGEsv2i32: + case ARM::VCGEsv4i32: + case ARM::VCGEuv2i32: + case ARM::VCGEuv4i32: + case ARM::VCGEzv2f32: + case ARM::VCGEzv2i32: + case ARM::VCGEzv4f32: + case ARM::VCGEzv4i32: + case ARM::VCGTsv2i32: + case ARM::VCGTsv4i32: + case ARM::VCGTuv2i32: + case ARM::VCGTuv4i32: + case ARM::VCGTzv2f32: + case ARM::VCGTzv2i32: + case ARM::VCGTzv4f32: + case ARM::VCGTzv4i32: + case ARM::VCLEzv2f32: + case ARM::VCLEzv2i32: + case ARM::VCLEzv4f32: + case ARM::VCLEzv4i32: + case ARM::VCLSv2i32: + case ARM::VCLSv4i32: + case ARM::VCLTzv2f32: + case ARM::VCLTzv2i32: + case ARM::VCLTzv4f32: + case ARM::VCLTzv4i32: + case ARM::VCLZv2i32: + case ARM::VCLZv4i32: + case ARM::VCMLAv2f32: + case ARM::VCMLAv2f32_indexed: + case ARM::VCMLAv4f32: + case ARM::VCMLAv4f32_indexed: + case ARM::VHADDsv2i32: + case ARM::VHADDsv4i32: + case ARM::VHADDuv2i32: + case ARM::VHADDuv4i32: + case ARM::VHSUBsv2i32: + case ARM::VHSUBsv4i32: + case ARM::VHSUBuv2i32: + case ARM::VHSUBuv4i32: + case ARM::VMAXsv2i32: + case ARM::VMAXsv4i32: + case ARM::VMAXuv2i32: + case ARM::VMAXuv4i32: + case ARM::VMINsv2i32: + case ARM::VMINsv4i32: + case ARM::VMINuv2i32: + case ARM::VMINuv4i32: + case ARM::VMLALslsv2i32: + case ARM::VMLALsluv2i32: + case ARM::VMLALsv4i32: + case 
ARM::VMLALuv4i32: + case ARM::VMLAslv2i32: + case ARM::VMLAslv4i32: + case ARM::VMLAv2i32: + case ARM::VMLAv4i32: + case ARM::VMLSLslsv2i32: + case ARM::VMLSLsluv2i32: + case ARM::VMLSLsv4i32: + case ARM::VMLSLuv4i32: + case ARM::VMLSslv2i32: + case ARM::VMLSslv4i32: + case ARM::VMLSv2i32: + case ARM::VMLSv4i32: + case ARM::VMOVLsv4i32: + case ARM::VMOVLuv4i32: + case ARM::VMOVNv2i32: + case ARM::VMOVv2f32: + case ARM::VMOVv2i32: + case ARM::VMOVv4f32: + case ARM::VMOVv4i32: + case ARM::VMULLslsv2i32: + case ARM::VMULLsluv2i32: + case ARM::VMULLsv4i32: + case ARM::VMULLuv4i32: + case ARM::VMULslv2i32: + case ARM::VMULslv4i32: + case ARM::VMULv2i32: + case ARM::VMULv4i32: + case ARM::VMVNv2i32: + case ARM::VMVNv4i32: + case ARM::VORRiv2i32: + case ARM::VORRiv4i32: + case ARM::VPADALsv2i32: + case ARM::VPADALsv4i32: + case ARM::VPADALuv2i32: + case ARM::VPADALuv4i32: + case ARM::VPADDLsv2i32: + case ARM::VPADDLsv4i32: + case ARM::VPADDLuv2i32: + case ARM::VPADDLuv4i32: + case ARM::VQABSv2i32: + case ARM::VQABSv4i32: + case ARM::VQADDsv2i32: + case ARM::VQADDsv4i32: + case ARM::VQADDuv2i32: + case ARM::VQADDuv4i32: + case ARM::VQDMLALslv2i32: + case ARM::VQDMLALv4i32: + case ARM::VQDMLSLslv2i32: + case ARM::VQDMLSLv4i32: + case ARM::VQDMULHslv2i32: + case ARM::VQDMULHslv4i32: + case ARM::VQDMULHv2i32: + case ARM::VQDMULHv4i32: + case ARM::VQDMULLslv2i32: + case ARM::VQDMULLv4i32: + case ARM::VQMOVNsuv2i32: + case ARM::VQMOVNsv2i32: + case ARM::VQMOVNuv2i32: + case ARM::VQNEGv2i32: + case ARM::VQNEGv4i32: + case ARM::VQRDMLAHslv2i32: + case ARM::VQRDMLAHslv4i32: + case ARM::VQRDMLAHv2i32: + case ARM::VQRDMLAHv4i32: + case ARM::VQRDMLSHslv2i32: + case ARM::VQRDMLSHslv4i32: + case ARM::VQRDMLSHv2i32: + case ARM::VQRDMLSHv4i32: + case ARM::VQRDMULHslv2i32: + case ARM::VQRDMULHslv4i32: + case ARM::VQRDMULHv2i32: + case ARM::VQRDMULHv4i32: + case ARM::VQRSHLsv2i32: + case ARM::VQRSHLsv4i32: + case ARM::VQRSHLuv2i32: + case ARM::VQRSHLuv4i32: + case ARM::VQRSHRNsv2i32: + case ARM::VQRSHRNuv2i32: + case ARM::VQRSHRUNv2i32: + case ARM::VQSHLsiv2i32: + case ARM::VQSHLsiv4i32: + case ARM::VQSHLsuv2i32: + case ARM::VQSHLsuv4i32: + case ARM::VQSHLsv2i32: + case ARM::VQSHLsv4i32: + case ARM::VQSHLuiv2i32: + case ARM::VQSHLuiv4i32: + case ARM::VQSHLuv2i32: + case ARM::VQSHLuv4i32: + case ARM::VQSHRNsv2i32: + case ARM::VQSHRNuv2i32: + case ARM::VQSHRUNv2i32: + case ARM::VQSUBsv2i32: + case ARM::VQSUBsv4i32: + case ARM::VQSUBuv2i32: + case ARM::VQSUBuv4i32: + case ARM::VRADDHNv2i32: + case ARM::VRHADDsv2i32: + case ARM::VRHADDsv4i32: + case ARM::VRHADDuv2i32: + case ARM::VRHADDuv4i32: + case ARM::VRSHLsv2i32: + case ARM::VRSHLsv4i32: + case ARM::VRSHLuv2i32: + case ARM::VRSHLuv4i32: + case ARM::VRSHRNv2i32: + case ARM::VRSHRsv2i32: + case ARM::VRSHRsv4i32: + case ARM::VRSHRuv2i32: + case ARM::VRSHRuv4i32: + case ARM::VRSRAsv2i32: + case ARM::VRSRAsv4i32: + case ARM::VRSRAuv2i32: + case ARM::VRSRAuv4i32: + case ARM::VRSUBHNv2i32: + case ARM::VSHLLsv4i32: + case ARM::VSHLLuv4i32: + case ARM::VSHLiv2i32: + case ARM::VSHLiv4i32: + case ARM::VSHLsv2i32: + case ARM::VSHLsv4i32: + case ARM::VSHLuv2i32: + case ARM::VSHLuv4i32: + case ARM::VSHRNv2i32: + case ARM::VSHRsv2i32: + case ARM::VSHRsv4i32: + case ARM::VSHRuv2i32: + case ARM::VSHRuv4i32: + case ARM::VSLIv2i32: + case ARM::VSLIv4i32: + case ARM::VSRAsv2i32: + case ARM::VSRAsv4i32: + case ARM::VSRAuv2i32: + case ARM::VSRAuv4i32: + case ARM::VSRIv2i32: + case ARM::VSRIv4i32: + case ARM::VSUBHNv2i32: + case ARM::VSUBLsv4i32: + case ARM::VSUBLuv4i32: + case 
ARM::VSUBWsv4i32:
+  case ARM::VSUBWuv4i32:
+  case ARM::VSUBv2i32:
+  case ARM::VSUBv4i32:
+  case ARM::VTSTv2i32:
+  case ARM::VTSTv4i32:
+    return true;
+  default:
+    return false;
+  }
+}
+
+struct DQRegDesc {
+  SmallVector<Register, 4> SRegs;
+};
+
+template <> struct llvm::DenseMapInfo<DQRegDesc> {
+  static inline DQRegDesc getEmptyKey() { return DQRegDesc(); }
+  static inline DQRegDesc getTombstoneKey() {
+    DQRegDesc DQReg;
+    DQReg.SRegs.push_back(0);
+    return DQReg;
+  }
+  static unsigned getHashValue(const DQRegDesc &DQReg) {
+    unsigned Val = 0;
+    for (unsigned I = 0; I < DQReg.SRegs.size(); ++I) {
+      Val |= DQReg.SRegs[I] << (I * 8);
+    }
+    return Val;
+  }
+  static bool isEqual(const DQRegDesc &LHS, const DQRegDesc &RHS) {
+    if (LHS.SRegs.size() != RHS.SRegs.size())
+      return false;
+
+    for (unsigned I = 0; I < LHS.SRegs.size(); ++I) {
+      if (LHS.SRegs[I] != RHS.SRegs[I])
+        return false;
+    }
+    return true;
+  }
+};
+
+static bool matchSRegSequence(MachineInstr &MI, const MachineRegisterInfo &MRI,
+                              SmallVectorImpl<Register> &SRegs) {
+  if (!MI.isRegSequence())
+    return false;
+
+  // We're looking for either a D-reg (2 x S-reg) or a Q-reg (4 x S-reg).
+  assert(MI.getNumOperands() % 2 == 1);
+  if (MI.getNumOperands() < 5)
+    return false;
+
+  for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); OpIdx += 2) {
+    MachineOperand &MOP = MI.getOperand(OpIdx);
+    Register SReg = MOP.getReg();
+    if (MRI.getRegClass(SReg) != &ARM::SPRRegClass)
+      return false;
+    uint16_t Index = 0;
+    switch (MI.getOperand(OpIdx + 1).getImm()) {
+    case ARM::ssub_0:
+      Index = 0;
+      break;
+    case ARM::ssub_1:
+      Index = 1;
+      break;
+    case ARM::ssub_2:
+      Index = 2;
+      break;
+    case ARM::ssub_3:
+      Index = 3;
+      break;
+    default:
+      llvm_unreachable("Unhandled index.");
+    }
+
+    if (SRegs.empty()) {
+      unsigned NumSubregs = (MI.getNumOperands() - 1) / 2;
+      SRegs.resize(NumSubregs);
+      assert((SRegs.size() == 2 || SRegs.size() == 4) &&
+             "Unexpected number of sub-registers in a REG_SEQUENCE");
+    }
+    assert(SReg);
+    SRegs[Index] = SReg;
+  }
+  return true;
+}
+
+static bool matchSRegInsertSubreg(MachineInstr &MI,
+                                  const MachineRegisterInfo &MRI,
+                                  Register &SrcReg,
+                                  SmallVectorImpl<Register> &SRegs) {
+  if (!MI.isInsertSubreg())
+    return false;
+
+  SrcReg = MI.getOperand(1).getReg();
+  Register SReg = MI.getOperand(2).getReg();
+  if (MRI.getRegClass(SReg) != &ARM::SPRRegClass)
+    return false;
+
+  uint16_t Index = 0;
+  switch (MI.getOperand(3).getImm()) {
+  case ARM::ssub_0:
+    Index = 0;
+    break;
+  case ARM::ssub_1:
+    Index = 1;
+    break;
+  case ARM::ssub_2:
+    Index = 2;
+    break;
+  case ARM::ssub_3:
+    Index = 3;
+    break;
+  default:
+    llvm_unreachable("Unhandled index.");
+  }
+
+  if (SRegs.empty()) {
+    const TargetRegisterClass *RC = MRI.getRegClass(SrcReg);
+    bool IsQReg = RC == &ARM::QPRRegClass || RC == &ARM::MQPRRegClass ||
+                  RC == &ARM::QPR_VFP2RegClass;
+
+    if (IsQReg)
+      SRegs.resize(4);
+    else
+      SRegs.resize(2);
+  }
+
+  assert(SReg);
+  SRegs[Index] = SReg;
+  return true;
+}
+
+// Insert a pair of VMOVRS and VSETLNi32 to copy an S-register using a
+// scalar write to the corresponding D-register.
+Register insertSRegCopy(Register DReg, Register SReg, uint16_t Lane,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertPt, DebugLoc DL,
+                        const TargetInstrInfo &TII, MachineRegisterInfo &MRI) {
+  // FIXME: VSETLN doesn't work with S-registers, so we have to copy
+  // via a core register. Is there a better way to do this?
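+  // Schematically, the sequence built below is (register names are purely
+  // illustrative; the pass operates on virtual registers):
+  //   vmov    r0, s1          @ VMOVRS:    S-reg -> core reg
+  //   vmov.32 d0[lane], r0    @ VSETLNi32: core reg -> D-reg lane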
+  Register TmpGPReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
+  MachineInstr *TmpCopyDef =
+      BuildMI(MBB, InsertPt, DL, TII.get(ARM::VMOVRS), TmpGPReg)
+          .addReg(SReg)
+          .add(predOps(ARMCC::AL));
+  LLVM_DEBUG(dbgs() << "New instr: " << *TmpCopyDef);
+
+  Register NewReg = MRI.cloneVirtualRegister(DReg);
+  MachineInstr *NewDef =
+      BuildMI(MBB, InsertPt, DL, TII.get(ARM::VSETLNi32), NewReg)
+          .addReg(DReg)
+          .addReg(TmpGPReg)
+          .addImm(Lane)
+          .add(predOps(ARMCC::AL));
+  LLVM_DEBUG(dbgs() << "New instr: " << *NewDef);
+
+  return NewReg;
+}
+
+// Insert a VLD1 to replace a VLDRS instruction.
+Register insertVLD1FromVLDRS(Register DReg, Register AddrReg,
+                             unsigned VLDRSOffset, uint16_t Lane,
+                             MachineBasicBlock &MBB,
+                             MachineBasicBlock::iterator InsertPt, DebugLoc DL,
+                             const TargetInstrInfo &TII,
+                             MachineRegisterInfo &MRI) {
+  if (VLDRSOffset != 0) {
+    unsigned Offset = ARM_AM::getAM5Offset(VLDRSOffset);
+    unsigned UnscaledOffset = Offset * 4;
+    bool IsSub = ARM_AM::getAM5Op(VLDRSOffset) == ARM_AM::sub;
+    Register NewAddr = MRI.cloneVirtualRegister(AddrReg);
+
+    ARMFunctionInfo *AFI = MBB.getParent()->getInfo<ARMFunctionInfo>();
+    unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri : ARM::t2ADDri;
+    unsigned SUBriOpc = !AFI->isThumbFunction() ? ARM::SUBri : ARM::t2SUBri;
+
+    MachineInstr *NewAddrDef =
+        BuildMI(MBB, InsertPt, DL, TII.get(IsSub ? SUBriOpc : ADDriOpc),
+                NewAddr)
+            .addReg(AddrReg)
+            .addImm(UnscaledOffset)
+            .add(predOps(ARMCC::AL))
+            .add(condCodeOp());
+    LLVM_DEBUG(dbgs() << "New instr: " << *NewAddrDef);
+    AddrReg = NewAddr;
+  }
+
+  Register NewReg = MRI.cloneVirtualRegister(DReg);
+  unsigned Alignment = 0; // conservatively set to zero
+  MachineInstr *NewDef =
+      BuildMI(MBB, InsertPt, DL, TII.get(ARM::VLD1LNd32), NewReg)
+          .addReg(AddrReg)
+          .addImm(Alignment)
+          .addReg(DReg)
+          .addImm(Lane)
+          .add(predOps(ARMCC::AL));
+  LLVM_DEBUG(dbgs() << "New instr: " << *NewDef);
+
+  return NewReg;
+}
+
+// Ideally find the def instruction of the SReg and replace it with a
+// direct write to a D-register scalar. If there is no equivalent
+// instruction for the def, copy the SReg to a D-register scalar.
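+// Currently only a VLDRS def with a register base address is rewritten in
+// place (into a single-lane VLD1); an IMPLICIT_DEF is simply dropped, and any
+// other producer falls back to the core-register copy from insertSRegCopy.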
+Register rewriteSRegDef(Register DReg, Register SReg, uint16_t Lane,
+                        MachineBasicBlock &MBB,
+                        MachineBasicBlock::iterator InsertPt, DebugLoc DL,
+                        const TargetInstrInfo &TII, MachineRegisterInfo &MRI) {
+  assert(SReg);
+  MachineOperand *SRegDefOp = MRI.getOneDef(SReg);
+  if (!SRegDefOp || &MBB != SRegDefOp->getParent()->getParent()) {
+    LLVM_DEBUG(dbgs() << "No def found in the current BB\n");
+    return insertSRegCopy(DReg, SReg, Lane, MBB, InsertPt, DL, TII, MRI);
+  }
+  MachineInstr *SRegDef = SRegDefOp->getParent();
+
+  LLVM_DEBUG(dbgs() << "Rewriting " << *SRegDef);
+  switch (SRegDef->getOpcode()) {
+  case ARM::VLDRS: {
+    const MachineOperand &AddrOp = SRegDef->getOperand(1);
+    if (!AddrOp.isReg()) {
+      return insertSRegCopy(DReg, SReg, Lane, MBB, InsertPt, DL, TII, MRI);
+    }
+
+    Register AddrReg = AddrOp.getReg();
+    unsigned Offset = SRegDef->getOperand(2).getImm();
+    return insertVLD1FromVLDRS(DReg, AddrReg, Offset, Lane, MBB, InsertPt, DL,
+                               TII, MRI);
+  }
+  case TargetOpcode::IMPLICIT_DEF: {
+    return DReg;
+  }
+  default: {
+    return insertSRegCopy(DReg, SReg, Lane, MBB, InsertPt, DL, TII, MRI);
+  }
+  }
+  llvm_unreachable("Unhandled instruction");
+}
+
+typedef DenseMap<DQRegDesc, SmallVector<MachineOperand *, 4>> DQRegToUsersMap;
+
+static void findHazardCandidates(MachineInstr &MI, MachineRegisterInfo &MRI,
+                                 DQRegToUsersMap &HazardCandidates) {
+  LLVM_DEBUG(dbgs() << "Scanning: " << MI);
+  if (!isFPASIMDInstr(MI))
+    return;
+  for (MachineOperand &MOP : MI.uses()) {
+    if (!MOP.isReg() || MOP.isImplicit())
+      continue;
+    MachineInstr *MOPDef = MRI.getUniqueVRegDef(MOP.getReg());
+    if (!MOPDef) {
+      LLVM_DEBUG(dbgs() << "Unable to find a unique def for " << MOP << '\n');
+      continue;
+    }
+
+    DQRegDesc DQReg;
+    if (matchSRegSequence(*MOPDef, MRI, DQReg.SRegs)) {
+      LLVM_DEBUG(dbgs() << "Candidate: " << MOP << '\n');
+      HazardCandidates[DQReg].push_back(&MOP);
+      ++NumHazardsFound;
+      continue;
+    }
+    Register SrcReg;
+    if (matchSRegInsertSubreg(*MOPDef, MRI, SrcReg, DQReg.SRegs)) {
+      MachineInstr *SrcRegDef = MRI.getUniqueVRegDef(SrcReg);
+      if (!SrcRegDef) {
+        LLVM_DEBUG(dbgs() << "Unable to find a unique def for "
+                          << MRI.getVRegName(SrcReg) << '\n');
+        continue;
+      }
+      if (!SrcRegDef->isImplicitDef()) {
+        // There is no real obstacle to supporting such cases, but I haven't
+        // seen this in any of the tests.
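+        // ("Non-trivial" here means the INSERT_SUBREG source is not an
+        // IMPLICIT_DEF, i.e. some lanes already hold a value that would have
+        // to be preserved when rebuilding the D/Q register.)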
+        LLVM_DEBUG(dbgs() << "Non-trivial INSERT_SUBREG is not supported\n");
+        ++NumHazardsNotHandled;
+        continue;
+      }
+      LLVM_DEBUG(dbgs() << "Candidate: " << MOP << '\n');
+      ++NumHazardsFound;
+      HazardCandidates[DQReg].push_back(&MOP);
+    }
+  }
+}
+
+Register defineDReg(MachineBasicBlock::iterator InsertPt,
+                    MachineBasicBlock &MBB, DebugLoc DL,
+                    MachineRegisterInfo &MRI, const TargetInstrInfo &TII) {
+  Register Reg = MRI.createVirtualRegister(&ARM::DPR_VFP2RegClass);
+  MachineInstr *Def =
+      BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::IMPLICIT_DEF), Reg);
+  LLVM_DEBUG(dbgs() << "New instr: " << *Def);
+  return Reg;
+}
+
+Register defineDRegSeq(ArrayRef<Register> DRegs,
+                       MachineBasicBlock::iterator InsertPt,
+                       MachineBasicBlock &MBB, DebugLoc DL,
+                       MachineRegisterInfo &MRI, const TargetInstrInfo &TII) {
+  Register Reg = MRI.createVirtualRegister(&ARM::QPR_VFP2RegClass);
+  assert(DRegs.size() == 2);
+  MachineInstr *RegSeq =
+      BuildMI(MBB, InsertPt, DL, TII.get(ARM::REG_SEQUENCE), Reg)
+          .addReg(DRegs[0])
+          .addImm(ARM::dsub_0)
+          .addReg(DRegs[1])
+          .addImm(ARM::dsub_1);
+  LLVM_DEBUG(dbgs() << "New instr: " << *RegSeq);
+  return Reg;
+}
+
+bool ARMSubregWrite::runOnBasicBlock(MachineBasicBlock &MBB,
+                                     MachineRegisterInfo &MRI) {
+  assert(MRI.isSSA());
+
+  LLVM_DEBUG(dbgs() << "Running on MBB: " << MBB
+                    << " - scanning instructions...\n");
+  DQRegToUsersMap HazardCandidates;
+  for (MachineInstr &MI : MBB) {
+    findHazardCandidates(MI, MRI, HazardCandidates);
+  }
+  LLVM_DEBUG(dbgs() << "Scan complete, found " << HazardCandidates.size()
+                    << " register forwarding hazards.\n");
+  if (HazardCandidates.empty())
+    return false;
+
+  for (auto &KV : HazardCandidates) {
+    const DQRegDesc &DQReg = KV.first;
+    SmallVectorImpl<MachineOperand *> &FixOps = KV.second;
+    assert(!FixOps.empty());
+
+    MachineInstr *DQRegUser = FixOps[0]->getParent();
+    MachineBasicBlock::iterator InsertPt(DQRegUser);
+    DebugLoc DL = DQRegUser->getDebugLoc();
+
+    SmallVector<Register, 2> DRegs;
+    DRegs.push_back(defineDReg(InsertPt, MBB, DL, MRI, *TII));
+    if (DQReg.SRegs.size() == 4) {
+      DRegs.push_back(defineDReg(InsertPt, MBB, DL, MRI, *TII));
+    }
+
+    for (unsigned I = 0; I < DQReg.SRegs.size(); ++I) {
+      Register SReg = DQReg.SRegs[I];
+      if (!SReg)
+        continue;
+      unsigned DRegIndex = I / 2;
+      unsigned DRegLane = I % 2;
+      Register &DReg = DRegs[DRegIndex];
+      DReg = rewriteSRegDef(DReg, SReg, DRegLane, MBB, InsertPt, DL, *TII, MRI);
+    }
+
+    Register NewReg = (DRegs.size() == 1)
+                          ? DRegs[0]
+                          : defineDRegSeq(DRegs, InsertPt, MBB, DL, MRI, *TII);
+
+    for (MachineOperand *Op : FixOps) {
+      Op->setReg(NewReg);
+    }
+  }
+
+  LLVM_DEBUG(dbgs() << "Final MBB:\n" << MBB);
+  return /*Changed*/ true;
+}
+
+FunctionPass *llvm::createARMSubregWrite() { return new ARMSubregWrite(); }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -75,6 +75,11 @@
     EnableGlobalMerge("arm-global-merge", cl::Hidden,
                       cl::desc("Enable the global merge pass"));
 
+static cl::opt<bool>
+    EnableSubregWriteOpt("arm-subreg-write", cl::Hidden,
+                         cl::desc("Optimize subregister writes"),
+                         cl::init(false));
+
 namespace llvm {
 void initializeARMExecutionDomainFixPass(PassRegistry&);
 }
@@ -102,6 +107,7 @@
   initializeARMBlockPlacementPass(Registry);
   initializeMVEGatherScatterLoweringPass(Registry);
   initializeARMSLSHardeningPass(Registry);
+  initializeARMSubregWritePass(Registry);
 }
 
 static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -502,6 +508,9 @@
 
     if (!DisableA15SDOptimization)
       addPass(createA15SDOptimizerPass());
+
+    if (EnableSubregWriteOpt)
+      addPass(createARMSubregWrite());
   }
 }
diff --git a/llvm/lib/Target/ARM/CMakeLists.txt b/llvm/lib/Target/ARM/CMakeLists.txt
--- a/llvm/lib/Target/ARM/CMakeLists.txt
+++ b/llvm/lib/Target/ARM/CMakeLists.txt
@@ -50,6 +50,7 @@
   ARMRegisterBankInfo.cpp
   ARMSelectionDAGInfo.cpp
   ARMSLSHardening.cpp
+  ARMSubregWrite.cpp
   ARMSubtarget.cpp
   ARMTargetMachine.cpp
   ARMTargetObjectFile.cpp
diff --git a/llvm/test/CodeGen/ARM/arm-subreg-write.ll b/llvm/test/CodeGen/ARM/arm-subreg-write.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/ARM/arm-subreg-write.ll
@@ -0,0 +1,199 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -arm-subreg-write | FileCheck %s
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 -arm-subreg-write -filetype=obj -o - | llvm-objdump -d - | FileCheck %s --check-prefix=CHECK-OBJ
+; RUN: llc < %s -mtriple armv8 -mattr=+neon,+fp-armv8 | FileCheck %s --check-prefix NOOPT
+
+declare <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float>, <2 x float>) nounwind readnone
+declare <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float>, <4 x float>) nounwind readnone
+
+; VLDR to an S-register should be replaced by a scalar VLD1
+define <2 x float> @foo(float* %0, float* %1, i64 %offset) {
+; CHECK-LABEL: foo:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    add r1, r1, r2, lsl #2
+; CHECK-NEXT:    vld1.32 {d0[0]}, [r0]
+; CHECK-NEXT:    vld1.32 {d1[0]}, [r1]
+; CHECK-NEXT:    vmaxnm.f32 d16, d0, d1
+; CHECK-NEXT:    vmov r0, r1, d16
+; CHECK-NEXT:    bx lr
+;
+; NOOPT-LABEL: foo:
+; NOOPT:       @ %bb.0:
+; NOOPT-NEXT:    vldr s0, [r0]
+; NOOPT-NEXT:    add r0, r1, r2, lsl #2
+; NOOPT-NEXT:    vldr s2, [r0]
+; NOOPT-NEXT:    vmaxnm.f32 d16, d0, d1
+; NOOPT-NEXT:    vmov r0, r1, d16
+; NOOPT-NEXT:    bx lr
+  %3 = load float, float* %0, align 4
+  %4 = insertelement <2 x float> undef, float %3, i32 0
+
+  %5 = getelementptr float, float* %1, i64 %offset
+  %6 = load float, float* %5, align 4
+  %7 = insertelement <2 x float> undef, float %6, i32 0
+
+  %8 = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %4, <2 x float> %7)
+  ret <2 x float> %8
+}
+
+; If the original VLDR has an address offset, add a new instruction to
+; compute the address for VLD1
+define <2 x float> @bar(float* %0, float* %1, float* %2, float* %3, i64 %offset) {
+; CHECK-LABEL: bar:
+; CHECK:       @ %bb.0:
+; CHECK-NEXT:    ldr r1,
[sp] +; CHECK-NEXT: vld1.32 {d16[0]}, [r2] +; CHECK-NEXT: add r1, r0, r1, lsl #2 +; CHECK-NEXT: add r0, r0, #4 +; CHECK-NEXT: vld1.32 {d16[1]}, [r3] +; CHECK-NEXT: vld1.32 {d17[0]}, [r0] +; CHECK-NEXT: vld1.32 {d17[1]}, [r1] +; CHECK-NEXT: vmaxnm.f32 d16, d17, d16 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr +; +; NOOPT-LABEL: bar: +; NOOPT: @ %bb.0: +; NOOPT-NEXT: ldr r1, [sp] +; NOOPT-NEXT: vldr s2, [r0, #4] +; NOOPT-NEXT: vldr s1, [r3] +; NOOPT-NEXT: add r0, r0, r1, lsl #2 +; NOOPT-NEXT: vldr s0, [r2] +; NOOPT-NEXT: vldr s3, [r0] +; NOOPT-NEXT: vmaxnm.f32 d16, d1, d0 +; NOOPT-NEXT: vmov r0, r1, d16 +; NOOPT-NEXT: bx lr + %gep0 = getelementptr float, float* %0, i64 1 + %gep1 = getelementptr float, float* %0, i64 %offset + %f0 = load float, float* %gep0, align 4 + %f1 = load float, float* %gep1, align 4 + %f2 = load float, float* %2, align 4 + %f3 = load float, float* %3, align 4 + + %va0 = insertelement <2 x float> undef, float %f0, i32 0 + %va1 = insertelement <2 x float> %va0, float %f1, i32 1 + + %vb0 = insertelement <2 x float> undef, float %f2, i32 0 + %vb1 = insertelement <2 x float> %vb0, float %f3, i32 1 + + %res = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %va1, <2 x float> %vb1) + ret <2 x float> %res +} + +; If a def instruction cannot be replaced by a scalar variant, copy an +; S-register via a core register. +define <2 x float> @baz(double %0, <2 x float> %vf0) { +; CHECK-LABEL: baz: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vcvt.f32.f64 s0, d16 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov d16, r2, r3 +; CHECK-NEXT: vmov.32 d0[0], r0 +; CHECK-NEXT: vmaxnm.f32 d16, d16, d0 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: bx lr +; +; NOOPT-LABEL: baz: +; NOOPT: @ %bb.0: +; NOOPT-NEXT: vmov d16, r0, r1 +; NOOPT-NEXT: vcvt.f32.f64 s0, d16 +; NOOPT-NEXT: vmov d16, r2, r3 +; NOOPT-NEXT: vmaxnm.f32 d16, d16, d0 +; NOOPT-NEXT: vmov r0, r1, d16 +; NOOPT-NEXT: bx lr + %f = fptrunc double %0 to float + %vf1 = insertelement <2 x float> undef, float %f, i32 0 + %res = call <2 x float> @llvm.arm.neon.vmaxnm.v2f32(<2 x float> %vf0, <2 x float> %vf1) + ret <2 x float> %res +} + +; Check that Q-registers are handled as well +define <4 x float> @foov4(float* %0, float* %1, float* %2, float* %3, float* %4) { +; CHECK-LABEL: foov4: +; CHECK: @ %bb.0: +; CHECK-NEXT: vld1.32 {d16[0]}, [r0] +; CHECK-NEXT: ldr r12, [sp] +; CHECK-NEXT: vld1.32 {d16[1]}, [r1] +; CHECK-NEXT: vld1.32 {d17[0]}, [r2] +; CHECK-NEXT: vld1.32 {d17[1]}, [r3] +; CHECK-NEXT: vld1.32 {d18[0]}, [r12] +; CHECK-NEXT: vmaxnm.f32 q8, q8, q9 +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr +; +; NOOPT-LABEL: foov4: +; NOOPT: @ %bb.0: +; NOOPT-NEXT: vldr s3, [r3] +; NOOPT-NEXT: ldr r12, [sp] +; NOOPT-NEXT: vldr s2, [r2] +; NOOPT-NEXT: vldr s1, [r1] +; NOOPT-NEXT: vldr s0, [r0] +; NOOPT-NEXT: vldr s4, [r12] +; NOOPT-NEXT: vmaxnm.f32 q8, q0, q1 +; NOOPT-NEXT: vmov r0, r1, d16 +; NOOPT-NEXT: vmov r2, r3, d17 +; NOOPT-NEXT: bx lr + %s0 = load float, float* %0, align 4 + %s1 = load float, float* %1, align 4 + %s2 = load float, float* %2, align 4 + %s3 = load float, float* %3, align 4 + + %v0 = insertelement <4 x float> undef, float %s0, i32 0 + %v1 = insertelement <4 x float> %v0, float %s1, i32 1 + %v2 = insertelement <4 x float> %v1, float %s2, i32 2 + %v3 = insertelement <4 x float> %v2, float %s3, i32 3 + + %s4 = load float, float* %4, align 4 + %v4 = insertelement <4 x float> undef, float %s4, i32 0 + %v5 = insertelement <4 x float> %v4, float %s4, i32 1 
+ %v6 = insertelement <4 x float> %v5, float %s4, i32 2 + %v7 = insertelement <4 x float> %v6, float %s4, i32 3 + + %res = call <4 x float> @llvm.arm.neon.vmaxnm.v4f32(<4 x float> %v3, <4 x float> %v4) + ret <4 x float> %res +} + +; SIMD instructions with scalar operands are encoded incorrectly when +; an incorrect register class is used for a replacement D- or +; Q-register. This is not shown in llc output, but can be checked with +; objdump. +define <4 x float> @scalar_instrs(<4 x float> %mul0, float %mul1, float %mul2) { +; These checks are not autogenerated, but copied from the checks below. +; CHECK-OBJ-LABEL: : +; CHECK-OBJ: vmov r0, s0 +; CHECK-OBJ-NEXT: vmov.32 d0[0], r0 +; CHECK-OBJ-NEXT: vmul.f32 q8, q8, d0[0] +; +; CHECK-LABEL: scalar_instrs: +; CHECK: @ %bb.0: +; CHECK-NEXT: vldr s0, [sp, #4] +; CHECK-NEXT: vmov d17, r2, r3 +; CHECK-NEXT: vldr s2, [sp] +; CHECK-NEXT: vmov d16, r0, r1 +; CHECK-NEXT: vmul.f32 s0, s2, s0 +; CHECK-NEXT: vmov r0, s0 +; CHECK-NEXT: vmov.32 d0[0], r0 +; CHECK-NEXT: vmul.f32 q8, q8, d0[0] +; CHECK-NEXT: vmov r0, r1, d16 +; CHECK-NEXT: vmov r2, r3, d17 +; CHECK-NEXT: bx lr +; +; NOOPT-LABEL: scalar_instrs: +; NOOPT: @ %bb.0: +; NOOPT-NEXT: vldr s0, [sp, #4] +; NOOPT-NEXT: vmov d17, r2, r3 +; NOOPT-NEXT: vldr s2, [sp] +; NOOPT-NEXT: vmov d16, r0, r1 +; NOOPT-NEXT: vmul.f32 s0, s2, s0 +; NOOPT-NEXT: vmul.f32 q8, q8, d0[0] +; NOOPT-NEXT: vmov r0, r1, d16 +; NOOPT-NEXT: vmov r2, r3, d17 +; NOOPT-NEXT: bx lr + %mul12 = fmul float %mul1, %mul2 + %mul12vec = insertelement <4 x float> undef, float %mul12, i32 0 + %mul12shuffle = shufflevector <4 x float> %mul12vec, <4 x float> undef, <4 x i32> zeroinitializer + %mul = fmul <4 x float> %mul0, %mul12shuffle + ret <4 x float> %mul +}
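+
+; Note (summary, not a FileCheck assertion): the pass sits behind the
+; off-by-default -arm-subreg-write flag and only runs when that flag is passed
+; explicitly, as in the RUN lines above, e.g.
+;   llc -mtriple=armv8 -mattr=+neon,+fp-armv8 -arm-subreg-write <input.ll>
+; In abridged form (address computation omitted, registers illustrative), its
+; effect on @foo above is to turn the scalar VLDR loads feeding the D-register
+; inputs of vmaxnm.f32 into lane loads:
+;   vldr s0, [rN]            ->   vld1.32 {d0[0]}, [rN]
+;   vldr s2, [rM]            ->   vld1.32 {d1[0]}, [rM]
+;   vmaxnm.f32 d16, d0, d1        vmaxnm.f32 d16, d0, d1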