Index: llvm/lib/Target/AArch64/AArch64.h =================================================================== --- llvm/lib/Target/AArch64/AArch64.h +++ llvm/lib/Target/AArch64/AArch64.h @@ -39,7 +39,7 @@ FunctionPass *createAArch64StorePairSuppressPass(); FunctionPass *createAArch64ExpandPseudoPass(); FunctionPass *createAArch64LoadStoreOptimizationPass(); -FunctionPass *createAArch64VectorByElementOptPass(); +FunctionPass *createAArch64SIMDInstrOptPass(); ModulePass *createAArch64PromoteConstantPass(); FunctionPass *createAArch64ConditionOptimizerPass(); FunctionPass *createAArch64A57FPLoadBalancing(); @@ -64,7 +64,7 @@ void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&); void initializeAArch64ExpandPseudoPass(PassRegistry&); void initializeAArch64LoadStoreOptPass(PassRegistry&); -void initializeAArch64VectorByElementOptPass(PassRegistry&); +void initializeAArch64SIMDInstrOptPass(PassRegistry&); void initializeAArch64PromoteConstantPass(PassRegistry&); void initializeAArch64RedundantCopyEliminationPass(PassRegistry&); void initializeAArch64StorePairSuppressPass(PassRegistry&); Index: llvm/lib/Target/AArch64/AArch64TargetMachine.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -157,7 +157,7 @@ initializeAArch64DeadRegisterDefinitionsPass(*PR); initializeAArch64ExpandPseudoPass(*PR); initializeAArch64LoadStoreOptPass(*PR); - initializeAArch64VectorByElementOptPass(*PR); + initializeAArch64SIMDInstrOptPass(*PR); initializeAArch64PromoteConstantPass(*PR); initializeAArch64RedundantCopyEliminationPass(*PR); initializeAArch64StorePairSuppressPass(*PR); @@ -455,7 +455,7 @@ addPass(&EarlyIfConverterID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); - addPass(createAArch64VectorByElementOptPass()); + addPass(createAArch64SIMDInstrOptPass()); return true; } Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp =================================================================== --- llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp +++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp @@ -1,4 +1,3 @@ -//=- AArch64VectorByElementOpt.cpp - AArch64 vector by element inst opt pass =// // // The LLVM Compiler Infrastructure // @@ -7,19 +6,27 @@ // //===----------------------------------------------------------------------===// // -// This file contains a pass that performs optimization for vector by element -// SIMD instructions. -// -// Certain SIMD instructions with vector element operand are not efficient. -// Rewrite them into SIMD instructions with vector operands. This rewrite -// is driven by the latency of the instructions. +// This file contains a pass that performs optimization on SIMD instructions +// with high latency by splitting them into more efficient series of +// instructions. // +// 1. Rewrite certain SIMD instructions with vector element due to their +// inefficiency on some targets. // Example: // fmla v0.4s, v1.4s, v2.s[1] // is rewritten into // dup v3.4s, v2.s[1] // fmla v0.4s, v1.4s, v3.4s // +// 2. Rewrite Interleaved memory access instructions due to their +// inefficiency on some targets. 
+// Example:
+// st2 {v0.4s, v1.4s}, addr
+// is rewritten into
+// zip1 v2.4s, v0.4s, v1.4s
+// zip2 v3.4s, v0.4s, v1.4s
+// stp q2, q3, addr
+//
 //===----------------------------------------------------------------------===//

 #include "AArch64InstrInfo.h"
@@ -43,47 +50,124 @@
 using namespace llvm;

-#define DEBUG_TYPE "aarch64-vectorbyelement-opt"
+#define DEBUG_TYPE "aarch64-simdinstr-opt"

 STATISTIC(NumModifiedInstr,
-          "Number of vector by element instructions modified");
+          "Number of SIMD instructions modified");

 #define AARCH64_VECTOR_BY_ELEMENT_OPT_NAME                                     \
-  "AArch64 vector by element instruction optimization pass"
+  "AArch64 SIMD instructions optimization pass"

 namespace {

-struct AArch64VectorByElementOpt : public MachineFunctionPass {
+struct AArch64SIMDInstrOpt : public MachineFunctionPass {
   static char ID;

   const TargetInstrInfo *TII;
   MachineRegisterInfo *MRI;
   TargetSchedModel SchedModel;

-  AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
-    initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
+  // The two maps below are used to cache decisions instead of recomputing:
+  // This is used to cache instruction replacement decisions within a function
+  // and across functions.
+  std::map<std::pair<unsigned, std::string>, bool> SIMDInstrTable;
+  // This is used to cache the decision of whether to exit the interleaved
+  // store replacement subpass early for a particular target.
+  std::map<std::string, bool> InterlEarlyExit;
+
+  typedef enum {
+    VectorElem,
+    Interleave
+  } Subpass;
+
+  // Instruction represented by OrigOpc is replaced by instructions in ReplOpc.
+  struct InstReplInfo {
+    unsigned OrigOpc;
+    std::vector<unsigned> ReplOpc;
+    const TargetRegisterClass RC;
+  };
+
+#define RuleST2(OpcOrg, OpcR0, OpcR1, OpcR2, RC) \
+  {OpcOrg, {OpcR0, OpcR1, OpcR2}, RC}
+#define RuleST4(OpcOrg, OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, \
+                OpcR7, OpcR8, OpcR9, RC) \
+  {OpcOrg, {OpcR0, OpcR1, OpcR2, OpcR3, OpcR4, OpcR5, OpcR6, OpcR7, \
+            OpcR8, OpcR9}, RC}
+
+  // The Instruction Replacement Table:
+  std::vector<InstReplInfo> IRT = {
+    // ST2 instructions
+    RuleST2(AArch64::ST2Twov2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
+            AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST2(AArch64::ST2Twov4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
+            AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST2(AArch64::ST2Twov2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
+            AArch64::STPDi, AArch64::FPR64RegClass),
+    RuleST2(AArch64::ST2Twov8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
+            AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST2(AArch64::ST2Twov4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
+            AArch64::STPDi, AArch64::FPR64RegClass),
+    RuleST2(AArch64::ST2Twov16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
+            AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST2(AArch64::ST2Twov8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
+            AArch64::STPDi, AArch64::FPR64RegClass),
+    // ST4 instructions
+    RuleST4(AArch64::ST4Fourv2d, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
+            AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, AArch64::ZIP1v2i64,
+            AArch64::ZIP2v2i64, AArch64::ZIP1v2i64, AArch64::ZIP2v2i64,
+            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST4(AArch64::ST4Fourv4s, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
+            AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, AArch64::ZIP1v4i32,
+            AArch64::ZIP2v4i32, AArch64::ZIP1v4i32, AArch64::ZIP2v4i32,
+            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST4(AArch64::ST4Fourv2s, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
+            AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, AArch64::ZIP1v2i32,
+            AArch64::ZIP2v2i32, AArch64::ZIP1v2i32, AArch64::ZIP2v2i32,
+            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
+    RuleST4(AArch64::ST4Fourv8h, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
+            AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, AArch64::ZIP1v8i16,
+            AArch64::ZIP2v8i16, AArch64::ZIP1v8i16, AArch64::ZIP2v8i16,
+            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST4(AArch64::ST4Fourv4h, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
+            AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, AArch64::ZIP1v4i16,
+            AArch64::ZIP2v4i16, AArch64::ZIP1v4i16, AArch64::ZIP2v4i16,
+            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass),
+    RuleST4(AArch64::ST4Fourv16b, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
+            AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, AArch64::ZIP1v16i8,
+            AArch64::ZIP2v16i8, AArch64::ZIP1v16i8, AArch64::ZIP2v16i8,
+            AArch64::STPQi, AArch64::STPQi, AArch64::FPR128RegClass),
+    RuleST4(AArch64::ST4Fourv8b, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
+            AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, AArch64::ZIP1v8i8,
+            AArch64::ZIP2v8i8, AArch64::ZIP1v8i8, AArch64::ZIP2v8i8,
+            AArch64::STPDi, AArch64::STPDi, AArch64::FPR64RegClass)
+  };
+
+  // A costly instruction is replaced in this work by N efficient instructions.
+  // The maximum of N is currently 10 and it is for the ST4 case.
+  static const unsigned MaxNumRepl = 10;
+
+  AArch64SIMDInstrOpt() : MachineFunctionPass(ID) {
+    initializeAArch64SIMDInstrOptPass(*PassRegistry::getPassRegistry());
   }

   /// Based only on latency of instructions, determine if it is cost efficient
-  /// to replace the instruction InstDesc by the two instructions InstDescRep1
-  /// and InstDescRep2.
-  /// Return true if replacement is recommended.
-  bool
-  shouldReplaceInstruction(MachineFunction *MF, const MCInstrDesc *InstDesc,
-                           const MCInstrDesc *InstDescRep1,
-                           const MCInstrDesc *InstDescRep2,
-                           std::map<unsigned, bool> &VecInstElemTable) const;
-
-  /// Determine if we need to exit the vector by element instruction
-  /// optimization pass early. This makes sure that Targets with no need
-  /// for this optimization do not spent any compile time on this pass.
-  /// This check is done by comparing the latency of an indexed FMLA
-  /// instruction to the latency of the DUP + the latency of a vector
-  /// FMLA instruction. We do not check on other related instructions such
-  /// as FMLS as we assume that if the situation shows up for one
-  /// instruction, then it is likely to show up for the related ones.
-  /// Return true if early exit of the pass is recommended.
-  bool earlyExitVectElement(MachineFunction *MF);
+  /// to replace the instruction InstDesc by the instructions stored in the
+  /// array InstDescRepl.
+  /// Return true if replacement is expected to be faster.
+  bool shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
+                         SmallVectorImpl<const MCInstrDesc*> &ReplInstrMCID);
+
+  /// Determine if we need to exit the instruction replacement optimization
+  /// subpasses early. This makes sure that targets with no need for this
+  /// optimization do not spend any compile time on this subpass other than
+  /// the simple check performed here. This simple check is done by comparing
+  /// the latency of the original instruction to the latency of the
+  /// replacement instructions. We only check a representative instruction
+  /// from each class of instructions, not every concerned instruction. For
+  /// the VectorElem subpass we check the FMLA instruction, while for the
+  /// Interleave subpass we check the st2.4s instruction.
+  /// Return true if early exit of the subpass is recommended.
+  bool shouldExitEarly(MachineFunction *MF, Subpass SP);

   /// Check whether an equivalent DUP instruction has already been
   /// created or not.
@@ -96,8 +180,24 @@
   /// Rewrite them into SIMD instructions with vector operands. This rewrite
   /// is driven by the latency of the instructions.
   /// Return true if the SIMD instruction is modified.
-  bool optimizeVectElement(MachineInstr &MI,
-                           std::map<unsigned, bool> *VecInstElemTable) const;
+  bool optimizeVectElement(MachineInstr &MI);
+
+  /// Process the REG_SEQUENCE instruction, and extract the source
+  /// operands of the st2/st4 instruction from it.
+  /// Example of such an instruction:
+  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
+  /// Return true when the instruction is processed successfully.
+  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
+                         unsigned *StRegKill, unsigned NumArg) const;
+
+  /// Load/Store Interleaving instructions are not always beneficial.
+  /// Replace them by zip instructions and classical load/store.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeLdStInterleave(MachineInstr &MI);
+
+  /// Return the number of useful source registers for this
+  /// instruction (2 for st2 and 4 for st4).
+  unsigned determineSrcReg(MachineInstr &MI) const;

   bool runOnMachineFunction(MachineFunction &Fn) override;

@@ -106,84 +206,120 @@
   }
 };

-char AArch64VectorByElementOpt::ID = 0;
+char AArch64SIMDInstrOpt::ID = 0;

 } // end anonymous namespace

-INITIALIZE_PASS(AArch64VectorByElementOpt, "aarch64-vectorbyelement-opt",
+INITIALIZE_PASS(AArch64SIMDInstrOpt, "aarch64-simdinstr-opt",
                 AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

 /// Based only on latency of instructions, determine if it is cost efficient
-/// to replace the instruction InstDesc by the two instructions InstDescRep1
-/// and InstDescRep2. Note that it is assumed in this fuction that an
-/// instruction of type InstDesc is always replaced by the same two
-/// instructions as results are cached here.
-/// Return true if replacement is recommended.
-bool AArch64VectorByElementOpt::shouldReplaceInstruction(
-    MachineFunction *MF, const MCInstrDesc *InstDesc,
-    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
-    std::map<unsigned, bool> &VecInstElemTable) const {
-  // Check if replacment decision is alredy available in the cached table.
+/// to replace the instruction InstDesc by the instructions stored in the
+/// array InstDescRepl.
+/// Return true if replacement is expected to be faster.
+bool AArch64SIMDInstrOpt::
+shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
+                  SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
+  // Check if replacement decision is already available in the cached table.
   // if so, return it.
-  if (!VecInstElemTable.empty() &&
-      VecInstElemTable.find(InstDesc->getOpcode()) != VecInstElemTable.end())
-    return VecInstElemTable[InstDesc->getOpcode()];
+  std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+  std::pair<unsigned, std::string> InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
+  if (!SIMDInstrTable.empty() &&
+      SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
+    return SIMDInstrTable[InstID];

   unsigned SCIdx = InstDesc->getSchedClass();
-  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
-  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
   const MCSchedClassDesc *SCDesc =
-      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
-  const MCSchedClassDesc *SCDescRep1 =
-      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
-  const MCSchedClassDesc *SCDescRep2 =
-      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+    SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);

-  // If a subtarget does not define resources for any of the instructions
+  // If a subtarget does not define resources for the instructions
   // of interest, then return false for no replacement.
-  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
-      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
-      SCDescRep2->isVariant()) {
-    VecInstElemTable[InstDesc->getOpcode()] = false;
+  const MCSchedClassDesc *SCDescRepl;
+  if (!SCDesc->isValid() || SCDesc->isVariant())
+  {
+    SIMDInstrTable[InstID] = false;
     return false;
   }
+  for (auto IDesc : InstDescRepl)
+  {
+    SCDescRepl = SchedModel.getMCSchedModel()->getSchedClassDesc(
+      IDesc->getSchedClass());
+    if (!SCDescRepl->isValid() || SCDescRepl->isVariant())
+    {
+      SIMDInstrTable[InstID] = false;
+      return false;
+    }
+  }
+
+  // Replacement cost.
+  unsigned ReplCost = 0;
+  for (auto IDesc : InstDescRepl)
+    ReplCost += SchedModel.computeInstrLatency(IDesc->getOpcode());

-  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
-      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
-          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
-    VecInstElemTable[InstDesc->getOpcode()] = true;
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost)
+  {
+    SIMDInstrTable[InstID] = true;
     return true;
   }
-  VecInstElemTable[InstDesc->getOpcode()] = false;
-  return false;
+  else
+  {
+    SIMDInstrTable[InstID] = false;
+    return false;
+  }
 }

-/// Determine if we need to exit the vector by element instruction
-/// optimization pass early. This makes sure that Targets with no need
-/// for this optimization do not spent any compile time on this pass.
-/// This check is done by comparing the latency of an indexed FMLA
-/// instruction to the latency of the DUP + the latency of a vector
-/// FMLA instruction. We do not check on other related instructions such
-/// as FMLS as we assume that if the situation shows up for one
-/// instruction, then it is likely to show up for the related ones.
-/// Return true if early exit of the pass is recommended.
-bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
-  std::map<unsigned, bool> VecInstElemTable;
-  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
-  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
-  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
-
-  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
-                                VecInstElemTable))
-    return true;
-  return false;
+/// Determine if we need to exit the instruction replacement optimization
+/// subpasses early. This makes sure that targets with no need for this
+/// optimization do not spend any compile time on this subpass other than the
+/// simple check performed here. This simple check is done by comparing the
+/// latency of the original instruction to the latency of the replacement
+/// instructions. We only check a representative instruction from each class
+/// of instructions, not every concerned instruction. For the VectorElem
+/// subpass we check the FMLA instruction, while for the Interleave subpass
+/// we check the st2.4s instruction.
+/// Return true if early exit of the subpass is recommended.
+bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
+  const MCInstrDesc* OriginalMCID;
+  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
+
+  switch (SP) {
+  case VectorElem:
+    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+    ReplInstrMCID.push_back(&TII->get(AArch64::DUPv4i32lane));
+    ReplInstrMCID.push_back(&TII->get(AArch64::FMULv4f32));
+    if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID))
+      return false;
+    break;
+  case Interleave:
+    // Check if early exit decision is already available in the cached
+    // table or not.
+    std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+    if (!InterlEarlyExit.empty() &&
+        InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
+      return InterlEarlyExit[Subtarget];
+
+    for (auto &I : IRT) {
+      OriginalMCID = &TII->get(I.OrigOpc);
+      for (auto &Repl : I.ReplOpc)
+        ReplInstrMCID.push_back(&TII->get(Repl));
+      if (shouldReplaceInst(MF, OriginalMCID, ReplInstrMCID)) {
+        InterlEarlyExit[Subtarget] = false;
+        return false;
+      }
+      ReplInstrMCID.clear();
+    }
+    InterlEarlyExit[Subtarget] = true;
+    break;
+  }
+
+  return true;
 }

 /// Check whether an equivalent DUP instruction has already been
 /// created or not.
 /// Return true when the dup instruction already exists. In this case,
 /// DestReg will point to the destination of the already created DUP.
-bool AArch64VectorByElementOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
+bool AArch64SIMDInstrOpt::reuseDUP(MachineInstr &MI, unsigned DupOpcode,
                                          unsigned SrcReg, unsigned LaneNumber,
                                          unsigned *DestReg) const {
   for (MachineBasicBlock::iterator MII = MI, MIE = MI.getParent()->begin();
@@ -215,8 +351,7 @@
 ///   dup v3.4s, v2.s[1]  // dup not necessary if redundant
 ///   fmla v0.4s, v1.4s, v3.4s
 /// Return true if the SIMD instruction is modified.
-bool AArch64VectorByElementOpt::optimizeVectElement(
-    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+bool AArch64SIMDInstrOpt::optimizeVectElement(MachineInstr &MI) {
   const MCInstrDesc *MulMCID, *DupMCID;
   const TargetRegisterClass *RC = &AArch64::FPR128RegClass;

@@ -283,9 +418,11 @@
     break;
   }

-  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
-                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
-                                *VecInstElemTable))
+  SmallVector<const MCInstrDesc*, 2> ReplInstrMCID;
+  ReplInstrMCID.push_back(DupMCID);
+  ReplInstrMCID.push_back(MulMCID);
+  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
+                         ReplInstrMCID))
     return false;

   const DebugLoc &DL = MI.getDebugLoc();
@@ -305,7 +442,6 @@
   unsigned SrcReg2 = MI.getOperand(3).getReg();
   unsigned Src2IsKill = getKillRegState(MI.getOperand(3).isKill());
   unsigned LaneNumber = MI.getOperand(4).getImm();
-
   // Create a new DUP instruction. Note that if an equivalent DUP instruction
   // has already been created before, then use that one instread of creating
   // a new one.
@@ -338,7 +474,217 @@
   return true;
 }

-bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
+/// Load/Store Interleaving instructions are not always beneficial.
+/// Replace them by zip instructions and classical load/store.
+///
+/// Example:
+/// st2 {v0.4s, v1.4s}, addr
+/// is rewritten into
+/// zip1 v2.4s, v0.4s, v1.4s
+/// zip2 v3.4s, v0.4s, v1.4s
+/// stp q2, q3, addr
+///
+/// Example:
+/// st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
+/// is rewritten into
+/// zip1 v4.4s, v0.4s, v2.4s
+/// zip2 v5.4s, v0.4s, v2.4s
+/// zip1 v6.4s, v1.4s, v3.4s
+/// zip2 v7.4s, v1.4s, v3.4s
+/// zip1 v8.4s, v4.4s, v6.4s
+/// zip2 v9.4s, v4.4s, v6.4s
+/// zip1 v10.4s, v5.4s, v7.4s
+/// zip2 v11.4s, v5.4s, v7.4s
+/// stp q8, q9, addr
+/// stp q10, q11, addr+32
+/// Currently only instructions related to st2 and st4 are considered.
+/// Others may be added later.
+/// Return true if the SIMD instruction is modified.
+bool AArch64SIMDInstrOpt::optimizeLdStInterleave(MachineInstr &MI) {
+
+  unsigned SeqReg, AddrReg;
+  unsigned StReg[4], StRegKill[4];
+  MachineInstr *DefiningMI;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+  SmallVector<unsigned, MaxNumRepl> ZipDest;
+  SmallVector<const MCInstrDesc*, MaxNumRepl> ReplInstrMCID;
+
+  // If the current instruction matches any of the rewriting rules, then
+  // gather information about parameters of the new instructions.
+  bool Match = false;
+  for (auto &I : IRT) {
+    if (MI.getOpcode() == I.OrigOpc) {
+      SeqReg = MI.getOperand(0).getReg();
+      AddrReg = MI.getOperand(1).getReg();
+      DefiningMI = MRI->getUniqueVRegDef(SeqReg);
+      unsigned NumReg = determineSrcReg(MI);
+      if (!processSeqRegInst(DefiningMI, StReg, StRegKill, NumReg))
+        return false;
+
+      for (auto &Repl : I.ReplOpc) {
+        ReplInstrMCID.push_back(&TII->get(Repl));
+        // Generate destination registers but only for non-store instructions.
+        if (Repl != AArch64::STPQi && Repl != AArch64::STPDi)
+          ZipDest.push_back(MRI->createVirtualRegister(&I.RC));
+      }
+      Match = true;
+      break;
+    }
+  }
+
+  if (!Match)
+    return false;
+
+  // Determine if it is profitable to replace MI by the series of instructions
+  // represented in ReplInstrMCID.
+  if (!shouldReplaceInst(MI.getParent()->getParent(), &TII->get(MI.getOpcode()),
+                         ReplInstrMCID))
+    return false;
+
+  // Generate the replacement instructions composed of zip1, zip2, and stp (at
+  // this point, the code generation is hardcoded and does not rely on the IRT
+  // table used above given that code generation for the ST2 replacement is
+  // somewhat different from that for the ST4 replacement. We could have added
+  // more info into the table related to how we build new instructions but we
+  // may be adding more complexity with that).
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::ST2Twov16b:
+  case AArch64::ST2Twov8b:
+  case AArch64::ST2Twov8h:
+  case AArch64::ST2Twov4h:
+  case AArch64::ST2Twov4s:
+  case AArch64::ST2Twov2s:
+  case AArch64::ST2Twov2d:
+    // zip instructions
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
+        .addReg(StReg[0])
+        .addReg(StReg[1]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
+        .addReg(StReg[0], StRegKill[0])
+        .addReg(StReg[1], StRegKill[1]);
+    // stp instructions
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[1])
+        .addReg(AddrReg)
+        .addImm(0);
+    break;
+  case AArch64::ST4Fourv16b:
+  case AArch64::ST4Fourv8b:
+  case AArch64::ST4Fourv8h:
+  case AArch64::ST4Fourv4h:
+  case AArch64::ST4Fourv4s:
+  case AArch64::ST4Fourv2s:
+  case AArch64::ST4Fourv2d:
+    // zip instructions
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
+        .addReg(StReg[0])
+        .addReg(StReg[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
+        .addReg(StReg[0], StRegKill[0])
+        .addReg(StReg[2], StRegKill[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
+        .addReg(StReg[1])
+        .addReg(StReg[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
+        .addReg(StReg[1], StRegKill[1])
+        .addReg(StReg[3], StRegKill[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
+        .addReg(ZipDest[1])
+        .addReg(ZipDest[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
+        .addReg(ZipDest[1])
+        .addReg(ZipDest[3]);
+    // stp instructions
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
+        .addReg(ZipDest[4])
+        .addReg(ZipDest[5])
+        .addReg(AddrReg)
+        .addImm(0);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
+        .addReg(ZipDest[6])
+        .addReg(ZipDest[7])
+        .addReg(AddrReg)
+        .addImm(2);
+    break;
+  }
+
+  ++NumModifiedInstr;
+  return true;
+}
+
+/// Process the REG_SEQUENCE instruction, and extract the source
+/// operands of the st2/st4 instruction from it.
+/// Example of such an instruction:
+///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
+/// Return true when the instruction is processed successfully.
+bool AArch64SIMDInstrOpt::processSeqRegInst(MachineInstr *DefiningMI,
+     unsigned *StReg, unsigned *StRegKill, unsigned NumArg) const {
+  assert(DefiningMI != NULL);
+  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
+    return false;
+
+  for (unsigned i=0; i<NumArg; i++) {
+    StReg[i]     = DefiningMI->getOperand(2*i+1).getReg();
+    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2*i+1).isKill());
+
+    // Sanity check for the other arguments.
+    if (DefiningMI->getOperand(2*i+2).isImm()) {
+      switch (DefiningMI->getOperand(2*i+2).getImm()) {
+      default:
+        return false;
+      case AArch64::dsub0:
+      case AArch64::dsub1:
+      case AArch64::dsub2:
+      case AArch64::dsub3:
+      case AArch64::qsub0:
+      case AArch64::qsub1:
+      case AArch64::qsub2:
+      case AArch64::qsub3:
+        break;
+      }
+    }
+    else
+      return false;
+  }
+  return true;
+}
+
+/// Return the number of useful source registers for this instruction
+/// (2 for ST2 and 4 for ST4).
+unsigned AArch64SIMDInstrOpt::determineSrcReg(MachineInstr &MI) const {
+  switch (MI.getOpcode()) {
+  default:
+    llvm_unreachable("Unsupported instruction for this pass");
+  case AArch64::ST2Twov16b:
+  case AArch64::ST2Twov8b:
+  case AArch64::ST2Twov8h:
+  case AArch64::ST2Twov4h:
+  case AArch64::ST2Twov4s:
+  case AArch64::ST2Twov2s:
+  case AArch64::ST2Twov2d:
+    return 2;
+  case AArch64::ST4Fourv16b:
+  case AArch64::ST4Fourv8b:
+  case AArch64::ST4Fourv8h:
+  case AArch64::ST4Fourv4h:
+  case AArch64::ST4Fourv4s:
+  case AArch64::ST4Fourv2s:
+  case AArch64::ST4Fourv2d:
+    return 4;
+  }
+}
+
+bool AArch64SIMDInstrOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;

@@ -353,36 +699,38 @@
   if (!SchedModel.hasInstrSchedModel())
     return false;

-  // A simple check to exit this pass early for targets that do not need it.
-  if (earlyExitVectElement(&MF))
-    return false;
-
   bool Changed = false;
-  std::map<unsigned, bool> VecInstElemTable;
-  SmallVector<MachineInstr *, 8> RemoveMIs;
-
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
-         MII != MIE;) {
-      MachineInstr &MI = *MII;
-      if (optimizeVectElement(MI, &VecInstElemTable)) {
-        // Add MI to the list of instructions to be removed given that it has
-        // been replaced.
-        RemoveMIs.push_back(&MI);
-        Changed = true;
+  for (auto OptimizationKind : {VectorElem, Interleave}) {
+    if (!shouldExitEarly(&MF, OptimizationKind)) {
+      SmallVector<MachineInstr *, 8> RemoveMIs;
+      for (MachineBasicBlock &MBB : MF) {
+        for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+             MII != MIE;) {
+          MachineInstr &MI = *MII;
+          bool InstRewrite;
+          if (OptimizationKind == VectorElem)
+            InstRewrite = optimizeVectElement(MI);
+          else
+            InstRewrite = optimizeLdStInterleave(MI);
+          if (InstRewrite) {
+            // Add MI to the list of instructions to be removed given that it
+            // has been replaced.
+            RemoveMIs.push_back(&MI);
+            Changed = true;
+          }
+          ++MII;
+        }
      }
-      ++MII;
+      for (MachineInstr *MI : RemoveMIs)
+        MI->eraseFromParent();
    }
  }
-
-  for (MachineInstr *MI : RemoveMIs)
-    MI->eraseFromParent();
-
   return Changed;
 }

-/// createAArch64VectorByElementOptPass - returns an instance of the
+/// createAArch64SIMDInstrOptPass - returns an instance of the
 /// vector by element optimization pass.
-FunctionPass *llvm::createAArch64VectorByElementOptPass() {
-  return new AArch64VectorByElementOpt();
+FunctionPass *llvm::createAArch64SIMDInstrOptPass() {
+  return new AArch64SIMDInstrOpt();
 }
Index: llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
+++ llvm/test/CodeGen/AArch64/arm64-neon-2velem.ll
@@ -3147,3 +3147,24 @@
   %s = fsub <4 x float> %0, %1
   ret <4 x float> %s
 }
+
+define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_a57(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="cortex-a57" {
+; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_a57:
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.s[1]
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
+
+define <2 x float> @test_vfma_lane_simdinstr_opt_pass_caching_m1(<2 x float> %a, <2 x float> %b, <2 x float> %v) "target-cpu"="exynos-m1" {
+; CHECK-LABEL: test_vfma_lane_simdinstr_opt_pass_caching_m1:
+; CHECK: dup [[x:v[0-9]+]].2s, {{v[0-9]+}}.s[1]
+; CHECK: fmla {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, [[x]].2s
+; CHECK-NEXT: ret
+entry:
+  %lane = shufflevector <2 x float> %v, <2 x float> undef, <2 x i32> <i32 1, i32 1>
+  %0 = tail call <2 x float> @llvm.fma.v2f32(<2 x float> %lane, <2 x float> %b, <2 x float> %a)
+  ret <2 x float> %0
+}
Index: llvm/test/CodeGen/AArch64/arm64-st1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; The instruction latencies of Exynos-M1 trigger the transform checked under the EXYNOS prefix.
define void @st1lane_16b(<16 x i8> %A, i8* %D) { ; CHECK-LABEL: st1lane_16b @@ -375,6 +377,10 @@ define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind { ; CHECK-LABEL: st2_8b ; CHECK: st2.8b +; EXYNOS-LABEL: st2_8b +; EXYNOS: zip1.8b +; EXYNOS: zip2.8b +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P) ret void } @@ -389,6 +395,17 @@ define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind { ; CHECK-LABEL: st4_8b ; CHECK: st4.8b +; EXYNOS-LABEL: st4_8b +; EXYNOS: zip1.8b +; EXYNOS: zip2.8b +; EXYNOS: zip1.8b +; EXYNOS: zip2.8b +; EXYNOS: zip1.8b +; EXYNOS: zip2.8b +; EXYNOS: stp +; EXYNOS: zip1.8b +; EXYNOS: zip2.8b +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) ret void } @@ -400,6 +417,10 @@ define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind { ; CHECK-LABEL: st2_16b ; CHECK: st2.16b +; EXYNOS-LABEL: st2_16b +; EXYNOS: zip1.16b +; EXYNOS: zip2.16b +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P) ret void } @@ -414,6 +435,17 @@ define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind { ; CHECK-LABEL: st4_16b ; CHECK: st4.16b +; EXYNOS-LABEL: st4_16b +; EXYNOS: zip1.16b +; EXYNOS: zip2.16b +; EXYNOS: zip1.16b +; EXYNOS: zip2.16b +; EXYNOS: zip1.16b +; EXYNOS: zip2.16b +; EXYNOS: stp +; EXYNOS: zip1.16b +; EXYNOS: zip2.16b +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) ret void } @@ -425,6 +457,10 @@ define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind { ; CHECK-LABEL: st2_4h ; CHECK: st2.4h +; EXYNOS-LABEL: st2_4h +; EXYNOS: zip1.4h +; EXYNOS: zip2.4h +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P) ret void } @@ -439,6 +475,17 @@ define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind { ; CHECK-LABEL: st4_4h ; CHECK: st4.4h +; EXYNOS-LABEL: st4_4h +; EXYNOS: zip1.4h +; EXYNOS: zip2.4h +; EXYNOS: zip1.4h +; EXYNOS: zip2.4h +; EXYNOS: zip1.4h +; EXYNOS: zip2.4h +; EXYNOS: stp +; EXYNOS: zip1.4h +; EXYNOS: zip2.4h +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) ret void } @@ -450,6 +497,10 @@ define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind { ; CHECK-LABEL: st2_8h ; CHECK: st2.8h +; EXYNOS-LABEL: st2_8h +; EXYNOS: zip1.8h +; EXYNOS: zip2.8h +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P) ret void } @@ -464,6 +515,17 @@ define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind { ; CHECK-LABEL: st4_8h ; CHECK: st4.8h +; EXYNOS-LABEL: st4_8h +; EXYNOS: zip1.8h +; EXYNOS: zip2.8h +; EXYNOS: zip1.8h +; EXYNOS: zip2.8h +; EXYNOS: zip1.8h +; EXYNOS: zip2.8h +; EXYNOS: stp +; EXYNOS: zip1.8h +; EXYNOS: zip2.8h +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) ret void } @@ -475,6 +537,10 @@ define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind { ; CHECK-LABEL: st2_2s ; CHECK: st2.2s +; EXYNOS-LABEL: st2_2s +; EXYNOS: zip1.2s +; EXYNOS: zip2.2s +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P) ret void } @@ -489,6 +555,17 @@ define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x 
i32> %C, <2 x i32> %D, i32* %P) nounwind { ; CHECK-LABEL: st4_2s ; CHECK: st4.2s +; EXYNOS-LABEL: st4_2s +; EXYNOS: zip1.2s +; EXYNOS: zip2.2s +; EXYNOS: zip1.2s +; EXYNOS: zip2.2s +; EXYNOS: zip1.2s +; EXYNOS: zip2.2s +; EXYNOS: stp +; EXYNOS: zip1.2s +; EXYNOS: zip2.2s +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) ret void } @@ -500,6 +577,10 @@ define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind { ; CHECK-LABEL: st2_4s ; CHECK: st2.4s +; EXYNOS-LABEL: st2_4s +; EXYNOS: zip1.4s +; EXYNOS: zip2.4s +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P) ret void } @@ -514,6 +595,17 @@ define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind { ; CHECK-LABEL: st4_4s ; CHECK: st4.4s +; EXYNOS-LABEL: st4_4s +; EXYNOS: zip1.4s +; EXYNOS: zip2.4s +; EXYNOS: zip1.4s +; EXYNOS: zip2.4s +; EXYNOS: zip1.4s +; EXYNOS: zip2.4s +; EXYNOS: stp +; EXYNOS: zip1.4s +; EXYNOS: zip2.4s +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) ret void } @@ -551,6 +643,10 @@ define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind { ; CHECK-LABEL: st2_2d ; CHECK: st2.2d +; EXYNOS-LABEL: st2_2d +; EXYNOS: zip1.2d +; EXYNOS: zip2.2d +; EXYNOS: stp call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P) ret void } @@ -565,6 +661,17 @@ define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind { ; CHECK-LABEL: st4_2d ; CHECK: st4.2d +; EXYNOS-LABEL: st4_2d +; EXYNOS: zip1.2d +; EXYNOS: zip2.2d +; EXYNOS: zip1.2d +; EXYNOS: zip2.2d +; EXYNOS: zip1.2d +; EXYNOS: zip2.2d +; EXYNOS: stp +; EXYNOS: zip1.2d +; EXYNOS: zip2.2d +; EXYNOS: stp call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) ret void }
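
For local experimentation, the interleaved-store rewrite can also be exercised with a minimal standalone lit test; the sketch below assumes this patch is applied and simply restates the st2_4s case from arm64-st1.ll above (the function name @st2_4s_standalone is made up for illustration).

; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m1 | FileCheck %s

declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)

; The st2 intrinsic should be lowered to zip1/zip2 plus a store pair on Exynos-M1.
; CHECK-LABEL: st2_4s_standalone
; CHECK: zip1.4s
; CHECK: zip2.4s
; CHECK: stp
define void @st2_4s_standalone(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
  call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
  ret void
}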