Index: llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
+++ llvm/lib/Target/AArch64/AArch64VectorByElementOpt.cpp
@@ -7,19 +7,27 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// This file contains a pass that performs optimization for vector by element
-// SIMD instructions.
-//
-// Certain SIMD instructions with vector element operand are not efficient.
-// Rewrite them into SIMD instructions with vector operands. This rewrite
-// is driven by the latency of the instructions.
+// This file contains a pass that performs optimization on SIMD instructions
+// with high latency by replacing them with more efficient sequences of
+// instructions.
+//
+// 1. Rewrite certain SIMD instructions with a vector element operand, which
+//    are inefficient on some targets.
 // Example:
 //    fmla v0.4s, v1.4s, v2.s[1]
 //    is rewritten into
 //    dup v3.4s, v2.s[1]
 //    fmla v0.4s, v1.4s, v3.4s
 //
+// 2. Rewrite interleaved memory access instructions, which are inefficient
+//    on some targets.
+// Example:
+//    st2 {v0.4s, v1.4s}, addr
+//    is rewritten into
+//    zip1 v2.4s, v0.4s, v1.4s
+//    zip2 v3.4s, v0.4s, v1.4s
+//    stp  q2, q3, addr
+//
 //===----------------------------------------------------------------------===//

 #include "AArch64InstrInfo.h"
@@ -60,18 +68,31 @@
   MachineRegisterInfo *MRI;
   TargetSchedModel SchedModel;

+  // The two sub-passes implemented by this pass.
+  typedef enum {
+    FMLA,
+    Interleave
+  } Subpass;
+
+  // Number of source vectors accessed by an interleaved store.
+  typedef enum {
+    TwoVectors,
+    FourVectors
+  } AccessType;
+
+  // Maximum number of replacement instructions.
+  static const unsigned MaxNumRepl = 10;
+
   AArch64VectorByElementOpt() : MachineFunctionPass(ID) {
     initializeAArch64VectorByElementOptPass(*PassRegistry::getPassRegistry());
   }

   /// Based only on latency of instructions, determine if it is cost efficient
-  /// to replace the instruction InstDesc by the two instructions InstDescRep1
-  /// and InstDescRep2.
+  /// to replace the instruction InstDesc by the NumRepl instructions stored
+  /// in the array InstDescRepl.
   /// Return true if replacement is recommended.
   bool shouldReplaceInstruction(MachineFunction *MF,
                                 const MCInstrDesc *InstDesc,
-                                const MCInstrDesc *InstDescRep1,
-                                const MCInstrDesc *InstDescRep2,
+                                const MCInstrDesc *InstDescRepl[],
+                                unsigned NumRepl,
                                 std::map<unsigned, bool> &VecInstElemTable) const;

   /// Determine if we need to exit the vector by element instruction
@@ -83,7 +104,7 @@
   /// as FMLS as we assume that if the situation shows up for one
   /// instruction, then it is likely to show up for the related ones.
   /// Return true if early exit of the pass is recommended.
-  bool earlyExitVectElement(MachineFunction *MF);
+  bool earlyExitVectElement(MachineFunction *MF, Subpass SP);

   /// Check whether an equivalent DUP instruction has already been
   /// created or not.
@@ -99,6 +120,26 @@
   bool optimizeVectElement(MachineInstr &MI,
                            std::map<unsigned, bool> *VecInstElemTable) const;

+  /// Process the REG_SEQUENCE instruction and extract the source operands of
+  /// the st2/st4 instruction from it.
+  /// Example of such an instruction:
+  ///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
+  /// Return true when the instruction is processed successfully.
+  bool processSeqRegInst(MachineInstr *DefiningMI, unsigned *StReg,
+                         unsigned *StRegKill, unsigned NumArg) const;
+
+  /// Prepare the parameters needed to build the replacement instructions.
+  void prepareStmtParam(unsigned Opcode1, unsigned Opcode2,
+                        const TargetRegisterClass *RC,
+                        const MCInstrDesc *ReplInstrMCID[],
+                        unsigned ZipDest[], AccessType AccT) const;
+
+  /// Load/store interleaving instructions are not always beneficial.
+  /// Replace them by zip instructions and classical load/store.
+  /// Return true if the SIMD instruction is modified.
+  bool optimizeLdStInterleave(
+      MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const;
+
   bool runOnMachineFunction(MachineFunction &Fn) override;

   StringRef getPassName() const override {
@@ -114,14 +155,12 @@
                     AARCH64_VECTOR_BY_ELEMENT_OPT_NAME, false, false)

 /// Based only on latency of instructions, determine if it is cost efficient
-/// to replace the instruction InstDesc by the two instructions InstDescRep1
-/// and InstDescRep2. Note that it is assumed in this fuction that an
-/// instruction of type InstDesc is always replaced by the same two
-/// instructions as results are cached here.
+/// to replace the instruction InstDesc by the NumRepl instructions stored in
+/// the array InstDescRepl.
 /// Return true if replacement is recommended.
 bool AArch64VectorByElementOpt::shouldReplaceInstruction(
     MachineFunction *MF, const MCInstrDesc *InstDesc,
-    const MCInstrDesc *InstDescRep1, const MCInstrDesc *InstDescRep2,
+    const MCInstrDesc *InstDescRepl[], unsigned NumRepl,
     std::map<unsigned, bool> &VecInstElemTable) const {
   // Check if replacment decision is alredy available in the cached table.
   // if so, return it.
@@ -130,52 +169,86 @@
     return VecInstElemTable[InstDesc->getOpcode()];

   unsigned SCIdx = InstDesc->getSchedClass();
-  unsigned SCIdxRep1 = InstDescRep1->getSchedClass();
-  unsigned SCIdxRep2 = InstDescRep2->getSchedClass();
+  unsigned SCIdxRepl[MaxNumRepl];
+  for (unsigned i = 0; i < NumRepl; i++)
+    SCIdxRepl[i] = InstDescRepl[i]->getSchedClass();
+
   const MCSchedClassDesc *SCDesc =
       SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdx);
-  const MCSchedClassDesc *SCDescRep1 =
-      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep1);
-  const MCSchedClassDesc *SCDescRep2 =
-      SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRep2);
+  const MCSchedClassDesc *SCDescRepl[MaxNumRepl];
+  for (unsigned i = 0; i < NumRepl; i++)
+    SCDescRepl[i] =
+        SchedModel.getMCSchedModel()->getSchedClassDesc(SCIdxRepl[i]);

   // If a subtarget does not define resources for any of the instructions
   // of interest, then return false for no replacement.
-  if (!SCDesc->isValid() || SCDesc->isVariant() || !SCDescRep1->isValid() ||
-      SCDescRep1->isVariant() || !SCDescRep2->isValid() ||
-      SCDescRep2->isVariant()) {
+  if (!SCDesc->isValid() || SCDesc->isVariant()) {
     VecInstElemTable[InstDesc->getOpcode()] = false;
     return false;
   }
+  for (unsigned i = 0; i < NumRepl; i++)
+    if (!SCDescRepl[i]->isValid() || SCDescRepl[i]->isVariant()) {
+      VecInstElemTable[InstDesc->getOpcode()] = false;
+      return false;
+    }
+
+  // Compute the total latency of the replacement instructions.
+  unsigned ReplCost = 0;
+  for (unsigned i = 0; i < NumRepl; i++)
+    ReplCost += SchedModel.computeInstrLatency(InstDescRepl[i]->getOpcode());

-  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) >
-      SchedModel.computeInstrLatency(InstDescRep1->getOpcode()) +
-          SchedModel.computeInstrLatency(InstDescRep2->getOpcode())) {
+  if (SchedModel.computeInstrLatency(InstDesc->getOpcode()) > ReplCost) {
     VecInstElemTable[InstDesc->getOpcode()] = true;
     return true;
   }
   VecInstElemTable[InstDesc->getOpcode()] = false;
   return false;
 }

-/// Determine if we need to exit the vector by element instruction
-/// optimization pass early. This makes sure that Targets with no need
-/// for this optimization do not spent any compile time on this pass.
-/// This check is done by comparing the latency of an indexed FMLA
-/// instruction to the latency of the DUP + the latency of a vector
-/// FMLA instruction. We do not check on other related instructions such
-/// as FMLS as we assume that if the situation shows up for one
-/// instruction, then it is likely to show up for the related ones.
-/// Return true if early exit of the pass is recommended.
-bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF) {
+/// Determine if we need to exit the instruction replacement optimization
+/// subpass early. This makes sure that targets with no need for this
+/// optimization spend no compile time on this subpass beyond the simple
+/// check performed below.
+/// This check is done by comparing the latency of the original instruction
+/// to the latency of the replacement instructions. We only check a
+/// representative instruction in a class of instructions (e.g. the FMLA
+/// instruction) and not all related instructions (e.g. the FMLS instruction).
+/// Return true if early exit of the subpass is recommended.
+bool AArch64VectorByElementOpt::earlyExitVectElement(MachineFunction *MF,
+                                                     Subpass SP) {
+  assert(SP == FMLA || SP == Interleave);
   std::map<unsigned, bool> VecInstElemTable;
-  const MCInstrDesc *IndexMulMCID = &TII->get(AArch64::FMLAv4i32_indexed);
-  const MCInstrDesc *DupMCID = &TII->get(AArch64::DUPv4i32lane);
-  const MCInstrDesc *MulMCID = &TII->get(AArch64::FMULv4f32);
+  const MCInstrDesc *OriginalMCID;
+
+  if (SP == FMLA) {
+    OriginalMCID = &TII->get(AArch64::FMLAv4i32_indexed);
+    const MCInstrDesc *ReplInstrMCID[2] = {&TII->get(AArch64::DUPv4i32lane),
+                                           &TII->get(AArch64::FMULv4f32)};
+    if (!shouldReplaceInstruction(MF, OriginalMCID, ReplInstrMCID, 2,
+                                  VecInstElemTable))
+      return true;
+  } else { // SP == Interleave
+    OriginalMCID = &TII->get(AArch64::ST2Twov4s);
+    const MCInstrDesc *ReplInstrMCID[3] = {&TII->get(AArch64::ZIP1v4i32),
+                                           &TII->get(AArch64::ZIP2v4i32),
+                                           &TII->get(AArch64::STPQi)};
+    if (!shouldReplaceInstruction(MF, OriginalMCID, ReplInstrMCID, 3,
+                                  VecInstElemTable))
+      return true;
+  }

-  if (!shouldReplaceInstruction(MF, IndexMulMCID, DupMCID, MulMCID,
-                                VecInstElemTable))
-    return true;
   return false;
 }

@@ -283,8 +356,11 @@
     break;
   }

+  const MCInstrDesc *ReplInstrMCID[2];
+  ReplInstrMCID[0] = DupMCID;
+  ReplInstrMCID[1] = MulMCID;
   if (!shouldReplaceInstruction(MI.getParent()->getParent(),
-                                &TII->get(MI.getOpcode()), DupMCID, MulMCID,
+                                &TII->get(MI.getOpcode()), ReplInstrMCID, 2,
                                 *VecInstElemTable))
     return false;

@@ -338,6 +414,320 @@
   return true;
 }

+/// Load/store interleaving instructions are not always beneficial.
+/// Replace them by zip instructions and classical load/store.
+///
+/// Example:
+///   st2 {v0.4s, v1.4s}, addr
+///   is rewritten into
+///   zip1 v2.4s, v0.4s, v1.4s
+///   zip2 v3.4s, v0.4s, v1.4s
+///   stp  q2, q3, addr
+///
+/// Example:
+///   st4 {v0.4s, v1.4s, v2.4s, v3.4s}, addr
+///   is rewritten into
+///   zip1 v4.4s, v0.4s, v2.4s
+///   zip2 v5.4s, v0.4s, v2.4s
+///   zip1 v6.4s, v1.4s, v3.4s
+///   zip2 v7.4s, v1.4s, v3.4s
+///   zip1 v8.4s, v4.4s, v6.4s
+///   zip2 v9.4s, v4.4s, v6.4s
+///   zip1 v10.4s, v5.4s, v7.4s
+///   zip2 v11.4s, v5.4s, v7.4s
+///   stp  q8, q9, addr
+///   stp  q10, q11, addr+32
+///
+/// Currently only instructions related to st2 and st4 are considered.
+/// Others may be added later.
+/// Return true if the SIMD instruction is modified.
+bool AArch64VectorByElementOpt::optimizeLdStInterleave(
+    MachineInstr &MI, std::map<unsigned, bool> *VecInstElemTable) const {
+
+  unsigned SeqReg, AddrReg;
+  unsigned StReg[4], StRegKill[4];
+  unsigned ZipDest[MaxNumRepl];
+  const MCInstrDesc *ReplInstrMCID[MaxNumRepl];
+  unsigned NumReplInst;
+  const TargetRegisterClass *RC128 = &AArch64::FPR128RegClass;
+  const TargetRegisterClass *RC64 = &AArch64::FPR64RegClass;
+  MachineInstr *DefiningMI;
+  const DebugLoc &DL = MI.getDebugLoc();
+  MachineBasicBlock &MBB = *MI.getParent();
+
+  // Common code among all instructions.
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  case AArch64::ST2Twov16b:
+  case AArch64::ST2Twov8b:
+  case AArch64::ST2Twov8h:
+  case AArch64::ST2Twov4h:
+  case AArch64::ST2Twov4s:
+  case AArch64::ST2Twov2s:
+  case AArch64::ST2Twov2d:
+    SeqReg = MI.getOperand(0).getReg();
+    AddrReg = MI.getOperand(1).getReg();
+    DefiningMI = MRI->getUniqueVRegDef(SeqReg);
+    NumReplInst = 3;
+    if (!processSeqRegInst(DefiningMI, StReg, StRegKill, 2))
+      return false;
+    break;
+
+  case AArch64::ST4Fourv16b:
+  case AArch64::ST4Fourv8b:
+  case AArch64::ST4Fourv8h:
+  case AArch64::ST4Fourv4h:
+  case AArch64::ST4Fourv4s:
+  case AArch64::ST4Fourv2s:
+  case AArch64::ST4Fourv2d:
+    SeqReg = MI.getOperand(0).getReg();
+    AddrReg = MI.getOperand(1).getReg();
+    DefiningMI = MRI->getUniqueVRegDef(SeqReg);
+    NumReplInst = 10;
+    if (!processSeqRegInst(DefiningMI, StReg, StRegKill, 4))
+      return false;
+    break;
+  }
+
+  // Specialized code per opcode. NumReplInst is already set above.
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  // ST2 cases.
+  case AArch64::ST2Twov2d:
+    prepareStmtParam(AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, RC128,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov4s:
+    prepareStmtParam(AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, RC128,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov2s:
+    prepareStmtParam(AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, RC64,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov8h:
+    prepareStmtParam(AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, RC128,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov4h:
+    prepareStmtParam(AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, RC64,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov16b:
+    prepareStmtParam(AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, RC128,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+  case AArch64::ST2Twov8b:
+    prepareStmtParam(AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, RC64,
+                     ReplInstrMCID, ZipDest, TwoVectors);
+    break;
+
+  // ST4 cases.
+  case AArch64::ST4Fourv2d:
+    prepareStmtParam(AArch64::ZIP1v2i64, AArch64::ZIP2v2i64, RC128,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv4s:
+    prepareStmtParam(AArch64::ZIP1v4i32, AArch64::ZIP2v4i32, RC128,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv2s:
+    prepareStmtParam(AArch64::ZIP1v2i32, AArch64::ZIP2v2i32, RC64,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv8h:
+    prepareStmtParam(AArch64::ZIP1v8i16, AArch64::ZIP2v8i16, RC128,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv4h:
+    prepareStmtParam(AArch64::ZIP1v4i16, AArch64::ZIP2v4i16, RC64,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv16b:
+    prepareStmtParam(AArch64::ZIP1v16i8, AArch64::ZIP2v16i8, RC128,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  case AArch64::ST4Fourv8b:
+    prepareStmtParam(AArch64::ZIP1v8i8, AArch64::ZIP2v8i8, RC64,
+                     ReplInstrMCID, ZipDest, FourVectors);
+    break;
+  }
+
+  if (!shouldReplaceInstruction(MI.getParent()->getParent(),
+                                &TII->get(MI.getOpcode()), ReplInstrMCID,
+                                NumReplInst, *VecInstElemTable))
+    return false;
+
+  // Generate the replacement instructions composed of zip1, zip2, and stp.
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+
+  case AArch64::ST2Twov16b:
+  case AArch64::ST2Twov8b:
+  case AArch64::ST2Twov8h:
+  case AArch64::ST2Twov4h:
+  case AArch64::ST2Twov4s:
+  case AArch64::ST2Twov2s:
+  case AArch64::ST2Twov2d:
+    // zip instructions.
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
+        .addReg(StReg[0])
+        .addReg(StReg[1]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
+        .addReg(StReg[0], StRegKill[0])
+        .addReg(StReg[1], StRegKill[1]);
+    // stp instruction.
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[2])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[1])
+        .addReg(AddrReg)
+        .addImm(0);
+    break;
+
+  case AArch64::ST4Fourv16b:
+  case AArch64::ST4Fourv8b:
+  case AArch64::ST4Fourv8h:
+  case AArch64::ST4Fourv4h:
+  case AArch64::ST4Fourv4s:
+  case AArch64::ST4Fourv2s:
+  case AArch64::ST4Fourv2d:
+    // zip instructions.
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[0], ZipDest[0])
+        .addReg(StReg[0])
+        .addReg(StReg[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[1], ZipDest[1])
+        .addReg(StReg[0], StRegKill[0])
+        .addReg(StReg[2], StRegKill[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[2], ZipDest[2])
+        .addReg(StReg[1])
+        .addReg(StReg[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[3], ZipDest[3])
+        .addReg(StReg[1], StRegKill[1])
+        .addReg(StReg[3], StRegKill[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[4], ZipDest[4])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[5], ZipDest[5])
+        .addReg(ZipDest[0])
+        .addReg(ZipDest[2]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[6], ZipDest[6])
+        .addReg(ZipDest[1])
+        .addReg(ZipDest[3]);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[7], ZipDest[7])
+        .addReg(ZipDest[1])
+        .addReg(ZipDest[3]);
+    // stp instructions.
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[8])
+        .addReg(ZipDest[4])
+        .addReg(ZipDest[5])
+        .addReg(AddrReg)
+        .addImm(0);
+    BuildMI(MBB, MI, DL, *ReplInstrMCID[9])
+        .addReg(ZipDest[6])
+        .addReg(ZipDest[7])
+        .addReg(AddrReg)
+        .addImm(2);
+    break;
+  }
+
+  ++NumModifiedInstr;
+  return true;
+}
+
+/// Process the REG_SEQUENCE instruction and extract the source operands of
+/// the st2/st4 instruction from it.
+/// Example of such an instruction:
+///   %dest = REG_SEQUENCE %st2_src1, dsub0, %st2_src2, dsub1;
+/// Return true when the instruction is processed successfully.
+bool AArch64VectorByElementOpt::processSeqRegInst(MachineInstr *DefiningMI,
+                                                  unsigned *StReg,
+                                                  unsigned *StRegKill,
+                                                  unsigned NumArg) const {
+  assert(DefiningMI != nullptr);
+  if (DefiningMI->getOpcode() != AArch64::REG_SEQUENCE)
+    return false;
+
+  for (unsigned i = 0; i < NumArg; i++) {
+    StReg[i] = DefiningMI->getOperand(2 * i + 1).getReg();
+    StRegKill[i] = getKillRegState(DefiningMI->getOperand(2 * i + 1).isKill());
+
+    // Sanity check for the other arguments.
+    if (!DefiningMI->getOperand(2 * i + 2).isImm())
+      return false;
+    switch (DefiningMI->getOperand(2 * i + 2).getImm()) {
+    default:
+      return false;
+    case AArch64::dsub0:
+    case AArch64::dsub1:
+    case AArch64::dsub2:
+    case AArch64::dsub3:
+    case AArch64::qsub0:
+    case AArch64::qsub1:
+    case AArch64::qsub2:
+    case AArch64::qsub3:
+      break;
+    }
+  }
+  return true;
+}
+
+/// Prepare the parameters needed to build the replacement instructions.
+void AArch64VectorByElementOpt::prepareStmtParam(
+    unsigned Opcode1, unsigned Opcode2, const TargetRegisterClass *RC,
+    const MCInstrDesc *ReplInstrMCID[], unsigned ZipDest[],
+    AccessType AccT) const {
+  assert(AccT == TwoVectors || AccT == FourVectors);
+  if (AccT == TwoVectors) {
+    ReplInstrMCID[0] = &TII->get(Opcode1);
+    ZipDest[0] = MRI->createVirtualRegister(RC);
+    ReplInstrMCID[1] = &TII->get(Opcode2);
+    ZipDest[1] = MRI->createVirtualRegister(RC);
+    if (RC == &AArch64::FPR128RegClass)
+      ReplInstrMCID[2] = &TII->get(AArch64::STPQi);
+    else
+      ReplInstrMCID[2] = &TII->get(AArch64::STPDi);
+  } else { // AccT == FourVectors
+    for (int i = 0; i < 4; i++) {
+      ReplInstrMCID[2 * i] = &TII->get(Opcode1);
+      ZipDest[2 * i] = MRI->createVirtualRegister(RC);
+      ReplInstrMCID[2 * i + 1] = &TII->get(Opcode2);
+      ZipDest[2 * i + 1] = MRI->createVirtualRegister(RC);
+    }
+    if (RC == &AArch64::FPR128RegClass) {
+      ReplInstrMCID[8] = &TII->get(AArch64::STPQi);
+      ReplInstrMCID[9] = &TII->get(AArch64::STPQi);
+    } else {
+      ReplInstrMCID[8] = &TII->get(AArch64::STPDi);
+      ReplInstrMCID[9] = &TII->get(AArch64::STPDi);
+    }
+  }
+}
+
 bool AArch64VectorByElementOpt::runOnMachineFunction(MachineFunction &MF) {
   if (skipFunction(*MF.getFunction()))
     return false;
@@ -353,30 +743,53 @@
   if (!SchedModel.hasInstrSchedModel())
     return false;

-  // A simple check to exit this pass early for targets that do not need it.
-  if (earlyExitVectElement(&MF))
-    return false;
-
   bool Changed = false;
-  std::map<unsigned, bool> VecInstElemTable;
-  SmallVector<MachineInstr *, 8> RemoveMIs;
-
-  for (MachineBasicBlock &MBB : MF) {
-    for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
-         MII != MIE;) {
-      MachineInstr &MI = *MII;
-      if (optimizeVectElement(MI, &VecInstElemTable)) {
-        // Add MI to the list of instructions to be removed given that it has
-        // been replaced.
-        RemoveMIs.push_back(&MI);
-        Changed = true;
+
+  // A simple check to exit the vector-by-element sub-pass early for targets
+  // that do not need it.
+  if (!earlyExitVectElement(&MF, FMLA)) {
+    std::map<unsigned, bool> VecInstElemTable;
+    SmallVector<MachineInstr *, 8> RemoveMIs;
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+           MII != MIE;) {
+        MachineInstr &MI = *MII;
+        if (optimizeVectElement(MI, &VecInstElemTable)) {
+          // Add MI to the list of instructions to be removed given that it
+          // has been replaced.
+          RemoveMIs.push_back(&MI);
+          Changed = true;
+        }
+        ++MII;
       }
-      ++MII;
     }
-  }
-
-  for (MachineInstr *MI : RemoveMIs)
-    MI->eraseFromParent();
+    for (MachineInstr *MI : RemoveMIs)
+      MI->eraseFromParent();
+  }
+
+  // Another simple check to exit the interleaved-store sub-pass early for
+  // targets that do not need it.
+  if (!earlyExitVectElement(&MF, Interleave)) {
+    std::map<unsigned, bool> VecInstElemTable;
+    SmallVector<MachineInstr *, 8> RemoveMIs;
+    for (MachineBasicBlock &MBB : MF) {
+      for (MachineBasicBlock::iterator MII = MBB.begin(), MIE = MBB.end();
+           MII != MIE;) {
+        MachineInstr &MI = *MII;
+        if (optimizeLdStInterleave(MI, &VecInstElemTable)) {
+          // Add MI to the list of instructions to be removed given that it
+          // has been replaced.
+          RemoveMIs.push_back(&MI);
+          Changed = true;
+        }
+        ++MII;
+      }
+    }
+    for (MachineInstr *MI : RemoveMIs)
+      MI->eraseFromParent();
+  }

   return Changed;
 }
Index: llvm/test/CodeGen/AArch64/arm64-st1.ll
===================================================================
--- llvm/test/CodeGen/AArch64/arm64-st1.ll
+++ llvm/test/CodeGen/AArch64/arm64-st1.ll
@@ -1,4 +1,6 @@
 ; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m1 | FileCheck --check-prefix=EXYNOS %s
+; The instruction latencies of Exynos-M1 trigger the transform we see under the Exynos check.

 define void @st1lane_16b(<16 x i8> %A, i8* %D) {
 ; CHECK-LABEL: st1lane_16b
@@ -375,6 +377,10 @@
 define void @st2_8b(<8 x i8> %A, <8 x i8> %B, i8* %P) nounwind {
 ; CHECK-LABEL: st2_8b
 ; CHECK: st2.8b
+; EXYNOS-LABEL: st2_8b
+; EXYNOS: zip1.8b
+; EXYNOS: zip2.8b
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, i8* %P)
   ret void
 }
@@ -389,6 +395,17 @@
 define void @st4_8b(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P) nounwind {
 ; CHECK-LABEL: st4_8b
 ; CHECK: st4.8b
+; EXYNOS-LABEL: st4_8b
+; EXYNOS: zip1.8b
+; EXYNOS: zip2.8b
+; EXYNOS: zip1.8b
+; EXYNOS: zip2.8b
+; EXYNOS: zip1.8b
+; EXYNOS: zip2.8b
+; EXYNOS: stp
+; EXYNOS: zip1.8b
+; EXYNOS: zip2.8b
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v8i8.p0i8(<8 x i8> %A, <8 x i8> %B, <8 x i8> %C, <8 x i8> %D, i8* %P)
   ret void
 }
@@ -400,6 +417,10 @@
 define void @st2_16b(<16 x i8> %A, <16 x i8> %B, i8* %P) nounwind {
 ; CHECK-LABEL: st2_16b
 ; CHECK: st2.16b
+; EXYNOS-LABEL: st2_16b
+; EXYNOS: zip1.16b
+; EXYNOS: zip2.16b
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, i8* %P)
   ret void
 }
@@ -414,6 +435,17 @@
 define void @st4_16b(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P) nounwind {
 ; CHECK-LABEL: st4_16b
 ; CHECK: st4.16b
+; EXYNOS-LABEL: st4_16b
+; EXYNOS: zip1.16b
+; EXYNOS: zip2.16b
+; EXYNOS: zip1.16b
+; EXYNOS: zip2.16b
+; EXYNOS: zip1.16b
+; EXYNOS: zip2.16b
+; EXYNOS: stp
+; EXYNOS: zip1.16b
+; EXYNOS: zip2.16b
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v16i8.p0i8(<16 x i8> %A, <16 x i8> %B, <16 x i8> %C, <16 x i8> %D, i8* %P)
   ret void
 }
@@ -425,6 +457,10 @@
 define void @st2_4h(<4 x i16> %A, <4 x i16> %B, i16* %P) nounwind {
 ; CHECK-LABEL: st2_4h
 ; CHECK: st2.4h
+; EXYNOS-LABEL: st2_4h
+; EXYNOS: zip1.4h
+; EXYNOS: zip2.4h
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, i16* %P)
   ret void
 }
@@ -439,6 +475,17 @@
 define void @st4_4h(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P) nounwind {
 ; CHECK-LABEL: st4_4h
 ; CHECK: st4.4h
+; EXYNOS-LABEL: st4_4h
+; EXYNOS: zip1.4h
+; EXYNOS: zip2.4h
+; EXYNOS: zip1.4h
+; EXYNOS: zip2.4h
+; EXYNOS: zip1.4h
+; EXYNOS: zip2.4h
+; EXYNOS: stp
+; EXYNOS: zip1.4h
+; EXYNOS: zip2.4h
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v4i16.p0i16(<4 x i16> %A, <4 x i16> %B, <4 x i16> %C, <4 x i16> %D, i16* %P)
   ret void
 }
@@ -450,6 +497,10 @@
 define void @st2_8h(<8 x i16> %A, <8 x i16> %B, i16* %P) nounwind {
 ; CHECK-LABEL: st2_8h
 ; CHECK: st2.8h
+; EXYNOS-LABEL: st2_8h
+; EXYNOS: zip1.8h
+; EXYNOS: zip2.8h
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, i16* %P)
   ret void
 }
@@ -464,6 +515,17 @@
 define void @st4_8h(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P) nounwind {
 ; CHECK-LABEL: st4_8h
 ; CHECK: st4.8h
+; EXYNOS-LABEL: st4_8h
+; EXYNOS: zip1.8h
+; EXYNOS: zip2.8h
+; EXYNOS: zip1.8h
+; EXYNOS: zip2.8h
+; EXYNOS: zip1.8h
+; EXYNOS: zip2.8h
+; EXYNOS: stp
+; EXYNOS: zip1.8h
+; EXYNOS: zip2.8h
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v8i16.p0i16(<8 x i16> %A, <8 x i16> %B, <8 x i16> %C, <8 x i16> %D, i16* %P)
   ret void
 }
@@ -475,6 +537,10 @@
 define void @st2_2s(<2 x i32> %A, <2 x i32> %B, i32* %P) nounwind {
 ; CHECK-LABEL: st2_2s
 ; CHECK: st2.2s
+; EXYNOS-LABEL: st2_2s
+; EXYNOS: zip1.2s
+; EXYNOS: zip2.2s
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, i32* %P)
   ret void
 }
@@ -489,6 +555,17 @@
 define void @st4_2s(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P) nounwind {
 ; CHECK-LABEL: st4_2s
 ; CHECK: st4.2s
+; EXYNOS-LABEL: st4_2s
+; EXYNOS: zip1.2s
+; EXYNOS: zip2.2s
+; EXYNOS: zip1.2s
+; EXYNOS: zip2.2s
+; EXYNOS: zip1.2s
+; EXYNOS: zip2.2s
+; EXYNOS: stp
+; EXYNOS: zip1.2s
+; EXYNOS: zip2.2s
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v2i32.p0i32(<2 x i32> %A, <2 x i32> %B, <2 x i32> %C, <2 x i32> %D, i32* %P)
   ret void
 }
@@ -500,6 +577,10 @@
 define void @st2_4s(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
 ; CHECK-LABEL: st2_4s
 ; CHECK: st2.4s
+; EXYNOS-LABEL: st2_4s
+; EXYNOS: zip1.4s
+; EXYNOS: zip2.4s
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
   ret void
 }
@@ -514,6 +595,17 @@
 define void @st4_4s(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P) nounwind {
 ; CHECK-LABEL: st4_4s
 ; CHECK: st4.4s
+; EXYNOS-LABEL: st4_4s
+; EXYNOS: zip1.4s
+; EXYNOS: zip2.4s
+; EXYNOS: zip1.4s
+; EXYNOS: zip2.4s
+; EXYNOS: zip1.4s
+; EXYNOS: zip2.4s
+; EXYNOS: stp
+; EXYNOS: zip1.4s
+; EXYNOS: zip2.4s
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, <4 x i32> %C, <4 x i32> %D, i32* %P)
   ret void
 }
@@ -551,6 +643,10 @@
 define void @st2_2d(<2 x i64> %A, <2 x i64> %B, i64* %P) nounwind {
 ; CHECK-LABEL: st2_2d
 ; CHECK: st2.2d
+; EXYNOS-LABEL: st2_2d
+; EXYNOS: zip1.2d
+; EXYNOS: zip2.2d
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st2.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, i64* %P)
   ret void
 }
@@ -565,6 +661,17 @@
 define void @st4_2d(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P) nounwind {
 ; CHECK-LABEL: st4_2d
 ; CHECK: st4.2d
+; EXYNOS-LABEL: st4_2d
+; EXYNOS: zip1.2d
+; EXYNOS: zip2.2d
+; EXYNOS: zip1.2d
+; EXYNOS: zip2.2d
+; EXYNOS: zip1.2d
+; EXYNOS: zip2.2d
+; EXYNOS: stp
+; EXYNOS: zip1.2d
+; EXYNOS: zip2.2d
+; EXYNOS: stp
   call void @llvm.aarch64.neon.st4.v2i64.p0i64(<2 x i64> %A, <2 x i64> %B, <2 x i64> %C, <2 x i64> %D, i64* %P)
   ret void
 }
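Note (not part of the patch): the interleaved-store rewrite can also be exercised in isolation with a minimal test built from the same intrinsic and RUN flags used in arm64-st1.ll above. This is only a sketch; the function name @st2_4s_only is illustrative, and it assumes the patch is applied so that -mcpu=exynos-m1 selects the zip/stp lowering instead of st2.

; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -verify-machineinstrs -mcpu=exynos-m1 | FileCheck %s

declare void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32>, <4 x i32>, i32*)

; An st2 of two 4s vectors is expected to be rewritten into zip1, zip2, and stp.
define void @st2_4s_only(<4 x i32> %A, <4 x i32> %B, i32* %P) nounwind {
; CHECK-LABEL: st2_4s_only
; CHECK: zip1.4s
; CHECK: zip2.4s
; CHECK: stp
  call void @llvm.aarch64.neon.st2.v4i32.p0i32(<4 x i32> %A, <4 x i32> %B, i32* %P)
  ret void
}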