Index: lib/Target/PowerPC/CMakeLists.txt
===================================================================
--- lib/Target/PowerPC/CMakeLists.txt
+++ lib/Target/PowerPC/CMakeLists.txt
@@ -36,6 +36,7 @@
   PPCTLSDynamicCall.cpp
   PPCVSXCopy.cpp
   PPCVSXFMAMutate.cpp
+  PPCVSXSwapRemoval.cpp
   )
 
 add_subdirectory(AsmParser)
Index: lib/Target/PowerPC/PPC.h
===================================================================
--- lib/Target/PowerPC/PPC.h
+++ lib/Target/PowerPC/PPC.h
@@ -39,6 +39,7 @@
   FunctionPass *createPPCEarlyReturnPass();
   FunctionPass *createPPCVSXCopyPass();
   FunctionPass *createPPCVSXFMAMutatePass();
+  FunctionPass *createPPCVSXSwapRemovalPass();
   FunctionPass *createPPCBranchSelectionPass();
   FunctionPass *createPPCISelDag(PPCTargetMachine &TM);
   FunctionPass *createPPCTLSDynamicCallPass();
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -231,6 +231,7 @@
   bool addPreISel() override;
   bool addILPOpts() override;
   bool addInstSelector() override;
+  void addMachineSSAOptimization() override;
   void addPreRegAlloc() override;
   void addPreSched2() override;
   void addPreEmitPass() override;
@@ -298,6 +299,12 @@
   return false;
 }
 
+void PPCPassConfig::addMachineSSAOptimization() {
+  TargetPassConfig::addMachineSSAOptimization();
+  if (Triple(TM->getTargetTriple()).getArch() == Triple::ppc64le)
+    addPass(createPPCVSXSwapRemovalPass());
+}
+
 void PPCPassConfig::addPreRegAlloc() {
   initializePPCVSXFMAMutatePass(*PassRegistry::getPassRegistry());
   insertPass(VSXFMAMutateEarly ? &RegisterCoalescerID : &MachineSchedulerID,
Index: lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
===================================================================
--- lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
+++ lib/Target/PowerPC/PPCVSXSwapRemoval.cpp
@@ -0,0 +1,747 @@
+//===----------- PPCVSXSwapRemoval.cpp - Remove VSX LE Swaps -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License.  See LICENSE.TXT for details.
+//
+//===---------------------------------------------------------------------===//
+//
+// This pass analyzes vector computations and removes unnecessary
+// doubleword swaps (xxswapd instructions).  This pass is performed
+// only for little-endian VSX code generation.
+//
+// For this specific case, loads and stores of v4i32, v4f32, v2i64,
+// and v2f64 vectors are inefficient.  These are implemented using
+// the lxvd2x and stxvd2x instructions, which invert the order of
+// doublewords in a vector register.  Thus code generation inserts
+// an xxswapd after each such load, and prior to each such store.
+//
+// The extra xxswapd instructions reduce performance.  This can be
+// particularly bad for vectorized code.  The purpose of this pass
+// is to reduce the number of xxswapd instructions required for
+// correctness.
+//
+// The primary insight is that much code that operates on vectors
+// does not care about the relative order of elements in a register,
+// so long as the correct memory order is preserved.  If we have a
+// computation where all input values are provided by lxvd2x/xxswapd,
+// all outputs are stored using xxswapd/stxvd2x, and all intermediate
+// computations are lane-insensitive (independent of element order),
+// then all the xxswapd instructions associated with the loads and
+// stores may be removed without changing observable semantics.
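+//
+// As an illustrative sketch (not part of the original patch text;
+// the pointer names are invented), consider a lane-insensitive
+// computation such as:
+//
+//   %a = load <4 x i32>, <4 x i32>* %pa, align 16
+//   %b = load <4 x i32>, <4 x i32>* %pb, align 16
+//   %s = add <4 x i32> %a, %b
+//   store <4 x i32> %s, <4 x i32>* %ps, align 16
+//
+// Little-endian VSX code generation emits lxvd2x/xxswapd for each
+// load and xxswapd/stxvd2x for the store, while the add becomes the
+// lane-insensitive vadduwm.  All three xxswapd instructions cancel
+// and can be removed without changing the values written to memory.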
+//
+// This pass uses standard equivalence class infrastructure to create
+// maximal webs of computations fitting the above description.  Each
+// such web is then optimized by removing its unnecessary xxswapd
+// instructions.
+//
+// There are some lane-sensitive operations for which we can still
+// permit the optimization, provided we modify those operations
+// accordingly.  Such operations are identified as using "special
+// handling" within this module.
+//
+//===---------------------------------------------------------------------===//
+
+#include "PPCInstrInfo.h"
+#include "PPC.h"
+#include "PPCInstrBuilder.h"
+#include "PPCTargetMachine.h"
+#include "llvm/ADT/EquivalenceClasses.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ppc-vsx-swaps"
+
+namespace llvm {
+  void initializePPCVSXSwapRemovalPass(PassRegistry&);
+}
+
+namespace {
+
+// A PPCVSXSwapEntry is created for each machine instruction that
+// is relevant to a vector computation.
+struct PPCVSXSwapEntry {
+  // Pointer to the instruction.
+  MachineInstr *VSEMI;
+
+  // Unique ID (position in the swap vector).
+  int VSEId;
+
+  // Attributes of this node.
+  unsigned int IsLoad : 1;
+  unsigned int IsStore : 1;
+  unsigned int IsSwap : 1;
+  unsigned int MentionsPhysVR : 1;
+  unsigned int IsSwappable : 1;
+  unsigned int SpecialHandling : 3;
+  unsigned int WebRejected : 1;
+  unsigned int WillRemove : 1;
+};
+
+enum SHValues {
+  SH_NONE = 0,
+  SH_BUILDVEC,
+  SH_EXTRACT,
+  SH_INSERT,
+  SH_NOSWAP_LD,
+  SH_NOSWAP_ST,
+  SH_SPLAT
+};
+
+struct PPCVSXSwapRemoval : public MachineFunctionPass {
+
+  static char ID;
+  const PPCInstrInfo *TII;
+  MachineFunction *MF;
+  MachineRegisterInfo *MRI;
+
+  // Swap entries are allocated in a vector for better performance.
+  std::vector<PPCVSXSwapEntry> SwapVector;
+
+  // A mapping is maintained between machine instructions and
+  // their swap entries.  The key is the address of the MI.
+  std::map<MachineInstr*, int> SwapMap;
+
+  // Equivalence classes are used to gather webs of related computation.
+  // Swap entries are represented by their VSEId fields.
+  EquivalenceClasses<int> *EC;
+
+  PPCVSXSwapRemoval() : MachineFunctionPass(ID) {
+    initializePPCVSXSwapRemovalPass(*PassRegistry::getPassRegistry());
+  }
+
+private:
+  // Initialize data structures.
+  void initialize(MachineFunction &MFParm);
+
+  // Walk the machine instructions to gather vector usage information.
+  // Return true iff vector mentions are present.
+  bool gatherVectorMentions();
+
+  // Add an entry to the swap vector and swap map.
+  int addSwapEntry(MachineInstr *MI, PPCVSXSwapEntry &SwapEntry);
+
+  // Hunt backwards through COPY and SUBREG_TO_REG chains for a
+  // source register.  VecIdx indicates the swap vector entry to
+  // mark as mentioning a physical register if the search leads
+  // to one.
+  unsigned lookThruCopyLike(unsigned SrcReg, unsigned VecIdx);
+
+  // Generate equivalence classes for related computations (webs).
+  void formWebs();
+
+  // Analyze webs and determine those that cannot be optimized.
+  void recordUnoptimizableWebs();
+
+  // Record which swap instructions can be safely removed.
+  void markSwapsForRemoval();
+
+  // Remove swaps and update other instructions requiring special
+  // handling.  Return true iff any changes are made.
+  bool removeSwaps();
+
+  // Update instructions requiring special handling.
+  void handleSpecialSwappables(int EntryIdx);
+
+  // Dump a description of the entries in the swap vector.
+  void dumpSwapVector();
+
+  // Return true iff the given register is in the given class.
+  bool isRegInClass(unsigned Reg, const TargetRegisterClass *RC) {
+    if (TargetRegisterInfo::isVirtualRegister(Reg))
+      return RC->hasSubClassEq(MRI->getRegClass(Reg));
+    if (RC->contains(Reg))
+      return true;
+    return false;
+  }
+
+  // Return true iff the given register is a full vector register.
+  bool isVecReg(unsigned Reg) {
+    return (isRegInClass(Reg, &PPC::VSRCRegClass) ||
+            isRegInClass(Reg, &PPC::VRRCRegClass));
+  }
+
+public:
+  // Main entry point for this pass.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    // If we don't have VSX on the subtarget, don't do anything.
+    const PPCSubtarget &STI = MF.getSubtarget<PPCSubtarget>();
+    if (!STI.hasVSX())
+      return false;
+
+    bool Changed = false;
+    initialize(MF);
+
+    if (gatherVectorMentions()) {
+      formWebs();
+      recordUnoptimizableWebs();
+      markSwapsForRemoval();
+      Changed = removeSwaps();
+    }
+
+    // FIXME: See the allocation of EC in initialize().
+    delete EC;
+    return Changed;
+  }
+};
+
+// Initialize data structures for this pass.  In particular, clear the
+// swap vector and allocate the equivalence class mapping before
+// processing each function.
+void PPCVSXSwapRemoval::initialize(MachineFunction &MFParm) {
+  MF = &MFParm;
+  MRI = &MF->getRegInfo();
+  TII = static_cast<const PPCInstrInfo *>(MF->getSubtarget().getInstrInfo());
+
+  const int InitialVectorSize(256);
+  SwapVector.clear();
+  SwapVector.reserve(InitialVectorSize);
+
+  // FIXME: Currently we allocate EC each time because we don't have
+  // access to the set representation on which to call clear().  Should
+  // consider adding a clear() method to the EquivalenceClasses class.
+  EC = new EquivalenceClasses<int>;
+}
+
+// Create an entry in the swap vector for each instruction that mentions
+// a full vector register, recording various characteristics of the
+// instructions there.
+bool PPCVSXSwapRemoval::gatherVectorMentions() {
+  bool RelevantFunction = false;
+
+  for (MachineFunction::iterator I = MF->begin(); I != MF->end();) {
+    MachineBasicBlock &MBB = *I++;
+
+    for (MachineBasicBlock::iterator BI = MBB.begin(), BIE = MBB.end();
+         BI != BIE; ++BI) {
+      MachineInstr *MI = BI;
+      bool RelevantInstr = false;
+
+      for (unsigned Idx = 0, Last = MI->getNumOperands(); Idx != Last; ++Idx) {
+        MachineOperand &MO = MI->getOperand(Idx);
+        if (!MO.isReg())
+          continue;
+        unsigned Reg = MO.getReg();
+        if (isVecReg(Reg)) {
+          RelevantInstr = true;
+          break;
+        }
+      }
+
+      if (!RelevantInstr)
+        continue;
+
+      RelevantFunction = true;
+
+      // Create a SwapEntry initialized to zeros, then fill in the
+      // instruction and ID fields before pushing it to the back
+      // of the swap vector.
+      PPCVSXSwapEntry SwapEntry{};
+      int VecIdx = addSwapEntry(MI, SwapEntry);
+
+      switch(MI->getOpcode()) {
+      default:
+        // Unless noted otherwise, an instruction is considered
+        // safe for the optimization.
+        SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::XXPERMDI:
+        // This is a swap if it is of the form XXPERMDI t, s, s, 2.
+        // Unfortunately, MachineCSE ignores COPY and SUBREG_TO_REG, so we
+        // can also see XXPERMDI t, SUBREG_TO_REG(s), SUBREG_TO_REG(s), 2,
+        // for example.  We have to look through chains of COPY and
+        // SUBREG_TO_REG to find the real source value for comparison.
+        // If the real source value is a physical register, then mark the
+        // XXPERMDI as mentioning a physical register.
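+        // (With identical source registers and a select code of 2,
+        // the XXPERMDI interchanges the two doublewords of the source,
+        // which is exactly the operation xxswapd performs.)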
+        // Any other form of XXPERMDI is lane-sensitive and unsafe
+        // for the optimization.
+        if (MI->getOperand(3).getImm() == 2) {
+          unsigned trueReg1 = lookThruCopyLike(MI->getOperand(1).getReg(),
+                                               VecIdx);
+          unsigned trueReg2 = lookThruCopyLike(MI->getOperand(2).getReg(),
+                                               VecIdx);
+          if (trueReg1 == trueReg2)
+            SwapVector[VecIdx].IsSwap = 1;
+        }
+        break;
+      case PPC::LVX:
+        // Non-permuting loads are currently unsafe.  We can use special
+        // handling for this in the future.
+        SwapVector[VecIdx].IsLoad = 1;
+        break;
+      case PPC::LXVD2X:
+      case PPC::LXVW4X:
+        // Permuting loads are marked as both load and swap, and are
+        // safe for optimization.
+        SwapVector[VecIdx].IsLoad = 1;
+        SwapVector[VecIdx].IsSwap = 1;
+        break;
+      case PPC::STVX:
+        // Non-permuting stores are currently unsafe.  We can use special
+        // handling for this in the future.
+        SwapVector[VecIdx].IsStore = 1;
+        break;
+      case PPC::STXVD2X:
+      case PPC::STXVW4X:
+        // Permuting stores are marked as both store and swap, and are
+        // safe for optimization.
+        SwapVector[VecIdx].IsStore = 1;
+        SwapVector[VecIdx].IsSwap = 1;
+        break;
+      case PPC::SUBREG_TO_REG:
+        // These are fine provided they are moving between full vector
+        // register classes.
+        if (isVecReg(MI->getOperand(0).getReg()) &&
+            isVecReg(MI->getOperand(2).getReg()))
+          SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::COPY:
+        // These are fine provided they are moving between full vector
+        // register classes.
+        if (isVecReg(MI->getOperand(0).getReg()) &&
+            isVecReg(MI->getOperand(1).getReg()))
+          SwapVector[VecIdx].IsSwappable = 1;
+        break;
+      case PPC::VSPLTB:
+      case PPC::VSPLTH:
+      case PPC::VSPLTW:
+        // Splats are lane-sensitive, but we can use special handling
+        // to adjust the source lane for the splat.  This is not yet
+        // implemented.  When it is, we need to uncomment the following:
+        // SwapVector[VecIdx].IsSwappable = 1;
+        SwapVector[VecIdx].SpecialHandling = SHValues::SH_SPLAT;
+        break;
+      // The presence of the following lane-sensitive operations in a
+      // web will kill the optimization, at least for now.
+      // FIXME: Some of these could be permitted with special handling,
+      // and will be phased in as time permits.
+      // FIXME: Is there a way we could specify this as a flag on the
+      //        instruction in the tblgen description?  That would seem
+      //        a better approach for future maintenance, but I don't
+      //        immediately see how this can be done.  Perhaps one of the
+      //        flags on a MachineInstr can be set this way?
+      case PPC::INLINEASM:
+      case PPC::EXTRACT_SUBREG:
+      case PPC::INSERT_SUBREG:
+      case PPC::COPY_TO_REGCLASS:
+      case PPC::LVEBX:
+      case PPC::LVEHX:
+      case PPC::LVEWX:
+      case PPC::LVSL:
+      case PPC::LVSR:
+      case PPC::LVXL:
+      case PPC::LXVDSX:
+      case PPC::STVEBX:
+      case PPC::STVEHX:
+      case PPC::STVEWX:
+      case PPC::STVXL:
+      case PPC::STXSDX:
+      case PPC::VCIPHER:
+      case PPC::VCIPHERLAST:
+      case PPC::VMRGHB:
+      case PPC::VMRGHH:
+      case PPC::VMRGHW:
+      case PPC::VMRGLB:
+      case PPC::VMRGLH:
+      case PPC::VMRGLW:
+      case PPC::VMULESB:
+      case PPC::VMULESH:
+      case PPC::VMULESW:
+      case PPC::VMULEUB:
+      case PPC::VMULEUH:
+      case PPC::VMULEUW:
+      case PPC::VMULOSB:
+      case PPC::VMULOSH:
+      case PPC::VMULOSW:
+      case PPC::VMULOUB:
+      case PPC::VMULOUH:
+      case PPC::VMULOUW:
+      case PPC::VNCIPHER:
+      case PPC::VNCIPHERLAST:
+      case PPC::VPERM:
+      case PPC::VPERMXOR:
+      case PPC::VPKPX:
+      case PPC::VPKSHSS:
+      case PPC::VPKSHUS:
+      case PPC::VPKSWSS:
+      case PPC::VPKSWUS:
+      case PPC::VPKUHUM:
+      case PPC::VPKUHUS:
+      case PPC::VPKUWUM:
+      case PPC::VPKUWUS:
+      case PPC::VPMSUMB:
+      case PPC::VPMSUMD:
+      case PPC::VPMSUMH:
+      case PPC::VPMSUMW:
+      case PPC::VRLB:
+      case PPC::VRLD:
+      case PPC::VRLH:
+      case PPC::VRLW:
+      case PPC::VSBOX:
+      case PPC::VSHASIGMAD:
+      case PPC::VSHASIGMAW:
+      case PPC::VSL:
+      case PPC::VSLDOI:
+      case PPC::VSLO:
+      case PPC::VSR:
+      case PPC::VSRO:
+      case PPC::VSUM2SWS:
+      case PPC::VSUM4SBS:
+      case PPC::VSUM4SHS:
+      case PPC::VSUM4UBS:
+      case PPC::VSUMSWS:
+      case PPC::VUPKHPX:
+      case PPC::VUPKHSB:
+      case PPC::VUPKHSH:
+      case PPC::VUPKLPX:
+      case PPC::VUPKLSB:
+      case PPC::VUPKLSH:
+      case PPC::XXMRGHW:
+      case PPC::XXMRGLW:
+      case PPC::XXSPLTW:
+        break;
+      }
+    }
+  }
+
+  if (RelevantFunction) {
+    DEBUG(dbgs() << "Swap vector when first built\n\n");
+    dumpSwapVector();
+  }
+
+  return RelevantFunction;
+}
+
+// Add an entry to the swap vector and swap map, and make a
+// singleton equivalence class for the entry.
+int PPCVSXSwapRemoval::addSwapEntry(MachineInstr *MI,
+                                    PPCVSXSwapEntry& SwapEntry) {
+  SwapEntry.VSEMI = MI;
+  SwapEntry.VSEId = SwapVector.size();
+  SwapVector.push_back(SwapEntry);
+  EC->insert(SwapEntry.VSEId);
+  SwapMap[MI] = SwapEntry.VSEId;
+  return SwapEntry.VSEId;
+}
+
+// This is used to find the "true" source register for an
+// XXPERMDI instruction, since MachineCSE does not handle the
+// "copy-like" operations (Copy and SubregToReg).  Returns
+// the original SrcReg unless it is the target of a copy-like
+// operation, in which case we chain backwards through all
+// such operations to the ultimate source register.  If a
+// physical register is encountered, we stop the search and
+// flag the swap entry indicated by VecIdx (the original
+// XXPERMDI) as mentioning a physical register.
+unsigned PPCVSXSwapRemoval::lookThruCopyLike(unsigned SrcReg,
+                                             unsigned VecIdx) {
+  MachineInstr *MI = MRI->getVRegDef(SrcReg);
+  if (!MI->isCopyLike())
+    return SrcReg;
+
+  unsigned CopySrcReg;
+  if (MI->isCopy())
+    CopySrcReg = MI->getOperand(1).getReg();
+  else  // isSubregToReg()
+    CopySrcReg = MI->getOperand(2).getReg();
+
+  if (!TargetRegisterInfo::isVirtualRegister(CopySrcReg)) {
+    SwapVector[VecIdx].MentionsPhysVR = 1;
+    return CopySrcReg;
+  }
+
+  return lookThruCopyLike(CopySrcReg, VecIdx);
+}
+
+// Generate equivalence classes for related computations (webs) by
+// def-use relationships of virtual registers.
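+// As a rough sketch (register names illustrative, not from the patch),
+// given
+//   %v0 = LXVD2X ...
+//   %v1 = XXPERMDI %v0, %v0, 2
+//   %v2 = VADDUWM %v1, %v1
+// the entries for all three instructions are unioned into one web
+// through the def-use links on %v0 and %v1.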
+// Mention of a physical register terminates the generation of
+// equivalence classes as this indicates a use of a parameter,
+// definition of a return value, use of a value returned from a call,
+// or definition of a parameter to a call.  Computations with physical
+// register mentions are flagged as such so their containing webs will
+// not be optimized.
+void PPCVSXSwapRemoval::formWebs() {
+
+  DEBUG(dbgs() << "\n*** Forming webs for swap removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+
+    DEBUG(dbgs() << "\n" << SwapVector[EntryIdx].VSEId << " ");
+    DEBUG(MI->dump());
+
+    // It's sufficient to walk vector uses and join them to their unique
+    // definitions.  In addition, check *all* vector register operands
+    // for physical regs.
+    for (unsigned Idx = 0, Last = MI->getNumOperands(); Idx != Last; ++Idx) {
+
+      MachineOperand &MO = MI->getOperand(Idx);
+      if (!MO.isReg())
+        continue;
+
+      unsigned Reg = MO.getReg();
+      if (!isVecReg(Reg))
+        continue;
+
+      if (!TargetRegisterInfo::isVirtualRegister(Reg)) {
+        SwapVector[EntryIdx].MentionsPhysVR = 1;
+        continue;
+      }
+
+      if (!MO.isUse())
+        continue;
+
+      MachineInstr* DefMI = MRI->getVRegDef(Reg);
+      assert(SwapMap.find(DefMI) != SwapMap.end() &&
+             "Inconsistency: def of vector reg not found in swap map!");
+      int DefIdx = SwapMap[DefMI];
+      (void)EC->unionSets(SwapVector[DefIdx].VSEId,
+                          SwapVector[EntryIdx].VSEId);
+
+      DEBUG(dbgs() << format("Unioning %d with %d\n", SwapVector[DefIdx].VSEId,
+                             SwapVector[EntryIdx].VSEId));
+      DEBUG(dbgs() << "  Def: ");
+      DEBUG(DefMI->dump());
+    }
+  }
+}
+
+// Walk the swap vector entries looking for conditions that prevent their
+// containing computations from being optimized.  When such conditions are
+// found, mark the representative of the computation's equivalence class
+// as rejected.
+void PPCVSXSwapRemoval::recordUnoptimizableWebs() {
+
+  DEBUG(dbgs() << "\n*** Rejecting webs for swap removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+    int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+    // Reject webs containing mentions of physical registers, or containing
+    // operations that we don't know how to handle in a lane-permuted region.
+    if (SwapVector[EntryIdx].MentionsPhysVR ||
+        !(SwapVector[EntryIdx].IsSwappable || SwapVector[EntryIdx].IsSwap)) {
+
+      SwapVector[Repr].WebRejected = 1;
+
+      DEBUG(dbgs() <<
+            format("Web %d rejected for either physreg or not swap[pable]\n",
+                   Repr));
+      DEBUG(dbgs() << "  in " << EntryIdx << ": ");
+      DEBUG(SwapVector[EntryIdx].VSEMI->dump());
+      DEBUG(dbgs() << "\n");
+    }
+
+    // Reject webs that contain swapping loads that feed something other
+    // than a swap instruction.
+    else if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      unsigned DefReg = MI->getOperand(0).getReg();
+
+      for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+        int UseIdx = SwapMap[&UseMI];
+
+        if (!SwapVector[UseIdx].IsSwap || SwapVector[UseIdx].IsLoad ||
+            SwapVector[UseIdx].IsStore) {
+
+          SwapVector[Repr].WebRejected = 1;
+
+          DEBUG(dbgs() <<
+                format("Web %d rejected for load not feeding swap\n", Repr));
+          DEBUG(dbgs() << "  def " << EntryIdx << ": ");
+          DEBUG(MI->dump());
+          DEBUG(dbgs() << "  use " << UseIdx << ": ");
+          DEBUG(UseMI.dump());
+          DEBUG(dbgs() << "\n");
+        }
+      }
+
+    // Reject webs that contain swapping stores that are fed by something
+    // other than a swap instruction.
+    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      unsigned UseReg = MI->getOperand(0).getReg();
+      MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+      int DefIdx = SwapMap[DefMI];
+
+      if (!SwapVector[DefIdx].IsSwap || SwapVector[DefIdx].IsLoad ||
+          SwapVector[DefIdx].IsStore) {
+
+        SwapVector[Repr].WebRejected = 1;
+
+        DEBUG(dbgs() <<
+              format("Web %d rejected for store not fed by swap\n", Repr));
+        DEBUG(dbgs() << "  def " << DefIdx << ": ");
+        DEBUG(DefMI->dump());
+        DEBUG(dbgs() << "  use " << EntryIdx << ": ");
+        DEBUG(MI->dump());
+        DEBUG(dbgs() << "\n");
+      }
+    }
+  }
+
+  DEBUG(dbgs() << "Swap vector after web analysis:\n\n");
+  dumpSwapVector();
+}
+
+// Walk the swap vector entries looking for swaps fed by permuting loads
+// and swaps that feed permuting stores.  If the containing computation
+// has not been marked rejected, mark each such swap for removal.
+// (Removal is delayed in case optimization has disturbed the pattern,
+// such that multiple loads feed the same swap, etc.)
+void PPCVSXSwapRemoval::markSwapsForRemoval() {
+
+  DEBUG(dbgs() << "\n*** Marking swaps for removal ***\n\n");
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    if (SwapVector[EntryIdx].IsLoad && SwapVector[EntryIdx].IsSwap) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected) {
+        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+        unsigned DefReg = MI->getOperand(0).getReg();
+
+        for (MachineInstr &UseMI : MRI->use_nodbg_instructions(DefReg)) {
+          int UseIdx = SwapMap[&UseMI];
+          SwapVector[UseIdx].WillRemove = 1;
+
+          DEBUG(dbgs() << "Marking swap fed by load for removal: ");
+          DEBUG(UseMI.dump());
+        }
+      }
+
+    } else if (SwapVector[EntryIdx].IsStore && SwapVector[EntryIdx].IsSwap) {
+      int Repr = EC->getLeaderValue(SwapVector[EntryIdx].VSEId);
+
+      if (!SwapVector[Repr].WebRejected) {
+        MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+        unsigned UseReg = MI->getOperand(0).getReg();
+        MachineInstr *DefMI = MRI->getVRegDef(UseReg);
+        int DefIdx = SwapMap[DefMI];
+        SwapVector[DefIdx].WillRemove = 1;
+
+        DEBUG(dbgs() << "Marking swap feeding store for removal: ");
+        DEBUG(DefMI->dump());
+      }
+
+    } else if (SwapVector[EntryIdx].IsSwappable &&
+               SwapVector[EntryIdx].SpecialHandling != 0)
+      handleSpecialSwappables(EntryIdx);
+  }
+}
+
+// The identified swap entry requires special handling to allow its
+// containing computation to be optimized.  Perform that handling
+// here.
+void PPCVSXSwapRemoval::handleSpecialSwappables(int EntryIdx) {
+}
+
+// Walk the swap vector and replace each entry marked for removal with
+// a copy operation.
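+// This replacement is safe because a swap is only marked for removal
+// when its entire web was accepted: every value entering the web is
+// swapped as it is loaded and every value leaving it is swapped as it
+// is stored, so element order within the web is uniformly reversed
+// and each individual swap contributes nothing observable.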
+bool PPCVSXSwapRemoval::removeSwaps() {
+
+  DEBUG(dbgs() << "\n*** Removing swaps ***\n\n");
+
+  bool Changed = false;
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+    if (SwapVector[EntryIdx].WillRemove) {
+      Changed = true;
+      MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+      MachineBasicBlock *MBB = MI->getParent();
+      BuildMI(*MBB, MI, MI->getDebugLoc(),
+              TII->get(TargetOpcode::COPY), MI->getOperand(0).getReg())
+        .addOperand(MI->getOperand(1));
+
+      DEBUG(dbgs() << format("Replaced %d with copy: ",
+                             SwapVector[EntryIdx].VSEId));
+      DEBUG(MI->dump());
+
+      MI->eraseFromParent();
+    }
+  }
+
+  return Changed;
+}
+
+// For debug purposes, dump the contents of the swap vector.
+void PPCVSXSwapRemoval::dumpSwapVector() {
+
+  for (unsigned EntryIdx = 0; EntryIdx < SwapVector.size(); ++EntryIdx) {
+
+    MachineInstr *MI = SwapVector[EntryIdx].VSEMI;
+    int ID = SwapVector[EntryIdx].VSEId;
+
+    DEBUG(dbgs() << format("%6d", ID));
+    DEBUG(dbgs() << format("%6d", EC->getLeaderValue(ID)));
+    DEBUG(dbgs() << format(" BB#%3d", MI->getParent()->getNumber()));
+    DEBUG(dbgs() << format("  %14s  ", TII->getName(MI->getOpcode())));
+
+    if (SwapVector[EntryIdx].IsLoad)
+      DEBUG(dbgs() << "load ");
+    if (SwapVector[EntryIdx].IsStore)
+      DEBUG(dbgs() << "store ");
+    if (SwapVector[EntryIdx].IsSwap)
+      DEBUG(dbgs() << "swap ");
+    if (SwapVector[EntryIdx].MentionsPhysVR)
+      DEBUG(dbgs() << "physreg ");
+
+    if (SwapVector[EntryIdx].IsSwappable) {
+      DEBUG(dbgs() << "swappable ");
+      switch(SwapVector[EntryIdx].SpecialHandling) {
+      default:
+        DEBUG(dbgs() << "special:**unknown**");
+        break;
+      case SH_NONE:
+        break;
+      case SH_BUILDVEC:
+        DEBUG(dbgs() << "special:buildvec ");
+        break;
+      case SH_EXTRACT:
+        DEBUG(dbgs() << "special:extract ");
+        break;
+      case SH_INSERT:
+        DEBUG(dbgs() << "special:insert ");
+        break;
+      case SH_NOSWAP_LD:
+        DEBUG(dbgs() << "special:load ");
+        break;
+      case SH_NOSWAP_ST:
+        DEBUG(dbgs() << "special:store ");
+        break;
+      case SH_SPLAT:
+        DEBUG(dbgs() << "special:splat ");
+        break;
+      }
+    }
+
+    if (SwapVector[EntryIdx].WebRejected)
+      DEBUG(dbgs() << "rejected ");
+    if (SwapVector[EntryIdx].WillRemove)
+      DEBUG(dbgs() << "remove ");
+
+    DEBUG(dbgs() << "\n");
+  }
+
+  DEBUG(dbgs() << "\n");
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(PPCVSXSwapRemoval, DEBUG_TYPE,
+                      "PowerPC VSX Swap Removal", false, false)
+INITIALIZE_PASS_END(PPCVSXSwapRemoval, DEBUG_TYPE,
+                    "PowerPC VSX Swap Removal", false, false)
+
+char PPCVSXSwapRemoval::ID = 0;
+FunctionPass*
+llvm::createPPCVSXSwapRemovalPass() { return new PPCVSXSwapRemoval(); }
Index: test/CodeGen/PowerPC/swaps-le-1.ll
===================================================================
--- test/CodeGen/PowerPC/swaps-le-1.ll
+++ test/CodeGen/PowerPC/swaps-le-1.ll
@@ -0,0 +1,127 @@
+; RUN: llc -O3 -mcpu=pwr8 -mtriple=powerpc64le-unknown-linux-gnu < %s | FileCheck %s
+
+; This test was generated from the following source:
+;
+; #define N 4096
+; int ca[N] __attribute__((aligned(16)));
+; int cb[N] __attribute__((aligned(16)));
+; int cc[N] __attribute__((aligned(16)));
+; int cd[N] __attribute__((aligned(16)));
+;
+; void foo ()
+; {
+;   int i;
+;   for (i = 0; i < N; i++) {
+;     ca[i] = (cb[i] + cc[i]) * cd[i];
+;   }
+; }
+
+@cb = common global [4096 x i32] zeroinitializer, align 16
+@cc = common global [4096 x i32] zeroinitializer, align 16
+@cd = common global [4096 x i32] zeroinitializer, align 16
+@ca = common global [4096 x i32] zeroinitializer, align 16
+
+define void @foo() {
+entry:
+  br label %vector.body
+
+vector.body:
+  %index = phi i64 [ 0, %entry ], [ %index.next.3, %vector.body ]
+  %0 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.load = load <4 x i32>, <4 x i32>* %1, align 16
+  %2 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index
+  %3 = bitcast i32* %2 to <4 x i32>*
+  %wide.load13 = load <4 x i32>, <4 x i32>* %3, align 16
+  %4 = add nsw <4 x i32> %wide.load13, %wide.load
+  %5 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index
+  %6 = bitcast i32* %5 to <4 x i32>*
+  %wide.load14 = load <4 x i32>, <4 x i32>* %6, align 16
+  %7 = mul nsw <4 x i32> %4, %wide.load14
+  %8 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index
+  %9 = bitcast i32* %8 to <4 x i32>*
+  store <4 x i32> %7, <4 x i32>* %9, align 16
+  %index.next = add nuw nsw i64 %index, 4
+  %10 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next
+  %11 = bitcast i32* %10 to <4 x i32>*
+  %wide.load.1 = load <4 x i32>, <4 x i32>* %11, align 16
+  %12 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next
+  %13 = bitcast i32* %12 to <4 x i32>*
+  %wide.load13.1 = load <4 x i32>, <4 x i32>* %13, align 16
+  %14 = add nsw <4 x i32> %wide.load13.1, %wide.load.1
+  %15 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next
+  %16 = bitcast i32* %15 to <4 x i32>*
+  %wide.load14.1 = load <4 x i32>, <4 x i32>* %16, align 16
+  %17 = mul nsw <4 x i32> %14, %wide.load14.1
+  %18 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next
+  %19 = bitcast i32* %18 to <4 x i32>*
+  store <4 x i32> %17, <4 x i32>* %19, align 16
+  %index.next.1 = add nuw nsw i64 %index.next, 4
+  %20 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.1
+  %21 = bitcast i32* %20 to <4 x i32>*
+  %wide.load.2 = load <4 x i32>, <4 x i32>* %21, align 16
+  %22 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.1
+  %23 = bitcast i32* %22 to <4 x i32>*
+  %wide.load13.2 = load <4 x i32>, <4 x i32>* %23, align 16
+  %24 = add nsw <4 x i32> %wide.load13.2, %wide.load.2
+  %25 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.1
+  %26 = bitcast i32* %25 to <4 x i32>*
+  %wide.load14.2 = load <4 x i32>, <4 x i32>* %26, align 16
+  %27 = mul nsw <4 x i32> %24, %wide.load14.2
+  %28 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.1
+  %29 = bitcast i32* %28 to <4 x i32>*
+  store <4 x i32> %27, <4 x i32>* %29, align 16
+  %index.next.2 = add nuw nsw i64 %index.next.1, 4
+  %30 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cb, i64 0, i64 %index.next.2
+  %31 = bitcast i32* %30 to <4 x i32>*
+  %wide.load.3 = load <4 x i32>, <4 x i32>* %31, align 16
+  %32 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cc, i64 0, i64 %index.next.2
+  %33 = bitcast i32* %32 to <4 x i32>*
+  %wide.load13.3 = load <4 x i32>, <4 x i32>* %33, align 16
+  %34 = add nsw <4 x i32> %wide.load13.3, %wide.load.3
+  %35 = getelementptr inbounds [4096 x i32], [4096 x i32]* @cd, i64 0, i64 %index.next.2
+  %36 = bitcast i32* %35 to <4 x i32>*
+  %wide.load14.3 = load <4 x i32>, <4 x i32>* %36, align 16
+  %37 = mul nsw <4 x i32> %34, %wide.load14.3
+  %38 = getelementptr inbounds [4096 x i32], [4096 x i32]* @ca, i64 0, i64 %index.next.2
+  %39 = bitcast i32* %38 to <4 x i32>*
+  store <4 x i32> %37, <4 x i32>* %39, align 16
+  %index.next.3 = add nuw nsw i64 %index.next.2, 4
+  %40 = icmp eq i64 %index.next.3, 4096
+  br i1 %40, label %for.end, label %vector.body
+
+for.end:
+  ret void
+}
+
+; CHECK-LABEL: @foo
+; CHECK-NOT: xxpermdi
+; CHECK-NOT: xxswapd
+
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK-DAG: lxvd2x
+; CHECK-DAG: vadduwm
+; CHECK: vmuluwm
+; CHECK: stxvd2x
+
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK-DAG: lxvd2x
+; CHECK-DAG: vadduwm
+; CHECK: vmuluwm
+; CHECK: stxvd2x
+
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK-DAG: lxvd2x
+; CHECK-DAG: vadduwm
+; CHECK: vmuluwm
+; CHECK: stxvd2x
+
+; CHECK: lxvd2x
+; CHECK: lxvd2x
+; CHECK-DAG: lxvd2x
+; CHECK-DAG: vadduwm
+; CHECK: vmuluwm
+; CHECK: stxvd2x
Index: test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
===================================================================
--- test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
+++ test/CodeGen/PowerPC/vsx-ldst-builtin-le.ll
@@ -1,7 +1,6 @@
 ; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
 ; RUN: grep lxvd2x < %t | count 18
 ; RUN: grep stxvd2x < %t | count 18
-; RUN: grep xxpermdi < %t | count 36
 
 @vf = global <4 x float> , align 16
 @vd = global <2 x double> , align 16
Index: test/CodeGen/PowerPC/vsx-ldst.ll
===================================================================
--- test/CodeGen/PowerPC/vsx-ldst.ll
+++ test/CodeGen/PowerPC/vsx-ldst.ll
@@ -12,7 +12,6 @@
 ; RUN: llc -mcpu=pwr8 -mattr=+vsx -O2 -mtriple=powerpc64le-unknown-linux-gnu < %s > %t
 ; RUN: grep lxvd2x < %t | count 6
 ; RUN: grep stxvd2x < %t | count 6
-; RUN: grep xxpermdi < %t | count 12
 
 @vsi = global <4 x i32> , align 16
 @vui = global <4 x i32> , align 16