Index: lib/Target/PowerPC/PPCInstrInfo.h
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.h
+++ lib/Target/PowerPC/PPCInstrInfo.h
@@ -49,6 +49,7 @@
   PPC970_Shift = 3,
   PPC970_Mask = 0x07 << PPC970_Shift
 };
+
 enum PPC970_Unit {
   /// These are the various PPC970 execution unit pipelines. Each instruction
   /// is one of these.
@@ -171,6 +172,39 @@
   void insertNoop(MachineBasicBlock &MBB,
                   MachineBasicBlock::iterator MI) const override;
 
+  /// This is used by the pre-regalloc scheduler to determine if two loads are
+  /// loading from the same base address. It should only return true if the
+  /// base pointers are the same and the only difference between the two
+  /// addresses is the offset. It also returns the offsets by reference.
+  bool areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                               int64_t &Offset1,
+                               int64_t &Offset2) const override;
+
+  /// shouldScheduleLoadsNear - This is used by the pre-regalloc scheduler to
+  /// determine (in conjunction with areLoadsFromSameBasePtr) if two loads
+  /// should be scheduled together. On some targets, if two loads are loading
+  /// from addresses in the same cache line, it is better if they are scheduled
+  /// together. This function takes two integers that represent the load
+  /// offsets from the common base address. It returns true if it decides it is
+  /// desirable to schedule the two loads together. "NumLoads" is the number of
+  /// loads that have already been scheduled after Load1.
+  bool shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                               int64_t Offset1, int64_t Offset2,
+                               unsigned NumLoads) const override;
+
+
+  /// Get the base register and byte offset of an instruction that reads/writes
+  /// memory.
+  bool getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
+                             int64_t &Offset,
+                             const TargetRegisterInfo *TRI) const override;
+
+  /// Returns true if the two given memory operations should be scheduled
+  /// adjacent. Called from the LoadCluster/StoreCluster DAG mutations, which
+  /// must be added in TargetPassConfig::createMachineScheduler() to take effect.
+  bool shouldClusterMemOps(MachineInstr &FirstLdSt,
+                           MachineInstr &SecondLdSt,
+                           unsigned NumLoads) const override;
 
   // Branch analysis.
   bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
Index: lib/Target/PowerPC/PPCInstrInfo.cpp
===================================================================
--- lib/Target/PowerPC/PPCInstrInfo.cpp
+++ lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -40,6 +40,9 @@
 
 using namespace llvm;
 
+static cl::opt<unsigned> VectorLoadClusterCount("vec-load-clustering", cl::Hidden, cl::init(4));
+static cl::opt<unsigned> ScalarLoadClusterCount("scalar-load-clustering", cl::Hidden, cl::init(4));
+
 #define DEBUG_TYPE "ppc-instr-info"
 
 #define GET_INSTRMAP_INFO
@@ -1929,3 +1932,201 @@
     return &PPC::VSRCRegClass;
   return RC;
 }
+
+
+static MachineMemOperand *extractBasePointer(MachineSDNode *Load) {
+  MachineMemOperand **IMemOp = Load->memoperands_begin();
+  if (IMemOp != Load->memoperands_end()) {
+    MachineMemOperand *Ret = *IMemOp;
+    assert(IMemOp + 1 == Load->memoperands_end() &&
+           "Expected a single memory operand on a load");
+    return Ret;
+  }
+
+  return nullptr;
+}
+
+// Some machine instructions may have both the mayLoad and mayStore flags set.
+// These instructions are lowered from intrinsics that don't actually touch
+// memory, but they cannot use the IntrNoMem flag because they need to be
+// inserted into the chain due to their side effects.
+static bool mayActuallyLoad(const MCInstrDesc &Desc) {
+  return Desc.mayLoad() && !Desc.mayStore();
+}
+
+static SDValue getChainOperand(SDNode *Node) {
+  unsigned OpIndex = Node->getNumOperands();
+  while (OpIndex && Node->getOperand(OpIndex - 1).getValueType() == MVT::Glue) {
+    --OpIndex;
+  }
+
+  assert(OpIndex && "Expected at least one operand!");
+  SDValue ChainOp = Node->getOperand(--OpIndex);
+  assert(ChainOp.getValueType() == MVT::Other && "Expected a chain operand!");
+  return ChainOp;
+}
+
+bool PPCInstrInfo::areLoadsFromSameBasePtr(SDNode *Load1, SDNode *Load2,
+                                           int64_t &Offset1, int64_t &Offset2) const {
+
+  // Only interested in MachineSDNodes.
+  if (!Load1->isMachineOpcode() || !Load2->isMachineOpcode()) {
+    return false;
+  }
+
+  const MCInstrDesc &MCIDesc1 = get(Load1->getMachineOpcode());
+  const MCInstrDesc &MCIDesc2 = get(Load2->getMachineOpcode());
+  if (!mayActuallyLoad(MCIDesc1) || MCIDesc1.isPseudo() ||
+      !mayActuallyLoad(MCIDesc2) || MCIDesc2.isPseudo()) {
+    return false;
+  }
+
+  // Only interested in loads on the same chain.
+  if (getChainOperand(Load1) != getChainOperand(Load2)) {
+    return false;
+  }
+
+  // Get the memory operands.
+  MachineSDNode *MachineLoad1 = dyn_cast<MachineSDNode>(Load1);
+  MachineSDNode *MachineLoad2 = dyn_cast<MachineSDNode>(Load2);
+  assert(MachineLoad1 && MachineLoad2);
+  MachineMemOperand *MemOp1 = extractBasePointer(MachineLoad1);
+  MachineMemOperand *MemOp2 = extractBasePointer(MachineLoad2);
+
+  // Not every load will have its MMO properly set. For example, loads
+  // created from intrinsic calls might not have one.
+  if (!MemOp1 || !MemOp2)
+    return false;
+
+  // Check that the memory ops use the same base value.
+  if (MemOp1->getValue() == MemOp2->getValue()) {
+    Offset1 = MemOp1->getOffset();
+    Offset2 = MemOp2->getOffset();
+    return true;
+  }
+
+  // The loads are off different base values.
+  return false;
+}
+
+bool PPCInstrInfo::shouldScheduleLoadsNear(SDNode *Load1, SDNode *Load2,
+                                           int64_t Offset1, int64_t Offset2,
+                                           unsigned NumLoads) const {
+  assert(Offset2 > Offset1 && "Offset2 must be larger than Offset1!");
+  if (Offset2 - Offset1 > 128)
+    return false;
+
+  // Cluster vector loads with a different threshold than scalar loads, since
+  // more vector registers are available. NumLoads is the number of loads that
+  // have already been scheduled after Load1.
+  EVT VT = Load1->getValueType(0);
+  if (VT.isVector()) {
+    return NumLoads < VectorLoadClusterCount;
+  }
+
+  return NumLoads < ScalarLoadClusterCount;
+}
+
+static bool tryExtractImm(MachineInstr *MI, int64_t &Offset) {
+  switch (MI->getOpcode())
+  {
+  default:
+    return false;
+  case PPC::LI8:
+  case PPC::LI:
+    assert(MI->getOperand(1).isImm());
+    Offset = MI->getOperand(1).getImm();
+    return true;
+  }
+}
+
+bool PPCInstrInfo::getMemOpBaseRegImmOfs(MachineInstr &MemOp, unsigned &BaseReg,
+                                         int64_t &Offset,
+                                         const TargetRegisterInfo *TRI) const {
+
+  const MCInstrDesc &InstrDesc = get(MemOp.getOpcode());
+
+  // Having both mayLoad and mayStore set means this is a MachineInstr with
+  // side effects rather than an actual memory operation.
+  if (InstrDesc.mayLoad() && InstrDesc.mayStore()) {
+    return false;
+  }
+
+  // Expect at least 3 (explicit) operands.
+  if (MemOp.getNumExplicitOperands() < 3) {
+    return false;
+  }
+
+  MachineOperand &Mo1 = MemOp.getOperand(1);
+  MachineOperand &Mo2 = MemOp.getOperand(2);
+
+  // Check for 'BaseAddr + imm' and 'imm + BaseAddr'.
+  if (Mo1.isReg() && Mo2.isImm()) {
+    BaseReg = Mo1.getReg();
+    Offset = Mo2.getImm();
+    return true;
+  } else if (Mo2.isReg() && Mo1.isImm()) {
+    BaseReg = Mo2.getReg();
+    Offset = Mo1.getImm();
+    return true;
+  }
+
+  if (!(Mo1.isReg() && Mo2.isReg())) {
+    return false;
+  }
+
+  // Check for 'r0 + BaseAddr'.
+  if (Mo1.getReg() == PPC::ZERO8 || Mo1.getReg() == PPC::ZERO) {
+    BaseReg = Mo2.getReg();
+    Offset = 0;
+    return true;
+  }
+
+  BaseReg = Mo1.getReg();
+  MachineRegisterInfo &MRI = MemOp.getParent()->getParent()->getRegInfo();
+
+  // Try to extract an immediate from the register definition.
+  if (TargetRegisterInfo::isVirtualRegister(Mo2.getReg())) {
+    // May have more than one def.
+    if (!MRI.hasOneDef(Mo2.getReg())) {
+      return false;
+    }
+
+    MachineInstr *MI = MRI.getVRegDef(Mo2.getReg());
+    return tryExtractImm(MI, Offset);
+  }
+
+  // TODO: Do we need to handle physical registers at this point?
+  return false;
+}
+
+bool PPCInstrInfo::shouldClusterMemOps(MachineInstr &FirstLdSt, MachineInstr &SecondLdSt,
+                                       unsigned NumLoads) const {
+
+  if (FirstLdSt.hasOrderedMemoryRef() || SecondLdSt.hasOrderedMemoryRef()) {
+    return false;
+  }
+
+  const TargetRegisterInfo *TRI = &getRegisterInfo();
+  unsigned FirstBaseReg, SecondBaseReg;
+  int64_t FirstOffset, SecondOffset;
+
+  if (!getMemOpBaseRegImmOfs(FirstLdSt, FirstBaseReg, FirstOffset, TRI) ||
+      !getMemOpBaseRegImmOfs(SecondLdSt, SecondBaseReg, SecondOffset, TRI)) {
+    return false;
+  }
+
+  // Only cluster memory ops with the same base register.
+  if (FirstBaseReg != SecondBaseReg)
+    return false;
+
+  int64_t OffsetDiff = FirstOffset > SecondOffset ?
+                       FirstOffset - SecondOffset :
+                       SecondOffset - FirstOffset;
+
+  if (OffsetDiff > 128) {
+    return false;
+  }
+
+  return true;
+}
Index: lib/Target/PowerPC/PPCTargetMachine.cpp
===================================================================
--- lib/Target/PowerPC/PPCTargetMachine.cpp
+++ lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -24,6 +24,7 @@
 #include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/CodeGen/MachineScheduler.h"
 #include "llvm/CodeGen/Passes.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Attributes.h"
 #include "llvm/IR/DataLayout.h"
 #include "llvm/IR/Function.h"
@@ -305,6 +306,17 @@
     return getTM<PPCTargetMachine>();
   }
 
+  ScheduleDAGInstrs *
+  createMachineScheduler(MachineSchedContext *C) const override {
+    // TODO: Fall back to the default scheduler for non-P8 targets:
+    // ScheduleDAGInstrs *DAG = TargetPassConfig::createMachineScheduler(C);
+    ScheduleDAGMILive *DAG = createGenericSchedLive(C);
+    DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
+    DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
+
+    return DAG;
+  }
+
   void addIRPasses() override;
   bool addPreISel() override;
   bool addILPOpts() override;
Index: test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
===================================================================
--- test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
+++ test/CodeGen/PowerPC/fp128-bitcast-after-operation.ll
@@ -31,11 +31,12 @@
 ; PPC64-P8: blr
 
 ; PPC32-DAG: stfd 1, 24(1)
-; PPC32-DAG: stfd 2, 16(1)
+; PPC32: nop
 ; PPC32: nop
 ; PPC32-DAG: lwz [[HI0:[0-9]+]], 24(1)
-; PPC32-DAG: lwz [[LO0:[0-9]+]], 16(1)
 ; PPC32-DAG: lwz [[HI1:[0-9]+]], 28(1)
+; PPC32-DAG: stfd 2, 16(1)
+; PPC32-DAG: lwz [[LO0:[0-9]+]], 16(1)
 ; PPC32-DAG: lwz [[LO1:[0-9]+]], 20(1)
 ; PPC32: rlwinm [[FLIP_BIT:[0-9]+]], [[HI0]], 0, 0, 0
 ; PPC32-DAG: xor [[HI0]], [[HI0]], [[FLIP_BIT]]
Index: test/CodeGen/PowerPC/jaggedstructs.ll
===================================================================
--- test/CodeGen/PowerPC/jaggedstructs.ll
+++ test/CodeGen/PowerPC/jaggedstructs.ll
@@ -35,10 +35,10 @@
 ; CHECK: sth {{[0-9]+}}, 70(1)
 ; CHECK: stw {{[0-9]+}}, 66(1)
 ; CHECK: lbz {{[0-9]+}}, 191(1)
-; CHECK: lhz {{[0-9]+}}, 189(1)
 ; CHECK: lwz {{[0-9]+}}, 185(1)
-; CHECK: stb {{[0-9]+}}, 79(1)
+; CHECK: lhz {{[0-9]+}}, 189(1)
 ; CHECK: sth {{[0-9]+}}, 77(1)
+; CHECK: stb {{[0-9]+}}, 79(1)
 ; CHECK: stw {{[0-9]+}}, 73(1)
 ; CHECK: ld 6, 72(1)
 ; CHECK: ld 5, 64(1)
Index: test/CodeGen/PowerPC/memcpy-vec.ll
===================================================================
--- test/CodeGen/PowerPC/memcpy-vec.ll
+++ test/CodeGen/PowerPC/memcpy-vec.ll
@@ -15,10 +15,10 @@
 ; PWR7-LABEL: @foo1
 ; PWR7-NOT: bl memcpy
 ; PWR7-DAG: li [[OFFSET:[0-9]+]], 16
-; PWR7-DAG: lxvd2x [[TMP0:[0-9]+]], 4, [[OFFSET]]
+; PWR7-DAG: lxvd2x [[TMP0:[0-9]+]], 0, 4
+; PWR7-DAG: lxvd2x [[TMP1:[0-9]+]], 4, [[OFFSET]]
+; PWR7-DAG: stxvd2x [[TMP1]], 3, [[OFFSET]]
 ; PWR7-DAG: stxvd2x [[TMP0]], 0, 3
-; PWR7-DAG: lxvd2x [[TMP1:[0-9]+]], 0, 4
-; PWR7-DAG: stxvd2x [[TMP1]], 0, 3
 ; PWR7: blr
 
 ; PWR8-LABEL: @foo1
Index: test/CodeGen/PowerPC/ppc32-vacopy.ll
===================================================================
--- test/CodeGen/PowerPC/ppc32-vacopy.ll
+++ test/CodeGen/PowerPC/ppc32-vacopy.ll
@@ -19,6 +19,6 @@
 ; CHECK: lwz [[REG1:[0-9]+]], {{.*}}
 ; CHECK: lwz [[REG2:[0-9]+]], {{.*}}
 ; CHECK: lwz [[REG3:[0-9]+]], {{.*}}
-; CHECK: stw [[REG1]], {{.*}}
-; CHECK: stw [[REG2]], {{.*}}
 ; CHECK: stw [[REG3]], {{.*}}
+; CHECK: stw [[REG2]], {{.*}}
+; CHECK: stw [[REG1]], {{.*}}
Index: test/CodeGen/PowerPC/ppcf128-endian.ll
===================================================================
--- test/CodeGen/PowerPC/ppcf128-endian.ll
+++ test/CodeGen/PowerPC/ppcf128-endian.ll
@@ -27,8 +27,8 @@
 }
 ; CHECK: @caller
 ; CHECK: ld [[REG:[0-9]+]], .LC
-; CHECK: lfd 2, 8([[REG]])
 ; CHECK: lfd 1, 0([[REG]])
+; CHECK: lfd 2, 8([[REG]])
 ; CHECK: bl test
 
 declare void @test(ppc_fp128)
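Reviewer note (not part of the patch): the TODO in createMachineScheduler() could be resolved by keying the clustering mutations off the subtarget and otherwise keeping the generic scheduler. The sketch below is only an illustration under assumptions: using PPCSubtarget::getDarwinDirective() and PPC::DIR_PWR8 as the "is this a P8" check is not something this patch establishes.

    ScheduleDAGInstrs *
    createMachineScheduler(MachineSchedContext *C) const override {
      ScheduleDAGMILive *DAG = createGenericSchedLive(C);
      // Assumption: detect POWER8 via the CPU directive; adjust as needed.
      const PPCSubtarget &ST = C->MF->getSubtarget<PPCSubtarget>();
      if (ST.getDarwinDirective() == PPC::DIR_PWR8) {
        DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
        DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
      }
      return DAG;
    }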