Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -46,6 +46,8 @@
 
 FunctionPass *createAArch64CollectLOHPass();
 
+FunctionPass *createAArch64SSALoadStoreOptPass();
+
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 } // end namespace llvm
 
Index: lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
@@ -0,0 +1,490 @@
+//===- AArch64SSALoadStoreOptimizer.cpp - AArch64 SSA load/store opt. ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations in SSA form. This pass should be run before register
+// allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MachineTraceMetrics.h"
+#include "llvm/CodeGen/TargetSchedule.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/IR/Constants.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ssa-ldst-opt"
+
+namespace {
+class AArch64SSALoadStoreOpt : public MachineFunctionPass {
+
+  const AArch64InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const TargetRegisterInfo *TRI;
+  const AArch64Subtarget *Subtarget;
+
+public:
+  static char ID;
+  AArch64SSALoadStoreOpt() : MachineFunctionPass(ID) {}
+  bool tryToWidenLdStInst(MachineBasicBlock::iterator &I,
+                          std::set<MachineInstr *> &stores,
+                          std::set<MachineInstr *> &loads);
+  bool checkOffsetRange(MachineInstr *MI);
+  bool isConsecutive(MachineInstr *FirstMI, MachineInstr *SecondMI,
+                     std::vector<MachineInstr *> &MIs);
+  void buildWideLdStInst(MachineInstr *MI, MachineInstr *MergeMI);
+  bool optimizeBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  const char *getPassName() const override {
+    return "AArch64 SSA load / store optimization pass";
+  }
+};
+char AArch64SSALoadStoreOpt::ID = 0;
+} // namespace
+
+// Set I to the position of MI within MBB.
+static void getPosition(MachineBasicBlock::iterator &I, MachineBasicBlock &MBB,
+                        MachineInstr &MI) {
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;
+       ++MBBI) {
+    if (&*MBBI == &MI)
+      I = MBBI;
+  }
+}
+
+// Return the memory access size in bytes, which is also the scaling factor
+// used to convert between scaled and unscaled offsets.
+static int getMemScale(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return 2;
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+  case AArch64::LDRSWui:
+  case AArch64::LDURSWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
+    return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+    return 16;
+  }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI) {
+  return MI->getOperand(0);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+  return MI->getOperand(1);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+  return MI->getOperand(2);
+}
+
+bool AArch64SSALoadStoreOpt::checkOffsetRange(MachineInstr *MI) {
+  bool IsScaled = !TII->isUnscaledLdSt(MI);
+  int Offset = getLdStOffsetOp(MI).getImm();
+
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if -256 <= unscaled offset value < 256,
+  // where unscaled offset value = scaled offset value * memory scale size.
+  if (IsScaled && (Offset % 2)) {
+    int UnscaledOffset = Offset * getMemScale(MI);
+    return UnscaledOffset < 256 && UnscaledOffset >= -256;
+  }
+
+  return true;
+}
+
+// Check if two loads/stores have consecutive memory accesses.
+bool AArch64SSALoadStoreOpt::isConsecutive(MachineInstr *FirstMI,
+                                           MachineInstr *SecondMI,
+                                           std::vector<MachineInstr *> &MIs) {
+  bool FirstMIIsUnscaled = TII->isUnscaledLdSt(FirstMI);
+  unsigned FirstMIBaseReg = getLdStBaseOp(FirstMI).getReg();
+  int FirstMIOffset = getLdStOffsetOp(FirstMI).getImm();
+  int FirstMIOffsetStride = FirstMIIsUnscaled ? getMemScale(FirstMI) : 1;
+
+  bool SecondMIIsUnscaled = TII->isUnscaledLdSt(SecondMI);
+  unsigned SecondMIBaseReg = getLdStBaseOp(SecondMI).getReg();
+  int SecondMIOffset = getLdStOffsetOp(SecondMI).getImm();
+  if (FirstMIIsUnscaled != SecondMIIsUnscaled) {
+    // We're trying to pack instructions that differ in how they are scaled.
+    // If FirstMI is scaled then scale the offset of SecondMI accordingly.
+    // Otherwise, do the opposite (i.e., make SecondMI's offset unscaled).
+    int MemSize = getMemScale(SecondMI);
+    if (SecondMIIsUnscaled) {
+      // If the unscaled offset isn't a multiple of the MemSize, we can't
+      // pack the operations together: bail and keep looking.
+      if (SecondMIOffset % MemSize)
+        return false;
+
+      SecondMIOffset /= MemSize;
+    } else {
+      SecondMIOffset *= MemSize;
+    }
+  }
+
+  if (FirstMIBaseReg == SecondMIBaseReg &&
+      ((FirstMIOffset == SecondMIOffset + FirstMIOffsetStride) ||
+       (FirstMIOffset + FirstMIOffsetStride == SecondMIOffset))) {
+    if (FirstMIOffset < SecondMIOffset) {
+      MIs.push_back(FirstMI);
+      MIs.push_back(SecondMI);
+    } else {
+      MIs.push_back(SecondMI);
+      MIs.push_back(FirstMI);
+    }
+
+    return true;
+  }
+
+  return false;
+}
+
+// Build a wide load/store instruction. MI has the lower offset; MergeMI has
+// the higher offset.
+void AArch64SSALoadStoreOpt::buildWideLdStInst(MachineInstr *MI,
+                                               MachineInstr *MergeMI) {
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineOperand &RegOp = MI->getOperand(0);
+  unsigned Reg = MI->getOperand(0).getReg();
+  const MachineOperand &BaseRegOp = MI->getOperand(1);
+  bool IsScaled = !TII->isUnscaledLdSt(MI->getOpcode());
+  int OffsetImm = MI->getOperand(2).getImm();
+
+  bool ShouldBeUnscaled = false;
+  // When the original load/store is scaled (ldr/str), the offset must be
+  // unscaled if it is odd. Otherwise, the offset is halved.
+  if (IsScaled) {
+    if (OffsetImm % 2) {
+      OffsetImm *= getMemScale(MI);
+      ShouldBeUnscaled = true;
+    } else {
+      OffsetImm /= 2;
+    }
+  }
+
+  MachineBasicBlock::iterator InsertionPoint;
+  getPosition(InsertionPoint, *MBB, *MI);
+
+  // Select the opcode for the wide load/store, taking into account whether it
+  // should be scaled or unscaled.
+  unsigned NewOpc = 0;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected opcode.");
+  case AArch64::LDRWui:
+    if (ShouldBeUnscaled)
+      NewOpc = AArch64::LDURXi;
+    else
+      NewOpc = AArch64::LDRXui;
+    break;
+  case AArch64::LDURWi:
+    NewOpc = AArch64::LDURXi;
+    break;
+  case AArch64::STRWui:
+    if (ShouldBeUnscaled)
+      NewOpc = AArch64::STURXi;
+    else
+      NewOpc = AArch64::STRXui;
+    break;
+  case AArch64::STURWi:
+    NewOpc = AArch64::STURXi;
+    break;
+  }
+
+  // Set the register operand (dst of load or src of store) to the wide
+  // register class.
+  MRI->setRegClass(Reg, &AArch64::GPR64RegClass);
+
+  // Build the wide load/store instruction.
+  MachineInstr *NewMI =
+      BuildMI(*MBB, InsertionPoint, MI->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(RegOp)
+          .addOperand(BaseRegOp)
+          .addImm(OffsetImm)
+          .setMemRefs(MI->mergeMemRefsWith(*MergeMI));
+  (void)NewMI;
+
+  DEBUG(dbgs() << "Creating the wide load/store instruction. Replacing "
+                  "instructions:\n    ");
+  DEBUG(MI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(MergeMI->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG((NewMI)->print(dbgs()));
+}
+
+bool AArch64SSALoadStoreOpt::tryToWidenLdStInst(
+    MachineBasicBlock::iterator &I, std::set<MachineInstr *> &stores,
+    std::set<MachineInstr *> &loads) {
+  MachineBasicBlock::iterator E = I->getParent()->end();
+  MachineBasicBlock::iterator MBBI = I;
+  MachineInstr *FirstMI = I;
+  ++MBBI;
+
+  // FirstMI (store) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(FirstMI).isReg() || !getLdStOffsetOp(FirstMI).isImm())
+    return false;
+
+  unsigned Reg = getLdStRegOp(FirstMI).getReg();
+
+  // If FirstMI's offset is out of range, give up widening.
+  if (!checkOffsetRange(FirstMI))
+    return false;
+
+  // The register operand of FirstMI should have a single definition.
+  if (!MRI->hasOneDef(Reg))
+    return false;
+
+  MachineInstr &DefInst = *MRI->def_instr_begin(Reg);
+
+  // DefInst should be a load instruction.
+  if (DefInst.getOpcode() != AArch64::LDRWui &&
+      DefInst.getOpcode() != AArch64::LDURWi)
+    return false;
+
+  // DefInst (load) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(&DefInst).isReg() || !getLdStOffsetOp(&DefInst).isImm())
+    return false;
+
+  // DefInst should have a unique use instruction, which should be FirstMI.
+  if (!MRI->hasOneUse(Reg))
+    return false;
+
+  std::vector<MachineInstr *> consecutive_stores;
+  std::vector<MachineInstr *> consecutive_loads;
+
+  // Find a store that is consecutive with FirstMI.
+  for (; MBBI != E; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    if (MI->getOpcode() != AArch64::STURWi &&
+        MI->getOpcode() != AArch64::STRWui)
+      continue;
+
+    // MI (store) should have a register base operand and an immediate offset
+    // operand.
+    if (!getLdStBaseOp(MI).isReg() || !getLdStOffsetOp(MI).isImm())
+      return false;
+
+    // Check if FirstMI (store) and MI (store) are consecutive.
+    if (!isConsecutive(FirstMI, MI, consecutive_stores))
+      continue;
+
+    unsigned MIReg = getLdStRegOp(MI).getReg();
+
+    // The register operand of MI should have a single definition.
+    if (!MRI->hasOneDef(MIReg)) {
+      consecutive_stores.clear();
+      continue;
+    }
+
+    MachineInstr &MIDefInst = *MRI->def_instr_begin(MIReg);
+
+    // MIDefInst should have a unique use instruction, which should be MI.
+    if (!MRI->hasOneUse(MIReg))
+      return false;
+
+    // MIDefInst should be a load instruction.
+    if (MIDefInst.getOpcode() != AArch64::LDRWui &&
+        MIDefInst.getOpcode() != AArch64::LDURWi) {
+      consecutive_stores.clear();
+      continue;
+    }
+
+    // MIDefInst (load) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(&MIDefInst).isReg() ||
+        !getLdStOffsetOp(&MIDefInst).isImm())
+      return false;
+
+    // If MIDefInst's offset is out of range, give up widening.
+    if (!checkOffsetRange(&MIDefInst))
+      return false;
+
+    // Check if DefInst (load) and MIDefInst (load) are consecutive.
+    if (isConsecutive(&DefInst, &MIDefInst, consecutive_loads)) {
+      // When consecutive loads/stores are found, the following must hold:
+      //   dst reg of first load  == src reg of first store
+      //   dst reg of second load == src reg of second store
+      if (getLdStRegOp(consecutive_loads[0]).getReg() !=
+              getLdStRegOp(consecutive_stores[0]).getReg() ||
+          getLdStRegOp(consecutive_loads[1]).getReg() !=
+              getLdStRegOp(consecutive_stores[1]).getReg())
+        return false;
+
+      for (auto &I : consecutive_stores) {
+        stores.insert(I);
+      }
+
+      for (auto &I : consecutive_loads) {
+        loads.insert(I);
+      }
+
+      // Build wide load/store instructions from the consecutive loads/stores.
+      buildWideLdStInst(consecutive_loads[0], consecutive_loads[1]);
+      buildWideLdStInst(consecutive_stores[0], consecutive_stores[1]);
+      return true;
+    }
+
+    consecutive_stores.clear();
+  }
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+  // Find two consecutive 32-bit loads and two consecutive 32-bit stores that
+  // write the values of those loads, and transform them into a 64-bit load
+  // and a 64-bit store.
+  //
+  // When the wide load/store is unscaled (ldur/stur), the offset does not
+  // need to change.
+  // e.g.,
+  //   %vreg2 = LDURWi %vreg0, -76;
+  //   %vreg3 = LDURWi %vreg0, -72;
+  //   STURWi %vreg2, %vreg1, -44;
+  //   STURWi %vreg3, %vreg1, -40;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, -76;
+  //   STURXi %vreg2, %vreg1, -44;
+  //
+  // When the wide load/store is scaled (ldr/str), the offset should be half
+  // of the original value.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 4;
+  //   %vreg3 = LDRWui %vreg0, 5;
+  //   STRWui %vreg2, %vreg1, 2;
+  //   STRWui %vreg3, %vreg1, 3;
+  //   ; becomes
+  //   %vreg2 = LDRXui %vreg0, 2;
+  //   STRXui %vreg2, %vreg1, 1;
+  //
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if -256 <= unscaled offset value < 256,
+  // where unscaled offset value = scaled offset value * memory scale size.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 13;
+  //   %vreg3 = LDRWui %vreg0, 14;
+  //   STRWui %vreg2, %vreg1, 37;
+  //   STRWui %vreg3, %vreg1, 38;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, 52;    ; 52 = 13 * 4
+  //   STURXi %vreg2, %vreg1, 148;    ; 148 = 37 * 4
+  std::set<MachineInstr *> loads;
+  std::set<MachineInstr *> stores;
+  for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); MBBI != E;
+       ++MBBI) {
+    MachineInstr *MI = MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case AArch64::STURWi:
+    case AArch64::STRWui:
+      if (stores.find(MI) != stores.end())
+        continue;
+
+      if (tryToWidenLdStInst(MBBI, stores, loads)) {
+        Modified = true;
+      }
+      break;
+    }
+  }
+
+  // Erase the narrow loads/stores that have been replaced by wide ones.
+  for (auto &I : loads) {
+    I->eraseFromParent();
+  }
+
+  for (auto &I : stores) {
+    I->eraseFromParent();
+  }
+
+  return Modified;
+}
+
+bool AArch64SSALoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  TRI = MF.getSubtarget().getRegisterInfo();
+  MRI = &MF.getRegInfo();
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeBlock(MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64SSALoadStoreOptPass() {
+  return new AArch64SSALoadStoreOpt();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -107,6 +107,12 @@
                            cl::desc("Enable the loop data prefetch pass"),
                            cl::init(true));
 
+static cl::opt<bool>
+    EnableSSALoadStoreOpt("aarch64-ssa-load-store-opt",
+                          cl::desc("Enable the load/store widening"
+                                   " optimization pass in SSA form"),
+                          cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -346,6 +352,8 @@
 #endif
 
 bool AArch64PassConfig::addILPOpts() {
+  if (EnableSSALoadStoreOpt)
+    addPass(createAArch64SSALoadStoreOptPass());
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
   if (EnableCCMP)
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -45,6 +45,7 @@
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
+  AArch64SSALoadStoreOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1122,9 +1122,9 @@
   %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
   %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
   %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
-  %gep1 = getelementptr i32, i32* %phi1, i64 -1
+  %gep1 = getelementptr i32, i32* %phi1, i64 -3
   %load1 = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %phi2, i64 -1
+  %gep2 = getelementptr i32, i32* %phi2, i64 -3
   store i32 %load1, i32* %gep2
   %load2 = load i32, i32* %phi1
   store i32 %load2, i32* %phi2
Index: test/CodeGen/AArch64/ssa-ldst-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/ssa-ldst-opt.ll
@@ -0,0 +1,50 @@
+; RUN: llc -march=aarch64 -verify-machineinstrs -asm-verbose=false -o - %s | FileCheck %s
+
+; CHECK-LABEL: test_offset_no_changed:
+; CHECK: ldur x8, [x0, #-76]
+; CHECK-NEXT: stur x8, [x1, #-44]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved:
+; CHECK: ldr x8, [x0, #16]
+; CHECK-NEXT: str x8, [x1, #8]
+; CHECK-NEXT: ret
+define void @test_offset_halved(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled:
+; CHECK: ldur x8, [x0, #52]
+; CHECK-NEXT: stur x8, [x1, #148]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled(i32* %p1, i32* %p2) {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+