Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -46,6 +46,8 @@
 FunctionPass *createAArch64CollectLOHPass();
 
+FunctionPass *createAArch64SSALoadStoreOptPass();
+
 void initializeAArch64ExpandPseudoPass(PassRegistry&);
 
 } // end namespace llvm
Index: lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64SSALoadStoreOptimizer.cpp
@@ -0,0 +1,535 @@
+//===- AArch64SSALoadStoreOptimizer.cpp - AArch64 SSA load/store opt -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that performs load / store related peephole
+// optimizations in SSA form. This pass should be run before register
+// allocation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include <set>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ssa-ldst-opt"
+
+#define MAX_UNSCALED_OFFSET 255
+#define MIN_UNSCALED_OFFSET -256
+
+namespace {
+class AArch64SSALoadStoreOpt : public MachineFunctionPass {
+
+  const AArch64InstrInfo *TII;
+  MachineRegisterInfo *MRI;
+  const AArch64Subtarget *Subtarget;
+  AliasAnalysis *AA;
+
+public:
+  static char ID;
+  AArch64SSALoadStoreOpt() : MachineFunctionPass(ID) {}
+
+  bool tryToWidenLdStInst(MachineInstr *MI,
+                          std::set<MachineInstr *> &StoresWillBeDeleted,
+                          std::set<MachineInstr *> &LoadsWillBeDeleted);
+  bool instrAliased(MachineInstr *MIa, MachineInstr *MIb);
+  bool checkOffsetRange(MachineInstr *MI);
+  bool isConsecutive(MachineInstr *FirstMI, MachineInstr *SecondMI,
+                     SmallVector<MachineInstr *, 2> &MIs);
+  bool hasAliasBetween(MachineInstr *MIa, MachineInstr *MIb);
+  void buildWideLdStInst(MachineInstr *MI, MachineInstr *MergeMI);
+  bool optimizeBlock(MachineBasicBlock &MBB);
+  bool runOnMachineFunction(MachineFunction &MF) override;
+
+  const char *getPassName() const override {
+    return "AArch64 SSA load / store optimization pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<AAResultsWrapperPass>();
+    AU.addPreserved<AAResultsWrapperPass>();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+};
+char AArch64SSALoadStoreOpt::ID = 0;
+} // namespace
+
+// Scaling factor for unscaled load or store.
+static int getMemScale(MachineInstr *MI) {
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    return 4;
+  }
+}
+
+static const MachineOperand &getLdStRegOp(const MachineInstr *MI) {
+  return MI->getOperand(0);
+}
+
+static const MachineOperand &getLdStBaseOp(const MachineInstr *MI) {
+  return MI->getOperand(1);
+}
+
+static const MachineOperand &getLdStOffsetOp(const MachineInstr *MI) {
+  return MI->getOperand(2);
+}
+
+// Check if the memory operands of two instructions alias each other.
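+// Conservatively returns true (may alias) when either instruction lacks a
+// single memory operand or an underlying IR value, since in that case alias
+// analysis has nothing reliable to query.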
+bool AArch64SSALoadStoreOpt::instrAliased(MachineInstr *MIa,
+                                          MachineInstr *MIb) {
+  MachineMemOperand *MMOA =
+      MIa->hasOneMemOperand() ? *MIa->memoperands_begin() : nullptr;
+  MachineMemOperand *MMOB =
+      MIb->hasOneMemOperand() ? *MIb->memoperands_begin() : nullptr;
+
+  if (!MMOA || !MMOB)
+    return true;
+
+  if (!MMOA->getValue() || !MMOB->getValue())
+    return true;
+
+  MemoryLocation LocA(MMOA->getValue(), MMOA->getSize(), MMOA->getAAInfo());
+  MemoryLocation LocB(MMOB->getValue(), MMOB->getSize(), MMOB->getAAInfo());
+
+  return AA->alias(LocA, LocB);
+}
+
+// Check if any instruction between MIa and MIb may alias with whichever of
+// the two appears later in the block.
+bool AArch64SSALoadStoreOpt::hasAliasBetween(MachineInstr *MIa,
+                                             MachineInstr *MIb) {
+  MachineBasicBlock *MBB = MIa->getParent();
+
+  MachineInstr *EndMI = MIb;
+  unsigned Status = 0;
+  for (auto &MBBI : *MBB) {
+    MachineInstr *MI = &MBBI;
+    switch (Status) {
+    default:
+      llvm_unreachable("Unexpected status");
+    case 0: // Neither MIa nor MIb has been seen yet.
+      if (MI == MIa) {
+        Status++;
+      } else if (MI == MIb) {
+        EndMI = MIa;
+        Status++;
+      }
+      break;
+    case 1: // One of MIa and MIb has been seen.
+      if (MI == EndMI) {
+        Status++;
+      } else if (MI->mayStore()) {
+        // Check if MI is aliased with EndMI.
+        if (instrAliased(MI, EndMI))
+          return true;
+      }
+      break;
+    case 2: // Both MIa and MIb have been seen.
+      break;
+    }
+  }
+
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::checkOffsetRange(MachineInstr *MI) {
+  bool IsScaled = !TII->isUnscaledLdSt(MI);
+  int Offset = getLdStOffsetOp(MI).getImm();
+
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if
+  //   MIN_UNSCALED_OFFSET <= unscaled offset value <= MAX_UNSCALED_OFFSET
+  // is satisfied, where
+  //   unscaled offset value = scaled offset value * memory scale size.
+  if (IsScaled) {
+    int UnscaledOffset = Offset * getMemScale(MI);
+    if (Offset % 2)
+      return UnscaledOffset <= MAX_UNSCALED_OFFSET &&
+             UnscaledOffset >= MIN_UNSCALED_OFFSET;
+  }
+
+  return true;
+}
+
+// Check if two loads/stores have consecutive memory accesses. If they do,
+// MIs holds the two instructions in memory-address order.
+bool AArch64SSALoadStoreOpt::isConsecutive(
+    MachineInstr *FirstMI, MachineInstr *SecondMI,
+    SmallVector<MachineInstr *, 2> &MIs) {
+  assert((FirstMI->mayLoad() ? SecondMI->mayLoad()
+                             : (FirstMI->mayStore() && SecondMI->mayStore())) &&
+         "Unexpected input instructions");
+  bool FirstMIIsUnscaled = TII->isUnscaledLdSt(FirstMI);
+  unsigned FirstMIBaseReg = getLdStBaseOp(FirstMI).getReg();
+  int FirstMIOffset = getLdStOffsetOp(FirstMI).getImm();
+  int FirstMIOffsetStride = FirstMIIsUnscaled ? getMemScale(FirstMI) : 1;
+
+  bool SecondMIIsUnscaled = TII->isUnscaledLdSt(SecondMI);
+  unsigned SecondMIBaseReg = getLdStBaseOp(SecondMI).getReg();
+  int SecondMIOffset = getLdStOffsetOp(SecondMI).getImm();
+  if (FirstMIIsUnscaled != SecondMIIsUnscaled) {
+    // We're trying to pack instructions that differ in how they are scaled.
+    // If FirstMI is scaled then scale the offset of SecondMI accordingly.
+    // Otherwise, do the opposite (i.e., make SecondMI's offset unscaled).
+    int MemSize = getMemScale(SecondMI);
+    if (SecondMIIsUnscaled) {
+      // If the unscaled offset isn't a multiple of the MemSize, we can't
+      // pack the operations together: bail and keep looking.
+      if (SecondMIOffset % MemSize)
+        return false;
+
+      SecondMIOffset /= MemSize;
+    } else {
+      SecondMIOffset *= MemSize;
+    }
+  }
+
+  if (FirstMIBaseReg == SecondMIBaseReg) {
+    if (FirstMIOffset + FirstMIOffsetStride == SecondMIOffset) {
+      MIs.push_back(FirstMI);
+      MIs.push_back(SecondMI);
+      return true;
+    } else if (FirstMIOffset == SecondMIOffset + FirstMIOffsetStride) {
+      MIs.push_back(SecondMI);
+      MIs.push_back(FirstMI);
+      return true;
+    }
+  }
+
+  return false;
+}
+
+// Build a wide load/store instruction. MI has the lower offset; MergeMI has
+// the higher offset.
+void AArch64SSALoadStoreOpt::buildWideLdStInst(MachineInstr *MI,
+                                               MachineInstr *MergeMI) {
+  assert((MI->mayLoad() ? MergeMI->mayLoad()
+                        : (MI->mayStore() && MergeMI->mayStore())) &&
+         "Unexpected input instructions");
+  MachineBasicBlock *MBB = MI->getParent();
+  MachineOperand &RegOp = MI->getOperand(0);
+  unsigned Reg = RegOp.getReg();
+  const MachineOperand &BaseRegOp = MI->getOperand(1);
+  bool IsScaled = !TII->isUnscaledLdSt(MI->getOpcode());
+  int OffsetImm = MI->getOperand(2).getImm();
+
+  bool ShouldBeUnscaled = false;
+  // When the original load/store is scaled (ldr/str), the offset must be
+  // unscaled if it is an odd value. Otherwise, the offset should be halved.
+  if (IsScaled) {
+    if (OffsetImm % 2) {
+      OffsetImm *= getMemScale(MI);
+      ShouldBeUnscaled = true;
+    } else {
+      OffsetImm /= 2;
+    }
+  }
+
+  // Select the opcode for the wide load/store, taking into account whether it
+  // should be scaled or unscaled.
+  unsigned NewOpc = 0;
+  switch (MI->getOpcode()) {
+  default:
+    llvm_unreachable("Unexpected MI's opcode.");
+    break;
+  case AArch64::LDRWui:
+    NewOpc = ShouldBeUnscaled ? AArch64::LDURXi : AArch64::LDRXui;
+    break;
+  case AArch64::LDURWi:
+    NewOpc = AArch64::LDURXi;
+    break;
+  case AArch64::STRWui:
+    NewOpc = ShouldBeUnscaled ? AArch64::STURXi : AArch64::STRXui;
+    break;
+  case AArch64::STURWi:
+    NewOpc = AArch64::STURXi;
+    break;
+  }
+
+  // Set the register operand (dst of load or src of store) to wide type.
+  MRI->setRegClass(Reg, &AArch64::GPR64RegClass);
+
+  // Generate the wide memory operand.
+  MachineMemOperand *MMO = *MI->memoperands_begin();
+  MachineMemOperand *NewMMO = new MachineMemOperand(
+      MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize() << 1,
+      MMO->getBaseAlignment() << 1, MMO->getAAInfo(), MMO->getRanges());
+
+  // Set the location for inserting the new wide instruction.
+  MachineBasicBlock::iterator InsertionPoint = MI;
+
+  // Build the wide load/store instruction.
+  MachineInstr *NewMI =
+      BuildMI(*MBB, InsertionPoint, MI->getDebugLoc(), TII->get(NewOpc))
+          .addOperand(RegOp)
+          .addOperand(BaseRegOp)
+          .addImm(OffsetImm)
+          .setMemRefs(&NewMMO, &NewMMO);
+  NewMI->addMemOperand(*MBB->getParent(), NewMMO);
+  (void)NewMI;
+
+  DEBUG(dbgs() << "Creating the wide load/store instruction. Replacing "
+                  "instructions:\n    ");
+  DEBUG(MI->print(dbgs()));
+  DEBUG(dbgs() << "    ");
+  DEBUG(MergeMI->print(dbgs()));
+  DEBUG(dbgs() << "  with instruction:\n    ");
+  DEBUG(NewMI->print(dbgs()));
+}
+
+bool AArch64SSALoadStoreOpt::tryToWidenLdStInst(
+    MachineInstr *MI, std::set<MachineInstr *> &StoresWillBeDeleted,
+    std::set<MachineInstr *> &LoadsWillBeDeleted) {
+  MachineInstr *FirstMI = MI;
+
+  // FirstMI (a store) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(FirstMI).isReg() || !getLdStOffsetOp(FirstMI).isImm())
+    return false;
+
+  unsigned Reg = getLdStRegOp(FirstMI).getReg();
+
+  // Check if there exists a def instruction for the register operand of
+  // FirstMI.
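+  // (In machine SSA form a virtual register has at most one definition, so
+  // hasOneDef here mainly rejects registers with no defining instruction.)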
+  if (!MRI->hasOneDef(Reg))
+    return false;
+
+  MachineInstr &DefInst = *MRI->def_instr_begin(Reg);
+
+  // DefInst should be a load instruction.
+  if (DefInst.getOpcode() != AArch64::LDRWui &&
+      DefInst.getOpcode() != AArch64::LDURWi)
+    return false;
+
+  // DefInst (a load) should have a register base operand and an immediate
+  // offset operand.
+  if (!getLdStBaseOp(&DefInst).isReg() || !getLdStOffsetOp(&DefInst).isImm())
+    return false;
+
+  // DefInst should have a unique use instruction, which should be FirstMI.
+  if (!MRI->hasOneUse(Reg))
+    return false;
+
+  SmallVector<MachineInstr *, 2> ConsecutiveStores;
+  SmallVector<MachineInstr *, 2> ConsecutiveLoads;
+
+  // Find a store consecutive with FirstMI.
+  MachineBasicBlock::iterator E = MI->getParent()->end();
+  MachineBasicBlock::iterator MBBI = MI;
+  ++MBBI;
+  for (; MBBI != E; ++MBBI) {
+    MachineInstr *MI = MBBI;
+
+    if (MI->getOpcode() != AArch64::STURWi &&
+        MI->getOpcode() != AArch64::STRWui)
+      continue;
+
+    // MI (a store) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(MI).isReg() || !getLdStOffsetOp(MI).isImm())
+      return false;
+
+    // Check if FirstMI (store) and MI (store) are consecutive.
+    if (!isConsecutive(FirstMI, MI, ConsecutiveStores))
+      continue;
+
+    // If the first store's offset is out of range, give up widening.
+    if (!checkOffsetRange(ConsecutiveStores[0]))
+      return false;
+
+    unsigned MIReg = getLdStRegOp(MI).getReg();
+
+    // Check if there exists a def instruction for the register operand of MI.
+    if (!MRI->hasOneDef(MIReg)) {
+      ConsecutiveStores.clear();
+      continue;
+    }
+
+    MachineInstr &MIDefInst = *MRI->def_instr_begin(MIReg);
+
+    // MIDefInst should have a unique use instruction, which should be MI.
+    if (!MRI->hasOneUse(MIReg))
+      return false;
+
+    // MIDefInst should be a load instruction.
+    if (MIDefInst.getOpcode() != AArch64::LDRWui &&
+        MIDefInst.getOpcode() != AArch64::LDURWi) {
+      ConsecutiveStores.clear();
+      continue;
+    }
+
+    // MIDefInst (a load) should have a register base operand and an immediate
+    // offset operand.
+    if (!getLdStBaseOp(&MIDefInst).isReg() ||
+        !getLdStOffsetOp(&MIDefInst).isImm())
+      return false;
+
+    // Check if DefInst (load) and MIDefInst (load) are consecutive.
+    if (isConsecutive(&DefInst, &MIDefInst, ConsecutiveLoads)) {
+      // If the first load's offset is out of range, give up widening.
+      if (!checkOffsetRange(ConsecutiveLoads[0]))
+        return false;
+
+      // If consecutive loads/stores are found, the following must hold:
+      //   dst reg of first load  == src reg of first store
+      //   dst reg of second load == src reg of second store
+      if (getLdStRegOp(ConsecutiveLoads[0]).getReg() !=
+              getLdStRegOp(ConsecutiveStores[0]).getReg() ||
+          getLdStRegOp(ConsecutiveLoads[1]).getReg() !=
+              getLdStRegOp(ConsecutiveStores[1]).getReg())
+        return false;
+
+      // The first store instruction should not alias the second load
+      // instruction.
+      if (instrAliased(ConsecutiveStores[0], ConsecutiveLoads[1])) {
+        DEBUG(dbgs() << "The first store instruction:\n    ");
+        DEBUG(ConsecutiveStores[0]->print(dbgs()));
+        DEBUG(dbgs()
+              << "might be aliased with the second load instruction:\n    ");
+        DEBUG(ConsecutiveLoads[1]->print(dbgs()));
+        return false;
+      }
+
+      // The second store instruction should not alias the first load
+      // instruction.
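+      // (The pass only merges when alias analysis can prove that the load
+      // pair and the store pair access disjoint memory; otherwise collapsing
+      // each pair into one access could change the values loaded or stored.)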
+      if (instrAliased(ConsecutiveStores[1], ConsecutiveLoads[0])) {
+        DEBUG(dbgs() << "The second store instruction:\n    ");
+        DEBUG(ConsecutiveStores[1]->print(dbgs()));
+        DEBUG(dbgs()
+              << "might be aliased with the first load instruction:\n    ");
+        DEBUG(ConsecutiveLoads[0]->print(dbgs()));
+        return false;
+      }
+
+      // Check if any instruction between the two consecutive loads may alias
+      // the loads. If one is found, give up widening.
+      if (hasAliasBetween(ConsecutiveLoads[0], ConsecutiveLoads[1]))
+        return false;
+
+      // Check if any instruction between the two consecutive stores may alias
+      // the stores. If one is found, give up widening.
+      if (hasAliasBetween(ConsecutiveStores[0], ConsecutiveStores[1]))
+        return false;
+
+      for (auto &I : ConsecutiveStores)
+        StoresWillBeDeleted.insert(I);
+
+      for (auto &I : ConsecutiveLoads)
+        LoadsWillBeDeleted.insert(I);
+
+      // Build wide load/store instructions from the consecutive loads/stores.
+      buildWideLdStInst(ConsecutiveLoads[0], ConsecutiveLoads[1]);
+      buildWideLdStInst(ConsecutiveStores[0], ConsecutiveStores[1]);
+      return true;
+    }
+
+    ConsecutiveStores.clear();
+  }
+  return false;
+}
+
+bool AArch64SSALoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB) {
+  bool Modified = false;
+  // Find two consecutive 32-bit loads and two consecutive 32-bit stores that
+  // store the values of those consecutive 32-bit loads, and transform the
+  // loads/stores into a 64-bit load/store.
+  //
+  // When the wide load/store is unscaled (ldur/stur), the offset does not
+  // need to change.
+  // e.g.,
+  //   %vreg2 = LDURWi %vreg0, -76;
+  //   %vreg3 = LDURWi %vreg0, -72;
+  //   STURWi %vreg2, %vreg1, -44;
+  //   STURWi %vreg3, %vreg1, -40;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, -76;
+  //   STURXi %vreg2, %vreg1, -44;
+  //
+  // When the wide load/store is scaled (ldr/str), the offset should be half
+  // of the original value.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 4;
+  //   %vreg3 = LDRWui %vreg0, 5;
+  //   STRWui %vreg2, %vreg1, 2;
+  //   STRWui %vreg3, %vreg1, 3;
+  //   ; becomes
+  //   %vreg2 = LDRXui %vreg0, 2;
+  //   STRXui %vreg2, %vreg1, 1;
+  //
+  // When the original load/store is scaled (ldr/str) and has an odd offset
+  // value, it can be widened only if
+  //   MIN_UNSCALED_OFFSET <= unscaled offset value <= MAX_UNSCALED_OFFSET
+  // is satisfied, where
+  //   unscaled offset value = scaled offset value * memory scale size.
+  // e.g.,
+  //   %vreg2 = LDRWui %vreg0, 13;
+  //   %vreg3 = LDRWui %vreg0, 14;
+  //   STRWui %vreg2, %vreg1, 37;
+  //   STRWui %vreg3, %vreg1, 38;
+  //   ; becomes
+  //   %vreg2 = LDURXi %vreg0, 52;   ; 52 = 13 * 4
+  //   STURXi %vreg2, %vreg1, 148;   ; 148 = 37 * 4
+  std::set<MachineInstr *> LoadsWillBeDeleted;
+  std::set<MachineInstr *> StoresWillBeDeleted;
+  for (auto &MBBI : MBB) {
+    MachineInstr *MI = &MBBI;
+    switch (MI->getOpcode()) {
+    default:
+      break;
+    case AArch64::STURWi:
+    case AArch64::STRWui:
+      if (StoresWillBeDeleted.find(MI) != StoresWillBeDeleted.end())
+        continue;
+
+      if (tryToWidenLdStInst(MI, StoresWillBeDeleted, LoadsWillBeDeleted))
+        Modified = true;
+
+      break;
+    }
+  }
+
+  for (auto &I : LoadsWillBeDeleted)
+    I->eraseFromParent();
+
+  for (auto &I : StoresWillBeDeleted)
+    I->eraseFromParent();
+
+  return Modified;
+}
+
+bool AArch64SSALoadStoreOpt::runOnMachineFunction(MachineFunction &MF) {
+  Subtarget = &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+  TII = static_cast<const AArch64InstrInfo *>(Subtarget->getInstrInfo());
+  MRI = &MF.getRegInfo();
+  AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
+
+  bool Changed = false;
+  for (MachineBasicBlock &MBB : MF)
+    Changed |= optimizeBlock(MBB);
+  return Changed;
+}
+
+FunctionPass *llvm::createAArch64SSALoadStoreOptPass() {
+  return new AArch64SSALoadStoreOpt();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -107,6 +107,12 @@
                             cl::desc("Enable the loop data prefetch pass"),
                             cl::init(true));
 
+static cl::opt<bool>
+    EnableSSALoadStoreOpt("aarch64-ssa-load-store-opt",
+                          cl::desc("Enable the load/store pair"
+                                   " optimization pass in SSA form"),
+                          cl::init(false), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -346,6 +352,8 @@
 #endif
 
 bool AArch64PassConfig::addILPOpts() {
+  if (EnableSSALoadStoreOpt)
+    addPass(createAArch64SSALoadStoreOptPass());
   if (EnableCondOpt)
     addPass(createAArch64ConditionOptimizerPass());
   if (EnableCCMP)
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -45,6 +45,7 @@
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64RedundantCopyElimination.cpp
+  AArch64SSALoadStoreOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
Index: test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- test/CodeGen/AArch64/ldst-opt.ll
+++ test/CodeGen/AArch64/ldst-opt.ll
@@ -1122,9 +1122,9 @@
   %phi1 = phi i32* [ %gep4, %for.body ], [ %b, %0 ]
   %phi2 = phi i32* [ %gep3, %for.body ], [ %a, %0 ]
   %i = phi i64 [ %dec.i, %for.body], [ %count, %0 ]
-  %gep1 = getelementptr i32, i32* %phi1, i64 -1
+  %gep1 = getelementptr i32, i32* %phi1, i64 -3
   %load1 = load i32, i32* %gep1
-  %gep2 = getelementptr i32, i32* %phi2, i64 -1
+  %gep2 = getelementptr i32, i32* %phi2, i64 -3
   store i32 %load1, i32* %gep2
   %load2 = load i32, i32* %phi1
   store i32 %load2, i32* %phi2
Index: test/CodeGen/AArch64/ssa-ldst-opt.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/ssa-ldst-opt.ll
@@ -0,0 +1,182 @@
+; RUN: llc -march=aarch64 -verify-machineinstrs -asm-verbose=false -aarch64-ssa-load-store-opt -o - %s | FileCheck %s
+
+; This test is for the 'AArch64 SSA load/store optimization' pass: find two
+; consecutive 32-bit loads and two consecutive 32-bit stores that store the
+; loaded values, and transform them into a 64-bit load/store.
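+
+; For example, the first test below expects roughly this transformation
+; (a sketch; the actual register numbers are chosen by the compiler):
+;     ldur w8, [x0, #-76]          ldur x8, [x0, #-76]
+;     stur w8, [x1, #-44]    =>    stur x8, [x1, #-44]
+;     ldur w8, [x0, #-72]
+;     stur w8, [x1, #-40]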
+
+; CHECK-LABEL: test_offset_no_changed:
+; CHECK: ldur x8, [x0, #-76]
+; CHECK-NEXT: stur x8, [x1, #-44]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved:
+; CHECK: ldr x8, [x0, #16]
+; CHECK-NEXT: str x8, [x1, #8]
+; CHECK-NEXT: ret
+define void @test_offset_halved(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled:
+; CHECK: ldur x8, [x0, #52]
+; CHECK-NEXT: stur x8, [x1, #148]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !8
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !9
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; CHECK-LABEL: test_scaled_offset_range1:
+; CHECK: ldur x8, [x0, #252]
+; CHECK-NEXT: stur x8, [x1, #252]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range1(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 63
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 63
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 64
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 64
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following test case, 'aarch64-ssa-load-store-opt' is applied.
+; Allowed offset range: 0 <= offset <= 16380 if (offset % 2 == 0).
+
+; CHECK-LABEL: test_scaled_offset_range2:
+; CHECK: ldr x8, [x0]
+; CHECK-NEXT: str x8, [x1, #16376]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range2(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 0
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 4094
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 1
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 4095
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following test case, 'aarch64-ssa-load-store-opt' is not applied
+; since the first load offset is out of range.
+; Allowed offset range: -256 <= offset < 256 if (offset % 2 == 1).
+
+; CHECK-LABEL: test_scaled_offset_range3:
+; CHECK: ldr w8, [x0, #260]
+; CHECK-NEXT: str w8, [x1, #252]
+; CHECK-NEXT: ldr w8, [x0, #264]
+; CHECK-NEXT: str w8, [x1, #256]
+; CHECK-NEXT: ret
+define void @test_scaled_offset_range3(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 65
+  %1 = load i32, i32* %ld.ptr1, align 4, !tbaa !9
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 63
+  store i32 %1, i32* %st.ptr1, align 4, !tbaa !5
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 66
+  %2 = load i32, i32* %ld.ptr2, align 4, !tbaa !8
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 64
+  store i32 %2, i32* %st.ptr2, align 4, !tbaa !6
+  ret void
+}
+
+; In the following three test cases, 'aarch64-ssa-load-store-opt' is not
+; applied since disjoint memory accesses cannot be guaranteed without alias
+; information.
+
+; CHECK-LABEL: test_offset_no_changed_no_tbaa_info:
+; CHECK: ldur w8, [x0, #-76]
+; CHECK-NEXT: stur w8, [x1, #-44]
+; CHECK-NEXT: ldur w8, [x0, #-72]
+; CHECK-NEXT: stur w8, [x1, #-40]
+; CHECK-NEXT: ret
+define void @test_offset_no_changed_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 -19
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 -11
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 -18
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 -10
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_halved_no_tbaa_info:
+; CHECK: ldr w8, [x0, #16]
+; CHECK-NEXT: str w8, [x1, #8]
+; CHECK-NEXT: ldr w8, [x0, #20]
+; CHECK-NEXT: str w8, [x1, #12]
+; CHECK-NEXT: ret
+define void @test_offset_halved_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 4
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 2
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 5
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 3
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+; CHECK-LABEL: test_offset_unscaled_no_tbaa_info:
+; CHECK: ldr w8, [x0, #52]
+; CHECK-NEXT: str w8, [x1, #148]
+; CHECK-NEXT: ldr w8, [x0, #56]
+; CHECK-NEXT: str w8, [x1, #152]
+; CHECK-NEXT: ret
+define void @test_offset_unscaled_no_tbaa_info(i32* %p1, i32* %p2) #0 {
+  %ld.ptr1 = getelementptr i32, i32* %p1, i64 13
+  %1 = load i32, i32* %ld.ptr1, align 4
+  %st.ptr1 = getelementptr i32, i32* %p2, i64 37
+  store i32 %1, i32* %st.ptr1, align 4
+  %ld.ptr2 = getelementptr i32, i32* %p1, i64 14
+  %2 = load i32, i32* %ld.ptr2, align 4
+  %st.ptr2 = getelementptr i32, i32* %p2, i64 38
+  store i32 %2, i32* %st.ptr2, align 4
+  ret void
+}
+
+attributes #0 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "stack-protector-buffer-size"="8" "unsafe-fp-math"="true" "use-soft-float"="false" }
+
+!llvm.ident = !{!0}
+
+!0 = !{!"clang version 3.9.0 "}
+!1 = !{!"int", !2, i64 0}
+!2 = !{!"omnipotent char", !3, i64 0}
+!3 = !{!"Simple C++ TBAA"}
+!4 = !{!"structA", !1, i64 0, !1, i64 4}
+!5 = !{!4, !1, i64 0}
+!6 = !{!4, !1, i64 4}
+!7 = !{!"structB", !1, i64 0, !1, i64 4}
+!8 = !{!7, !1, i64 0}
+!9 = !{!7, !1, i64 4}