Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -35,6 +35,7 @@
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64LoadStoreInterleavePass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
Index: lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
@@ -0,0 +1,330 @@
+//=- AArch64LoadStoreInterleave.cpp - Optimize Load/Store pairs for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reorders load/store pair instructions to achieve better
+// performance. The preferred sequence of operations is as follows:
+//
+//   * [1]: load pair of registers
+//   * [1]: store pair of registers
+//   * [2]: load pair of registers
+//   * [2]: store pair of registers
+//   * ...
+//
+// Example of transformation:
+//
+//   Before:          After:
+//
+//   1.               1.
+//   2.               2.
+//   3.               3.
+//   4.               4.
+//   5.               5.
+//   6.               6.
+//   7.               7.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include 
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-itl"
+
+STATISTIC(NumLdStSequencesUpdated, "Number of load/store pair sequences updated");
+
+namespace {
+class AArch64LoadStoreInterleave : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  AArch64LoadStoreInterleave() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool interleaveMemOp(MachineBasicBlock &MBB);
+  void moveInstruction(MachineInstr *I,
+                       MachineBasicBlock::iterator InsertionPoint);
+  const char *getPassName() const override {
+    return "AArch64 LoadStore Interleave";
+  }
+};
+} // end anonymous namespace
+
+char AArch64LoadStoreInterleave::ID = 0;
+
+FunctionPass *llvm::createAArch64LoadStoreInterleavePass() {
+  return new AArch64LoadStoreInterleave();
+}
+
+// Optimizes every basic block of the function.
+bool AArch64LoadStoreInterleave::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 LoadStore Interleaving **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const AArch64InstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : MF) {
+    Modified |= interleaveMemOp(MBB);
+  }
+
+  return Modified;
+}
+
+// Gets the size in bytes of a single data operand of a load or store (pair)
+// instruction.
+static int getOperandWidth(const MachineInstr *I) {
+  switch (I->getOpcode()) {
+  default:
+    llvm_unreachable("Didn't expect anything except load and store pairs.");
+
+  case AArch64::STPWi:
+  case AArch64::LDPWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return 4;
+
+  case AArch64::STPXi:
+  case AArch64::LDPXi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return 8;
+
+  case AArch64::STPSi:
+  case AArch64::LDPSi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return 4;
+
+  case AArch64::STPDi:
+  case AArch64::LDPDi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return 8;
+
+  case AArch64::STPQi:
+  case AArch64::LDPQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return 16;
+  }
+}
+
+// Extracts the base address register from the instruction.
+static inline unsigned getBase(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 2 : 1;
+  return I->getOperand(OpNum).getReg();
+}
+
+// Extracts the immediate offset from the instruction.
+static inline int64_t getOffset(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 3 : 2;
+  return I->getOperand(OpNum).getImm();
+}
+
+// Checks whether the store instruction I can be moved before the load
+// instruction.
+static bool isSafeToMoveStore(MachineInstr *I, MachineInstr *St) {
+  if (getBase(I) != getBase(St))
+    return false;
+
+  int IOperandWidth;
+
+  switch (I->getOpcode()) {
+  default:
+    return false;
+
+  case AArch64::STPSi:
+  case AArch64::STPWi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+  case AArch64::STPQi:
+    IOperandWidth = getOperandWidth(I);
+    break;
+
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+    IOperandWidth = 4;
+    break;
+
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+    IOperandWidth = 8;
+    break;
+
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    IOperandWidth = 16;
+    break;
+  }
+
+  int StOperandWidth = getOperandWidth(St);
+
+  int64_t StBegin = getOffset(St) * StOperandWidth;
+  int64_t StEnd = StBegin + StOperandWidth;
+
+  int64_t IBegin = getOffset(I) * IOperandWidth;
+  int64_t IEnd = IBegin + IOperandWidth;
+
+  // The accesses are treated as non-conflicting only if neither endpoint of
+  // I's interval lies strictly inside St's interval and the two intervals are
+  // not identical.
+  return (IBegin <= StBegin || IBegin >= StEnd) &&
+         (IEnd <= StBegin || IEnd >= StEnd) &&
+         (IBegin != StBegin || IEnd != StEnd);
+}
+
+// Checks that the instruction I, which lies between the load and the store
+// pair instructions, does not prevent the load from being moved past it.
+static bool isSafeInstruction(MachineInstr *I, MachineInstr *Ld,
+                              MachineInstr *St, const TargetRegisterInfo *TRI) {
+  if (I->isDebugValue())
+    return true;
+
+  if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects())
+    return false;
+
+  if (I->mayStore() && !isSafeToMoveStore(I, St))
+    return false;
+
+  // The load cannot be moved across an instruction that redefines (part of)
+  // its base address register.
+  unsigned Base = getBase(Ld);
+  for (const MachineOperand &MO : I->operands()) {
+    if (!MO.isReg())
+      continue;
+
+    unsigned Reg = MO.getReg();
+    if (MO.isDef() && TRI->regsOverlap(Reg, Base))
+      return false;
+  }
+
+  return true;
+}
+
+// Collects the load and store pair instructions of the basic block.
+static void collectLoadAndStores(MachineBasicBlock &MBB,
+                                 SmallVectorImpl<MachineInstr *> &Lds,
+                                 SmallVectorImpl<MachineInstr *> &Sts) {
+  for (MachineInstr &MI : MBB) {
+    switch (MI.getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      break;
+
+    case AArch64::STPSi:
+    case AArch64::STPDi:
+    case AArch64::STPQi:
+    case AArch64::STPWi:
+    case AArch64::STPXi:
+      // Only record stores that are preceded by at least one interesting load.
+      if (!Lds.empty())
+        Sts.push_back(&MI);
+      break;
+
+    case AArch64::LDPSi:
+    case AArch64::LDPDi:
+    case AArch64::LDPQi:
+    case AArch64::LDPWi:
+    case AArch64::LDPXi:
+      Lds.push_back(&MI);
+      break;
+    }
+  }
+}
+
+// Checks whether the load can safely be moved down to the store, i.e. every
+// instruction between them is safe to move across.
+static bool isSafeToReorder(MachineInstr *Ld, MachineInstr *St,
+                            const TargetRegisterInfo *TRI) {
+  MachineBasicBlock::iterator I = Ld;
+  MachineBasicBlock::iterator E = St;
+  while (++I != E) {
+    if (!isSafeInstruction(&*I, Ld, St, TRI))
+      return false;
+  }
+
+  return true;
+}
+
+// Checks whether it is legal in form and profitable to move the load
+// instruction towards the store instruction: both must use the same data
+// registers, operand width and offset, and must not already be adjacent.
+static bool shouldTryToMove(MachineInstr *Ld, MachineInstr *St) {
+  return getOperandWidth(Ld) == getOperandWidth(St) &&
+         Ld->getOperand(0).getReg() == St->getOperand(0).getReg() &&
+         Ld->getOperand(1).getReg() == St->getOperand(1).getReg() &&
+         getOffset(Ld) == getOffset(St) &&
+         std::distance(MachineBasicBlock::iterator(Ld),
+                       MachineBasicBlock::iterator(St)) > 1;
+}
+
+// Looks for load and store pair instructions in the basic block that can be
+// reordered and performs the reordering.
+bool AArch64LoadStoreInterleave::interleaveMemOp(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 8> Lds;
+  SmallVector<MachineInstr *, 8> Sts;
+
+  collectLoadAndStores(MBB, Lds, Sts);
+
+  bool Changed = false;
+
+  // Process the loads from last to first and try to place each one right in
+  // front of a matching store pair.
+  for (int i = Lds.size() - 1; i >= 0; --i) {
+    for (int j = 0, n2 = Sts.size(); j < n2; ++j) {
+      MachineInstr *Ld = Lds[i];
+      MachineInstr *St = Sts[j];
+
+      if (!shouldTryToMove(Ld, St))
+        continue;
+
+      if (!isSafeToReorder(Ld, St, TRI))
+        continue;
+
+      moveInstruction(Ld, St);
+
+      ++NumLdStSequencesUpdated;
+      Changed = true;
+
+      break;
+    }
+  }
+
+  return Changed;
+}
+
+// Moves a load or store pair instruction to just before the insertion point.
+void AArch64LoadStoreInterleave::moveInstruction(
+    MachineInstr *I, MachineBasicBlock::iterator InsertionPoint) {
+  MachineInstr *NewI = BuildMI(*I->getParent(), InsertionPoint,
+                               I->getDebugLoc(), TII->get(I->getOpcode()));
+  for (const MachineOperand &MO : I->operands()) {
+    NewI->addOperand(MO);
+  }
+
+  I->eraseFromParent();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -80,6 +80,12 @@
                 cl::desc("Work around Cortex-A53 erratum 835769"),
                 cl::init(false));
 
+static cl::opt<bool>
+EnableAArch64InterleavedMemOp("aarch64-interleaved-ldstp", cl::Hidden,
+                              cl::desc("Allow AArch64 load/store clustering and "
+                                       "interleaving"),
+                              cl::init(false));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -269,6 +275,10 @@
 }
 
 bool AArch64PassConfig::addPreEmitPass() {
+  // Reorder load/store pair instructions for better performance.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt &&
+      EnableAArch64InterleavedMemOp)
+    addPass(createAArch64LoadStoreInterleavePass());
   if (EnableA53Fix835769)
     addPass(createAArch64A53Fix835769());
   // Relax conditional branch instructions if they're otherwise out of
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -33,6 +33,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64LoadStoreInterleave.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s -mcpu=cyclone | FileCheck %s
 
 %va_list = type {i8*, i8*, i8*, i32, i32}
 
Index: test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- test/CodeGen/AArch64/arm64-virtual_base.ll
+++ test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; RUN: llc < %s -O3 -march arm64 -mcpu=cyclone | FileCheck %s
 ;
 %struct.Counter_Struct = type { i64, i64 }
 
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon -mcpu=cyclone | FileCheck --check-prefix=CHECK-NONEON %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -mcpu=cyclone | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mcpu=cyclone | FileCheck --check-prefix=CHECK-BE %s
 
 %myStruct = type { i64 , i8, i32 }
 
@@ -142,7 +142,7 @@
 ; CHECK-LE: movz x2, #{{0x2a|42}}
 ; CHECK-LE: mov x3, xzr
 ; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
-; CHECK-BE: mov x2, xzr
+; CHECK-BE: mov{{z?}} x2, {{xzr|#0}}
 
 ; CHECK: bl check_i128_regalign
   ret void
Index: test/CodeGen/AArch64/memcpy-f128.ll
===================================================================
--- test/CodeGen/AArch64/memcpy-f128.ll
+++ test/CodeGen/AArch64/memcpy-f128.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s
 
 %structA = type { i128 }
 @stubA = internal unnamed_addr constant %structA zeroinitializer, align 8
Index: test/CodeGen/AArch64/optimal-load-store-pairs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimal-load-store-pairs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mcpu=cortex-a53 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a57 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+
+; Here "optimal" means interleaved loads and stores with no other instructions
+; in between.
+
+; Marked as external to prevent possible optimizations.
+@a = external global [4 x i32]
+@b = external global [4 x i32]
+
+define void @copy-56-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-56-bytes-with-8-byte-registers
+; CHECK: ldp {{q[0-9]+}}
+; CHECK-NOT: {{adrp|add}}
+; CHECK: stp {{q[0-9]+}}
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 56, i32 8, i1 false)
+  ret void
+}
+
+define void @copy-64-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-64-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldp [[v1:q[0-9]+]], [[v2:q[0-9]+]]
+; CHECK: stp [[v1]], [[v2]]
+; CHECK: ldp [[v3:q[0-9]+]], [[v4:q[0-9]+]]
+; CHECK: stp [[v3]], [[v4]]
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 64, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
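
For reference, this is the kind of reordering the pass aims for on the 64-byte
copy in the new test, sketched as AArch64 assembly. It is only an illustration,
assuming the memcpy has been lowered to two q-register load pairs followed by
two store pairs, and that the destination (@a) and source (@b) addresses are
already materialized in x8 and x9; register numbers and offsets are
illustrative, not actual llc output.

    ; Before interleaving (loads grouped ahead of stores):
    ldp   q0, q1, [x9]
    ldp   q2, q3, [x9, #32]
    stp   q0, q1, [x8]
    stp   q2, q3, [x8, #32]
    ret

    ; After interleaving (the order the CHECK lines require):
    ldp   q0, q1, [x9]
    stp   q0, q1, [x8]
    ldp   q2, q3, [x9, #32]
    stp   q2, q3, [x8, #32]
    ret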