Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -35,6 +35,7 @@
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64LoadStoreInterleavePass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -464,8 +464,16 @@
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+  if (Subtarget->isCyclone()) {
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  } else {
+    // It's optimal to use 64-bit registers with load/store pair instructions
+    // for memcpy() inlining, rather than doing the same with regular
+    // load/store instructions operating on 128-bit registers. Allow twice as
+    // many stores as for memmove().
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 8;
+  }

   setStackPointerRegisterToSaveRestore(AArch64::SP);
@@ -6604,17 +6612,24 @@
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
-  const Function *F = MF.getFunction();
-  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
-    return MVT::f128;
+  // In general it's optimal to use 64-bit registers with load/store pair
+  // instructions for memcpy() inlining, rather than doing the same with
+  // regular load/store instructions operating on 128-bit registers. Do not
+  // use 128-bit types.
+
+  if (Subtarget->isCyclone()) {
+    // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+    // instruction to materialize the v2i64 zero and one store (with
+    // restrictive addressing mode). Just do two i64 stores of zero-registers.
+    bool Fast;
+    const Function *F = MF.getFunction();
+    if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::NoImplicitFloat) &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
+      return MVT::f128;
+  }
   return Size >= 8 ? MVT::i64 : MVT::i32;
 }
Index: lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
@@ -0,0 +1,346 @@
+//=- AArch64LoadStoreInterleave.cpp - Optimize Load/Store pairs for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reorders load/store pair instructions to achieve better
+// performance. The preferred sequence of operations is as follows:
+//
+//   * [1]: load pair of 64-bit registers
+//   * [1]: store pair of 64-bit registers
+//   * [2]: load pair of 64-bit registers
+//   * [2]: store pair of 64-bit registers
+//   * ...
+//
+// Example of transformation:
+//
+//   Before:                  After:
+//
+//   1. load pair  [1]        1. load pair  [1]
+//   2. load pair  [2]        2. store pair [1]
+//   3. load pair  [3]        3. load pair  [2]
+//   4. store pair [1]        4. store pair [2]
+//   5. store pair [2]        5. load pair  [3]
+//   6. store pair [3]        6. store pair [3]
+//   7. ...                   7. ...
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-itl"
+
+STATISTIC(NumSequences, "Number of load/store pair sequences updated");
+
+namespace {
+class AArch64LoadStoreInterleave : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  AArch64LoadStoreInterleave() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool interleaveMemOp(MachineBasicBlock &MBB);
+  MachineInstr *moveInstruction(MachineInstr *I,
+                                MachineBasicBlock::iterator InsertionPoint);
+  const char *getPassName() const override {
+    return "AArch64 LoadStore Interleave";
+  }
+};
+} // end anonymous namespace
+
+char AArch64LoadStoreInterleave::ID = 0;
+
+FunctionPass *llvm::createAArch64LoadStoreInterleavePass() {
+  return new AArch64LoadStoreInterleave();
+}
+
+// Optimizes every basic block of the function.
+bool AArch64LoadStoreInterleave::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 LoadStore Interleaving **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const TargetInstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : MF) {
+    Modified |= interleaveMemOp(MBB);
+  }
+
+  return Modified;
+}
+
+// Gets the size in bytes of the data operated on by a load, store, or
+// load/store pair instruction.
+static int getOperandWidth(int Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Didn't expect anything except load and store pairs.");
+
+  case AArch64::STPWi:
+  case AArch64::LDPWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return 4;
+
+  case AArch64::STPXi:
+  case AArch64::LDPXi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return 8;
+
+  case AArch64::STPSi:
+  case AArch64::LDPSi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return 4;
+
+  case AArch64::STPDi:
+  case AArch64::LDPDi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return 8;
+
+  case AArch64::STPQi:
+  case AArch64::LDPQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return 16;
+  }
+}
+
+// Checks whether the instruction can safely be moved outside a sequence of
+// load and store pair instructions.
+static bool isSafeInstruction(unsigned LdBase, unsigned StBase,
+                              MachineInstr *I, const TargetRegisterInfo *TRI,
+                              bool SeenStore) {
+  if (I->isDebugValue()) {
+    return true;
+  }
+
+  if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects()) {
+    return false;
+  }
+
+  if (I->mayStore() || (SeenStore && I->mayLoad())) {
+    return false;
+  }
+
+  for (const MachineOperand &MO : I->operands()) {
+    if (!MO.isReg()) {
+      continue;
+    }
+
+    unsigned Reg = MO.getReg();
+    if (MO.isDef() && TRI->regsOverlap(Reg, LdBase)) {
+      return false;
+    }
+    if (SeenStore && MO.isDef() && TRI->regsOverlap(Reg, StBase)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Collects pointers to load and store instructions from the basic block. The
+// return value indicates whether at least one of the instructions is a load
+// or store pair.
+static bool collectLoadAndStores(MachineBasicBlock &MBB,
+                                 SmallVectorImpl<MachineInstr *> &Lds,
+                                 SmallVectorImpl<MachineInstr *> &Sts) {
+  bool SeenPair = false;
+  for (MachineInstr &MI : MBB) {
+    switch (MI.getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      break;
+
+    case AArch64::STPSi:
+    case AArch64::STPDi:
+    case AArch64::STPQi:
+    case AArch64::STPWi:
+    case AArch64::STPXi:
+      SeenPair = true;
+      // Fall through.
+
+    case AArch64::STRSui:
+    case AArch64::STURSi:
+    case AArch64::STRDui:
+    case AArch64::STURDi:
+    case AArch64::STRQui:
+    case AArch64::STURQi:
+    case AArch64::STRWui:
+    case AArch64::STURWi:
+    case AArch64::STRXui:
+    case AArch64::STURXi:
+      // Loads should go first.
+      if (!Lds.empty()) {
+        Sts.push_back(&MI);
+      }
+      break;
+
+    case AArch64::LDPSi:
+    case AArch64::LDPDi:
+    case AArch64::LDPQi:
+    case AArch64::LDPWi:
+    case AArch64::LDPXi:
+      SeenPair = true;
+      // Fall through.
+
+    case AArch64::LDRSui:
+    case AArch64::LDURSi:
+    case AArch64::LDRDui:
+    case AArch64::LDURDi:
+    case AArch64::LDRQui:
+    case AArch64::LDURQi:
+    case AArch64::LDRWui:
+    case AArch64::LDURWi:
+    case AArch64::LDRXui:
+    case AArch64::LDURXi:
+      Lds.push_back(&MI);
+      break;
+    }
+  }
+
+  return SeenPair;
+}
+
+// Extracts the base address register from the instruction.
+static inline unsigned getBase(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 2 : 1;
+  return I->getOperand(OpNum).getReg();
+}
+
+// Extracts the offset from the instruction.
+static inline int64_t getOffset(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 3 : 2;
+  return I->getOperand(OpNum).getImm();
+}
+
+// Checks whether a set of load and store instructions can be safely reordered.
+static bool isSafeToReorder(MachineBasicBlock &MBB,
+                            const SmallVectorImpl<MachineInstr *> &Lds,
+                            const SmallVectorImpl<MachineInstr *> &Sts,
+                            const TargetRegisterInfo *TRI) {
+  if (Sts.empty() || Sts.size() != Lds.size()) {
+    return false;
+  }
+
+  unsigned N = Sts.size();
+
+  // Check that each pair of instructions operates on data of the same width.
+  for (unsigned i = 0; i < N; ++i) {
+    const int LoadWidth = getOperandWidth(Lds[i]->getOpcode());
+    const int StoreWidth = getOperandWidth(Sts[i]->getOpcode());
+    if (LoadWidth != StoreWidth) {
+      return false;
+    }
+  }
+
+  const unsigned LdBase = getBase(Lds[0]);
+  const unsigned StBase = getBase(Sts[0]);
+
+  // Check that all load and store instructions use the same base register and
+  // that each load/store pair has the same offset.
+  for (unsigned i = 0; i < N; ++i) {
+    if (getBase(Lds[i]) != LdBase || getBase(Sts[i]) != StBase) {
+      return false;
+    }
+
+    if (getOffset(Sts[i]) != getOffset(Lds[i])) {
+      return false;
+    }
+  }
+
+  bool SeenStore = false;
+  for (MachineBasicBlock::iterator I = Lds[0], E = Sts[N - 1]; I != E; ++I) {
+    if (std::find(Sts.begin(), Sts.end(), (MachineInstr *)I) != Sts.end()) {
+      SeenStore = true;
+      continue;
+    }
+
+    if (std::find(Lds.begin(), Lds.end(), (MachineInstr *)I) != Lds.end()) {
+      continue;
+    }
+
+    if (!isSafeInstruction(LdBase, StBase, I, TRI, SeenStore)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Evaluates the possibility of reordering load and store instructions within
+// the basic block and performs the reordering when it is safe.
+bool AArch64LoadStoreInterleave::interleaveMemOp(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 8> Lds;
+  SmallVector<MachineInstr *, 8> Sts;
+
+  if (!collectLoadAndStores(MBB, Lds, Sts)) {
+    return false;
+  }
+
+  if (!isSafeToReorder(MBB, Lds, Sts, TRI)) {
+    return false;
+  }
+
+  const unsigned N = Sts.size();
+
+  DEBUG(dbgs() << "Interleaving sequence of " << N << " instructions in "
+               << MBB.getName() << "\n");
+
+  MachineBasicBlock::iterator InsertionPoint = Sts[N - 1];
+
+  for (unsigned i = 0; i < N; ++i) {
+    InsertionPoint = moveInstruction(Sts[N - 1 - i], InsertionPoint);
+    InsertionPoint = moveInstruction(Lds[N - 1 - i], InsertionPoint);
+  }
+
+  ++NumSequences;
+
+  return true;
+}
+
+// Moves a load or store instruction before the insertion point and returns
+// the position to use for the next insertion.
+MachineInstr *AArch64LoadStoreInterleave::moveInstruction(
+    MachineInstr *I, MachineBasicBlock::iterator InsertionPoint) {
+  MachineInstr *NewI = BuildMI(*I->getParent(), InsertionPoint,
+                               I->getDebugLoc(), TII->get(I->getOpcode()));
+  for (const MachineOperand &MO : I->operands()) {
+    NewI->addOperand(MO);
+  }
+
+  I->eraseFromParent();
+
+  return NewI;
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -80,6 +80,12 @@
                 cl::desc("Work around Cortex-A53 erratum 835769"),
                 cl::init(false));

+static cl::opt<bool>
+EnableAArch64InterleavedMemOp("aarch64-interleaved-ldstp", cl::Hidden,
+                cl::desc("Allow AArch64 load/store clustering and "
+                         "interleaving"),
+                cl::init(false));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -269,6 +275,10 @@
 }

 bool AArch64PassConfig::addPreEmitPass() {
+  // Reorder load/store pair instructions for better performance.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt &&
+      EnableAArch64InterleavedMemOp)
+    addPass(createAArch64LoadStoreInterleavePass());
   if (EnableA53Fix835769)
     addPass(createAArch64A53Fix835769());
   // Relax conditional branch instructions if they're otherwise out of
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -33,6 +33,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64LoadStoreInterleave.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s -mcpu=cyclone | FileCheck %s

 %va_list = type {i8*, i8*, i8*, i32, i32}

Index: test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- test/CodeGen/AArch64/arm64-virtual_base.ll
+++ test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; RUN: llc < %s -O3 -march arm64 -mcpu=cyclone | FileCheck %s
 ;

 %struct.Counter_Struct = type { i64, i64 }
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon -mcpu=cyclone | FileCheck --check-prefix=CHECK-NONEON %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -mcpu=cyclone | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mcpu=cyclone | FileCheck --check-prefix=CHECK-BE %s

 %myStruct = type { i64 , i8, i32 }

@@ -142,7 +142,7 @@
 ; CHECK-LE: movz x2, #{{0x2a|42}}
 ; CHECK-LE: mov x3, xzr
 ; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
-; CHECK-BE: mov x2, xzr
+; CHECK-BE: mov{{z?}} x2, {{xzr|#0}}

 ; CHECK: bl check_i128_regalign
   ret void
Index: test/CodeGen/AArch64/memcpy-f128.ll
===================================================================
--- test/CodeGen/AArch64/memcpy-f128.ll
+++ test/CodeGen/AArch64/memcpy-f128.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s

 %structA = type { i128 }
 @stubA = internal unnamed_addr constant %structA zeroinitializer, align 8
Index: test/CodeGen/AArch64/optimal-load-store-pairs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimal-load-store-pairs.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mcpu=cortex-a53 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a57 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+
+; Here "optimal" means:
+; - use of 64-bit registers (no 128-bit floating-point registers);
+; - interleaving loads and stores without any instructions in between.
+
+; Marked as external to prevent possible optimizations.
+@a = external global [4 x i32]
+@b = external global [4 x i32]
+
+define void @copy-16-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-16-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldp [[v1:x[0-9]+]], [[v2:x[0-9]+]]
+; CHECK: stp [[v1]], [[v2]]
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 16, i32 8, i1 false)
+  ret void
+}
+
+define void @copy-56-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-56-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 56, i32 8, i1 false)
+  ret void
+}
+
+define void @copy-64-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-64-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldp [[v1:x[0-9]+]], [[v2:x[0-9]+]]
+; CHECK: stp [[v1]], [[v2]]
+; CHECK: ldp [[v3:x[0-9]+]], [[v4:x[0-9]+]]
+; CHECK: stp [[v3]], [[v4]]
+; CHECK: ldp [[v5:x[0-9]+]], [[v6:x[0-9]+]]
+; CHECK: stp [[v5]], [[v6]]
+; CHECK: ldp [[v7:x[0-9]+]], [[v8:x[0-9]+]]
+; CHECK: stp [[v7]], [[v8]]
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 64, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)