Index: llvm/trunk/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.h
+++ llvm/trunk/lib/Target/AArch64/AArch64.h
@@ -44,6 +44,7 @@
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();
+FunctionPass *createFalkorMarkStridedAccessesPass();
 
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
@@ -66,6 +67,7 @@
 void initializeAArch64PromoteConstantPass(PassRegistry&);
 void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
 void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 
 } // end namespace llvm
Index: llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -0,0 +1,147 @@
+//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For Falkor, we want to avoid HW prefetcher instruction tag collisions that
+// may inhibit the HW prefetching. This is done in two steps. Before ISel, we
+// mark strided loads (i.e. those that will likely benefit from prefetching)
+// with metadata. Then, after opcodes have been finalized, we insert MOVs and
+// rewrite loads to prevent unintentional tag collisions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "falkor-hwpf-fix"
+
+STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
+
+namespace {
+
+class FalkorMarkStridedAccesses {
+public:
+  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
+      : LI(LI), SE(SE) {}
+
+  bool run();
+
+private:
+  bool runOnLoop(Loop *L);
+
+  LoopInfo &LI;
+  ScalarEvolution &SE;
+};
+
+class FalkorMarkStridedAccessesLegacy : public FunctionPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
+    initializeFalkorMarkStridedAccessesLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    // FIXME: For some reason, preserving SE here breaks LSR (even if
+    // this pass changes nothing).
+    // AU.addPreserved<ScalarEvolutionWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+char FalkorMarkStridedAccessesLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+                      "Falkor HW Prefetch Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+                    "Falkor HW Prefetch Fix", false, false)
+
+FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
+  return new FalkorMarkStridedAccessesLegacy();
+}
+
+bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
+  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const AArch64Subtarget *ST =
+      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
+  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
+    return false;
+
+  if (skipFunction(F))
+    return false;
+
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+  FalkorMarkStridedAccesses LDP(LI, SE);
+  return LDP.run();
+}
+
+bool FalkorMarkStridedAccesses::run() {
+  bool MadeChange = false;
+
+  for (Loop *I : LI)
+    for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+      MadeChange |= runOnLoop(*L);
+
+  return MadeChange;
+}
+
+bool FalkorMarkStridedAccesses::runOnLoop(Loop *L) {
+  // Only mark strided loads in the inner-most loop
+  if (!L->empty())
+    return false;
+
+  bool MadeChange = false;
+
+  for (const auto BB : L->blocks()) {
+    for (auto &I : *BB) {
+      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
+      if (!LoadI)
+        continue;
+
+      Value *PtrValue = LoadI->getPointerOperand();
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE.getSCEV(PtrValue);
+      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+        continue;
+
+      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
+                         MDNode::get(LoadI->getContext(), {}));
+      ++NumStridedLoadsMarked;
+      DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -455,6 +455,8 @@
   unsigned getNumInterleavedAccesses(VectorType *VecTy,
                                      const DataLayout &DL) const;
 
+  MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+
 private:
   bool isExtFreeImpl(const Instruction *Ext) const override;
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7482,6 +7482,14 @@
   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
 }
 
+MachineMemOperand::Flags
+AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
+      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
+    return MOStridedAccess;
+  return MachineMemOperand::MONone;
+}
+
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL) const {
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
@@ -27,6 +27,13 @@
 class AArch64Subtarget;
 class AArch64TargetMachine;
 
+static const MachineMemOperand::Flags MOSuppressPair =
+    MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MOStridedAccess =
+    MachineMemOperand::MOTargetFlag2;
+
+#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access"
+
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
   const AArch64Subtarget &Subtarget;
@@ -81,6 +88,9 @@
   /// unprofitable.
   bool isLdStPairSuppressed(const MachineInstr &MI) const;
 
+  /// Return true if the given load or store is a strided memory access.
+  bool isStridedAccess(const MachineInstr &MI) const;
+
   /// Return true if this is an unscaled load/store.
   bool isUnscaledLdSt(unsigned Opc) const;
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -52,9 +52,6 @@
 #define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
-static const MachineMemOperand::Flags MOSuppressPair =
-    MachineMemOperand::MOTargetFlag1;
-
 static cl::opt<unsigned> TBZDisplacementBits(
     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -1715,6 +1712,13 @@
   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
 }
 
+/// Check all MachineMemOperands for a hint that the load/store is strided.
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+    return MMO->getFlags() & MOStridedAccess;
+  });
+}
+
 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
   switch (Opc) {
   default:
@@ -4433,7 +4437,8 @@
 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
-      {{MOSuppressPair, "aarch64-suppress-pair"}};
+      {{MOSuppressPair, "aarch64-suppress-pair"},
+       {MOStridedAccess, "aarch64-strided-access"}};
   return makeArrayRef(TargetFlags);
 }
Index: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -138,6 +138,9 @@
     cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
     cl::init(-1));
 
+static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
+                                         cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -158,6 +161,7 @@
   initializeAArch64PromoteConstantPass(*PR);
   initializeAArch64RedundantCopyEliminationPass(*PR);
   initializeAArch64StorePairSuppressPass(*PR);
+  initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
 }
 
@@ -346,8 +350,12 @@
   //
   // Run this before LSR to remove the multiplies involved in computing the
   // pointer values N iterations ahead.
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
-    addPass(createLoopDataPrefetchPass());
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    if (EnableLoopDataPrefetch)
+      addPass(createLoopDataPrefetchPass());
+    if (EnableFalkorHWPFFix)
+      addPass(createFalkorMarkStridedAccessesPass());
+  }
 
   TargetPassConfig::addIRPasses();
Index: llvm/trunk/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AArch64/CMakeLists.txt
+++ llvm/trunk/lib/Target/AArch64/CMakeLists.txt
@@ -47,6 +47,7 @@
   AArch64ConditionalCompares.cpp
   AArch64DeadRegisterDefinitionsPass.cpp
   AArch64ExpandPseudoInsts.cpp
+  AArch64FalkorHWPFFix.cpp
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
Index: llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
+++ llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
+; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF
+
+; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.
+
+; CHECK-LABEL: @hwpf1(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2, !falkor.strided.access !0
+
+; NOHWPF-LABEL: @hwpf1(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf1(i32* %p, i32* %p2) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
+  %load = load i32, i32* %gep
+
+  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv
+  %load2 = load i32, i32* %gep2
+
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Check that outer loop strided load isn't marked.
+; CHECK-LABEL: @hwpf2(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2{{$}}
+
+; NOHWPF-LABEL: @hwpf2(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf2(i32* %p) {
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
+  %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
+  br label %loop2.header
+
+loop2.header:
+  br label %loop2
+
+loop2:
+  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
+  %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv2
+  %load = load i32, i32* %gep
+  %sum.inc = add i32 %sum, %load
+  %inc2 = add i32 %iv2, 1
+  %exitcnd2 = icmp uge i32 %inc2, 1024
+  br i1 %exitcnd2, label %exit2, label %loop2
+
+exit2:
+  %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1
+  %load2 = load i32, i32* %gep2
+  br label %loop1.latch
+
+loop1.latch:
+  %inc1 = add i32 %iv1, 1
+  %exitcnd1 = icmp uge i32 %inc1, 1024
+  br i1 %exitcnd1, label %exit, label %loop1
+
+exit:
+  ret void
+}
+
+
+; Check that non-strided load isn't marked.
+; CHECK-LABEL: @hwpf3(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2{{$}}
+
+; NOHWPF-LABEL: @hwpf3(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf3(i32* %p, i32* %p2) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
+  %load = load i32, i32* %gep
+
+  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %load
+  %load2 = load i32, i32* %gep2
+
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
Index: llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
===================================================================
--- llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
+++ llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
@@ -10,13 +10,17 @@
 ---
 # CHECK-LABEL: name: target_memoperands
 # CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+# CHECK: %2(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4)
 # CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+# CHECK: G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4)
 name: target_memoperands
 body: |
   bb.0:
     %0:_(p0) = COPY %x0
     %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+    %2:_(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4)
     G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+    G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4)
     RET_ReallyLR
...
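
As a rough sketch of how the pieces introduced by this patch fit together: the IR pass attaches !falkor.strided.access metadata, AArch64TargetLowering::getMMOFlags() maps that metadata to the MOStridedAccess MachineMemOperand flag during ISel, and later machine passes can query the flag through the new AArch64InstrInfo::isStridedAccess() helper. The snippet below is illustrative only and is not part of the patch; countStridedLoads is a hypothetical name.

// Hypothetical consumer of the new flag; not part of this change.
#include "AArch64InstrInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static unsigned countStridedLoads(const MachineFunction &MF,
                                  const AArch64InstrInfo &TII) {
  unsigned Count = 0;
  for (const MachineBasicBlock &MBB : MF)
    for (const MachineInstr &MI : MBB)
      // isStridedAccess() scans MI's memoperands for MOStridedAccess, which
      // ISel sets when the original IR load carried the strided-access
      // metadata added by the FalkorMarkStridedAccesses pass.
      if (MI.mayLoad() && TII.isStridedAccess(MI))
        ++Count;
  return Count;
}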