Index: llvm/trunk/lib/Target/AArch64/AArch64.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64.h
+++ llvm/trunk/lib/Target/AArch64/AArch64.h
@@ -44,6 +44,7 @@
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
 FunctionPass *createAArch64A53Fix835769();
+FunctionPass *createFalkorMarkStridedAccessesPass();
 
 FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
 
@@ -66,6 +67,7 @@
 void initializeAArch64PromoteConstantPass(PassRegistry&);
 void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
 void initializeAArch64StorePairSuppressPass(PassRegistry&);
+void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
 void initializeLDTLSCleanupPass(PassRegistry&);
 
 } // end namespace llvm
Index: llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -0,0 +1,147 @@
+//===-- AArch64FalkorHWPFFix.cpp - Avoid HW prefetcher pitfalls on Falkor--===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// For Falkor, we want to avoid HW prefetcher instruction tag collisions that
+// may inhibit the HW prefetching. This is done in two steps. Before ISel, we
+// mark strided loads (i.e. those that will likely benefit from prefetching)
+// with metadata. Then, after opcodes have been finalized, we insert MOVs and
+// rewrite loads to prevent unintentional tag collisions.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64TargetMachine.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "falkor-hwpf-fix"
+
+STATISTIC(NumStridedLoadsMarked, "Number of strided loads marked");
+
+namespace {
+
+class FalkorMarkStridedAccesses {
+public:
+  FalkorMarkStridedAccesses(LoopInfo &LI, ScalarEvolution &SE)
+      : LI(LI), SE(SE) {}
+
+  bool run();
+
+private:
+  bool runOnLoop(Loop *L);
+
+  LoopInfo &LI;
+  ScalarEvolution &SE;
+};
+
+class FalkorMarkStridedAccessesLegacy : public FunctionPass {
+public:
+  static char ID; // Pass ID, replacement for typeid
+  FalkorMarkStridedAccessesLegacy() : FunctionPass(ID) {
+    initializeFalkorMarkStridedAccessesLegacyPass(
+        *PassRegistry::getPassRegistry());
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.addRequired<TargetPassConfig>();
+    AU.addPreserved<DominatorTreeWrapperPass>();
+    AU.addRequired<LoopInfoWrapperPass>();
+    AU.addPreserved<LoopInfoWrapperPass>();
+    AU.addRequired<ScalarEvolutionWrapperPass>();
+    // FIXME: For some reason, preserving SE here breaks LSR (even if
+    // this pass changes nothing).
+    // AU.addPreserved<ScalarEvolutionWrapperPass>();
+  }
+
+  bool runOnFunction(Function &F) override;
+};
+
+} // namespace
+
+char FalkorMarkStridedAccessesLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+                      "Falkor HW Prefetch Fix", false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_END(FalkorMarkStridedAccessesLegacy, DEBUG_TYPE,
+                    "Falkor HW Prefetch Fix", false, false)
+
+FunctionPass *llvm::createFalkorMarkStridedAccessesPass() {
+  return new FalkorMarkStridedAccessesLegacy();
+}
+
+bool FalkorMarkStridedAccessesLegacy::runOnFunction(Function &F) {
+  TargetPassConfig &TPC = getAnalysis<TargetPassConfig>();
+  const AArch64Subtarget *ST =
+      TPC.getTM<AArch64TargetMachine>().getSubtargetImpl(F);
+  if (ST->getProcFamily() != AArch64Subtarget::Falkor)
+    return false;
+
+  if (skipFunction(F))
+    return false;
+
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+
+  FalkorMarkStridedAccesses LDP(LI, SE);
+  return LDP.run();
+}
+
+bool FalkorMarkStridedAccesses::run() {
+  bool MadeChange = false;
+
+  for (Loop *I : LI)
+    for (auto L = df_begin(I), LE = df_end(I); L != LE; ++L)
+      MadeChange |= runOnLoop(*L);
+
+  return MadeChange;
+}
+
+bool FalkorMarkStridedAccesses::runOnLoop(Loop *L) {
+  // Only mark strided loads in the inner-most loop
+  if (!L->empty())
+    return false;
+
+  bool MadeChange = false;
+
+  for (const auto BB : L->blocks()) {
+    for (auto &I : *BB) {
+      LoadInst *LoadI = dyn_cast<LoadInst>(&I);
+      if (!LoadI)
+        continue;
+
+      Value *PtrValue = LoadI->getPointerOperand();
+      if (L->isLoopInvariant(PtrValue))
+        continue;
+
+      const SCEV *LSCEV = SE.getSCEV(PtrValue);
+      const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
+      if (!LSCEVAddRec || !LSCEVAddRec->isAffine())
+        continue;
+
+      LoadI->setMetadata(FALKOR_STRIDED_ACCESS_MD,
+                         MDNode::get(LoadI->getContext(), {}));
+      ++NumStridedLoadsMarked;
+      DEBUG(dbgs() << "Load: " << I << " marked as strided\n");
+      MadeChange = true;
+    }
+  }
+
+  return MadeChange;
+}
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.h
@@ -455,6 +455,8 @@
   unsigned getNumInterleavedAccesses(VectorType *VecTy,
                                      const DataLayout &DL) const;
 
+  MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+
 private:
   bool isExtFreeImpl(const Instruction *Ext) const override;
Index: llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -7482,6 +7482,14 @@
   return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
 }
 
+MachineMemOperand::Flags
+AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+  if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
+      I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
+    return MOStridedAccess;
+  return MachineMemOperand::MONone;
+}
+
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL) const {
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.h
@@ -27,6 +27,13 @@
 class AArch64Subtarget;
 class AArch64TargetMachine;
 
+static const MachineMemOperand::Flags MOSuppressPair =
+    MachineMemOperand::MOTargetFlag1;
+static const MachineMemOperand::Flags MOStridedAccess =
+    MachineMemOperand::MOTargetFlag2;
+
+#define FALKOR_STRIDED_ACCESS_MD "falkor.strided.access"
+
 class AArch64InstrInfo final : public AArch64GenInstrInfo {
   const AArch64RegisterInfo RI;
   const AArch64Subtarget &Subtarget;
@@ -81,6 +88,9 @@
   /// unprofitable.
   bool isLdStPairSuppressed(const MachineInstr &MI) const;
 
+  /// Return true if the given load or store is a strided memory access.
+  bool isStridedAccess(const MachineInstr &MI) const;
+
   /// Return true if this is an unscaled load/store.
   bool isUnscaledLdSt(unsigned Opc) const;
Index: llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -52,9 +52,6 @@
 #define GET_INSTRINFO_CTOR_DTOR
 #include "AArch64GenInstrInfo.inc"
 
-static const MachineMemOperand::Flags MOSuppressPair =
-    MachineMemOperand::MOTargetFlag1;
-
 static cl::opt<unsigned> TBZDisplacementBits(
     "aarch64-tbz-offset-bits", cl::Hidden, cl::init(14),
     cl::desc("Restrict range of TB[N]Z instructions (DEBUG)"));
@@ -1715,6 +1712,13 @@
   (*MI.memoperands_begin())->setFlags(MOSuppressPair);
 }
 
+/// Check all MachineMemOperands for a hint that the load/store is strided.
+bool AArch64InstrInfo::isStridedAccess(const MachineInstr &MI) const {
+  return llvm::any_of(MI.memoperands(), [](MachineMemOperand *MMO) {
+    return MMO->getFlags() & MOStridedAccess;
+  });
+}
+
 bool AArch64InstrInfo::isUnscaledLdSt(unsigned Opc) const {
   switch (Opc) {
   default:
@@ -4433,7 +4437,8 @@
 ArrayRef<std::pair<MachineMemOperand::Flags, const char *>>
 AArch64InstrInfo::getSerializableMachineMemOperandTargetFlags() const {
   static const std::pair<MachineMemOperand::Flags, const char *> TargetFlags[] =
-      {{MOSuppressPair, "aarch64-suppress-pair"}};
+      {{MOSuppressPair, "aarch64-suppress-pair"},
+       {MOStridedAccess, "aarch64-strided-access"}};
   return makeArrayRef(TargetFlags);
 }
Index: llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ llvm/trunk/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -138,6 +138,9 @@
     cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
     cl::init(-1));
 
+static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
+                                         cl::init(true), cl::Hidden);
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(getTheAArch64leTarget());
@@ -158,6 +161,7 @@
   initializeAArch64PromoteConstantPass(*PR);
   initializeAArch64RedundantCopyEliminationPass(*PR);
   initializeAArch64StorePairSuppressPass(*PR);
+  initializeFalkorMarkStridedAccessesLegacyPass(*PR);
   initializeLDTLSCleanupPass(*PR);
 }
 
@@ -346,8 +350,12 @@
   //
   // Run this before LSR to remove the multiplies involved in computing the
   // pointer values N iterations ahead.
-  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoopDataPrefetch)
-    addPass(createLoopDataPrefetchPass());
+  if (TM->getOptLevel() != CodeGenOpt::None) {
+    if (EnableLoopDataPrefetch)
+      addPass(createLoopDataPrefetchPass());
+    if (EnableFalkorHWPFFix)
+      addPass(createFalkorMarkStridedAccessesPass());
+  }
 
   TargetPassConfig::addIRPasses();
Index: llvm/trunk/lib/Target/AArch64/CMakeLists.txt
===================================================================
--- llvm/trunk/lib/Target/AArch64/CMakeLists.txt
+++ llvm/trunk/lib/Target/AArch64/CMakeLists.txt
@@ -47,6 +47,7 @@
   AArch64ConditionalCompares.cpp
   AArch64DeadRegisterDefinitionsPass.cpp
   AArch64ExpandPseudoInsts.cpp
+  AArch64FalkorHWPFFix.cpp
   AArch64FastISel.cpp
   AArch64A53Fix835769.cpp
   AArch64FrameLowering.cpp
Index: llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
===================================================================
--- llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
+++ llvm/trunk/test/CodeGen/AArch64/falkor-hwpf.ll
@@ -0,0 +1,106 @@
+; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=falkor | FileCheck %s
+; RUN: opt < %s -S -falkor-hwpf-fix -mtriple aarch64 -mcpu=cortex-a57 | FileCheck %s --check-prefix=NOHWPF
+
+; Check that strided access metadata is added to loads in inner loops when compiling for Falkor.
+
+; CHECK-LABEL: @hwpf1(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2, !falkor.strided.access !0
+
+; NOHWPF-LABEL: @hwpf1(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf1(i32* %p, i32* %p2) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
+  %load = load i32, i32* %gep
+
+  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %iv
+  %load2 = load i32, i32* %gep2
+
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Check that outer loop strided load isn't marked.
+; CHECK-LABEL: @hwpf2(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2{{$}}
+
+; NOHWPF-LABEL: @hwpf2(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf2(i32* %p) {
+entry:
+  br label %loop1
+
+loop1:
+  %iv1 = phi i32 [ 0, %entry ], [ %inc1, %loop1.latch ]
+  %outer.sum = phi i32 [ 0, %entry ], [ %sum, %loop1.latch ]
+  br label %loop2.header
+
+loop2.header:
+  br label %loop2
+
+loop2:
+  %iv2 = phi i32 [ 0, %loop2.header ], [ %inc2, %loop2 ]
+  %sum = phi i32 [ %outer.sum, %loop2.header ], [ %sum.inc, %loop2 ]
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv2
+  %load = load i32, i32* %gep
+  %sum.inc = add i32 %sum, %load
+  %inc2 = add i32 %iv2, 1
+  %exitcnd2 = icmp uge i32 %inc2, 1024
+  br i1 %exitcnd2, label %exit2, label %loop2
+
+exit2:
+  %gep2 = getelementptr inbounds i32, i32* %p, i32 %iv1
+  %load2 = load i32, i32* %gep2
+  br label %loop1.latch
+
+loop1.latch:
+  %inc1 = add i32 %iv1, 1
+  %exitcnd1 = icmp uge i32 %inc1, 1024
+  br i1 %exitcnd1, label %exit, label %loop1
+
+exit:
+  ret void
+}
+
+
+; Check that non-strided load isn't marked.
+; CHECK-LABEL: @hwpf3(
+; CHECK: load i32, i32* %gep, !falkor.strided.access !0
+; CHECK: load i32, i32* %gep2{{$}}
+
+; NOHWPF-LABEL: @hwpf3(
+; NOHWPF: load i32, i32* %gep{{$}}
+; NOHWPF: load i32, i32* %gep2{{$}}
+define void @hwpf3(i32* %p, i32* %p2) {
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i32 [ 0, %entry ], [ %inc, %loop ]
+
+  %gep = getelementptr inbounds i32, i32* %p, i32 %iv
+  %load = load i32, i32* %gep
+
+  %gep2 = getelementptr inbounds i32, i32* %p2, i32 %load
+  %load2 = load i32, i32* %gep2
+
+  %inc = add i32 %iv, 1
+  %exitcnd = icmp uge i32 %inc, 1024
+  br i1 %exitcnd, label %exit, label %loop
+
+exit:
+  ret void
+}
Index: llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
===================================================================
--- llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
+++ llvm/trunk/test/CodeGen/MIR/AArch64/target-memoperands.mir
@@ -10,13 +10,17 @@
 ---
 # CHECK-LABEL: name: target_memoperands
 # CHECK: %1(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+# CHECK: %2(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4)
 # CHECK: G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+# CHECK: G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4)
 name: target_memoperands
 body: |
   bb.0:
     %0:_(p0) = COPY %x0
     %1:_(s64) = G_LOAD %0(p0) :: ("aarch64-suppress-pair" load 8)
+    %2:_(s32) = G_LOAD %0(p0) :: ("aarch64-strided-access" load 4)
     G_STORE %1(s64), %0(p0) :: ("aarch64-suppress-pair" store 8)
+    G_STORE %2(s32), %0(p0) :: ("aarch64-strided-access" store 4)
     RET_ReallyLR
...
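
As a rough sketch of how the pieces introduced by this patch fit together: the IR pass attaches !falkor.strided.access metadata, AArch64TargetLowering::getMMOFlags() maps that metadata to the MOStridedAccess MachineMemOperand flag during ISel, and later machine passes can query the flag through the new AArch64InstrInfo::isStridedAccess() helper. The snippet below is illustrative only and is not part of the patch; countStridedLoads is a hypothetical name.

// Hypothetical consumer of the new flag; not part of this change.
#include "AArch64InstrInfo.h"
#include "llvm/CodeGen/MachineFunction.h"

using namespace llvm;

static unsigned countStridedLoads(const MachineFunction &MF,
                                  const AArch64InstrInfo &TII) {
  unsigned Count = 0;
  for (const MachineBasicBlock &MBB : MF)
    for (const MachineInstr &MI : MBB)
      // isStridedAccess() scans MI's memoperands for MOStridedAccess, which
      // ISel sets when the original IR load carried the strided-access
      // metadata added by the FalkorMarkStridedAccesses pass.
      if (MI.mayLoad() && TII.isStridedAccess(MI))
        ++Count;
  return Count;
}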