Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -39,6 +39,7 @@
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
 FunctionPass *createAArch64A57FPLoadBalancing();
+FunctionPass *createAArch64FixCortexA53_835769();
 /// \brief Creates an ARM-specific Target Transformation Info pass.
 ImmutablePass *
 createAArch64TargetTransformInfoPass(const AArch64TargetMachine *TM);
Index: lib/Target/AArch64/AArch64FixCortexA53_835769.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64FixCortexA53_835769.cpp
@@ -0,0 +1,231 @@
+//===-- AArch64FixCortexA53_835769.cpp ------------------------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+// This pass changes code to work around Cortex-A53 erratum 835769. It does so
+// by inserting a nop instruction in code sequences that in some circumstances
+// may trigger the erratum.
+// The nop is inserted between a pair of instructions of the following two
+// classes:
+//   instr 1: a mem-instr (load, store or prefetch).
+//   instr 2: a non-SIMD integer multiply-accumulate writing a 64-bit X register.
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-fix-cortex-a53-835769"
+
+static cl::opt<bool>
+WorkAroundA53Erratum("aarch64-fix-cortex-a53-835769", cl::Hidden,
+                     cl::desc("Work around Cortex-A53 erratum 835769"),
+                     cl::init(false));
+
+STATISTIC(NumNopsAdded, "Number of Nops added to work around erratum 835769");
+
+//===----------------------------------------------------------------------===//
+// Helper functions
+
+// Is the instruction a match for the instruction that comes first in the
+// sequence of instructions that can trigger the erratum?
+static bool isFirstInstructionInSequence(MachineInstr *MI) {
+  // Must return true if this instruction is a load, a store or a prefetch.
+  switch (MI->getOpcode()) {
+  case AArch64::PRFMl:
+  case AArch64::PRFMroW:
+  case AArch64::PRFMroX:
+  case AArch64::PRFMui:
+  case AArch64::PRFUMi:
+    return true;
+  default:
+    return (MI->mayLoad() || MI->mayStore());
+  }
+}
+
+// Is the instruction a match for the instruction that comes second in the
+// sequence that can trigger the erratum?
+static bool isSecondInstructionInSequence(MachineInstr *MI) {
+  // Must return true for non-SIMD integer multiply-accumulates writing
+  // to a 64-bit register.
+  switch (MI->getOpcode()) {
+  // The erratum cannot be triggered when the destination register is 32 bits,
+  // therefore only include the following.
+  case AArch64::MSUBXrrr:
+  case AArch64::MADDXrrr:
+  case AArch64::SMADDLrrr:
+  case AArch64::SMSUBLrrr:
+  case AArch64::UMADDLrrr:
+  case AArch64::UMSUBLrrr:
+    // The erratum can only be triggered by multiply-adds, not by regular
+    // non-accumulating multiplies, i.e. when Ra=XZR='11111'.
+    return MI->getOperand(3).getReg() != AArch64::XZR;
+  default:
+    return false;
+  }
+}
+
+
+//===----------------------------------------------------------------------===//
+
+namespace {
+class AArch64FixCortexA53_835769 : public MachineFunctionPass {
+  const AArch64InstrInfo *TII;
+
+public:
+  static char ID;
+  explicit AArch64FixCortexA53_835769() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &F) override;
+
+  const char *getPassName() const override {
+    return "Workaround A53 erratum 835769 pass";
+  }
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+private:
+  bool runOnBasicBlock(MachineBasicBlock &MBB);
+};
+char AArch64FixCortexA53_835769::ID = 0;
+
+} // end anonymous namespace
+
+//===----------------------------------------------------------------------===//
+
+bool
+AArch64FixCortexA53_835769::runOnMachineFunction(MachineFunction &F) {
+  const TargetMachine &TM = F.getTarget();
+  if (!WorkAroundA53Erratum)
+    return false;
+
+  bool Changed = false;
+  DEBUG(dbgs() << "***** AArch64FixCortexA53_835769 *****\n");
+
+  TII = TM.getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+  for (auto &MBB : F) {
+    Changed |= runOnBasicBlock(MBB);
+  }
+
+  return Changed;
+}
+
+// Return the block that was fallen through to get to MBB, if any,
+// otherwise nullptr.
+static MachineBasicBlock *getBBFallenThrough(MachineBasicBlock &MBB) {
+  // Get the previous machine basic block in the function.
+  MachineFunction::iterator MBBI = MBB;
+
+  // Can't go off the top of the function.
+  if (MBBI == MBB.getParent()->begin())
+    return nullptr;
+
+  MachineBasicBlock *PrevBB = std::prev(MBBI);
+  for (MachineBasicBlock *S : MBB.predecessors())
+    if (S == PrevBB)
+      return S;
+
+  return nullptr;
+}
+
+static MachineInstr *getLastNonPseudo(MachineBasicBlock *MBB) {
+  for (auto I = MBB->rbegin(), E = MBB->rend(); I != E; ++I) {
+    if (!I->isPseudo())
+      return &*I;
+  }
+
+  llvm_unreachable("Expected to find instruction");
+}
+
+static void insertNopBeforeInstruction(MachineBasicBlock &MBB, MachineInstr* MI,
+                                       const TargetInstrInfo *TII) {
+  // If we are the first instruction of the block, put the NOP at the end of
+  // the previous fallthrough block.
+  if (MI == &MBB.front()) {
+    MachineBasicBlock *PMBB = getBBFallenThrough(MBB);
+    assert(PMBB && "Expected basic block");
+    MachineInstr *I = getLastNonPseudo(PMBB);
+    assert(I && "Expected instruction");
+    DebugLoc DL = I->getDebugLoc();
+    BuildMI(PMBB, DL, TII->get(AArch64::HINT)).addImm(0);
+  } else {
+    DebugLoc DL = MI->getDebugLoc();
+    BuildMI(MBB, MI, DL, TII->get(AArch64::HINT)).addImm(0);
+  }
+
+  ++NumNopsAdded;
+}
+
+bool
+AArch64FixCortexA53_835769::runOnBasicBlock(MachineBasicBlock &MBB) {
+  bool Changed = false;
+  DEBUG(dbgs() << "Running on MBB: " << MBB << " - scanning instructions...\n");
+
+  // First, scan the basic block, looking for a sequence of 2 instructions
+  // that match the conditions under which the erratum may trigger.
+
+  // List of terminating instructions in matching sequences
+  std::vector<MachineInstr*> Sequences;
+  unsigned Idx = 0;
+  MachineInstr *PrevInstr = nullptr;
+
+  if (MachineBasicBlock *PMBB = getBBFallenThrough(MBB))
+    PrevInstr = getLastNonPseudo(PMBB);
+
+  for (auto &MI : MBB) {
+    MachineInstr *CurrInstr = &MI;
+    DEBUG(dbgs() << "  Examining: " << MI);
+    if (PrevInstr) {
+      DEBUG(dbgs() << "    PrevInstr: " << *PrevInstr
+                   << "    CurrInstr: " << *CurrInstr
+                   << "    isFirstInstructionInSequence(PrevInstr): "
+                   << isFirstInstructionInSequence(PrevInstr) << "\n"
+                   << "    isSecondInstructionInSequence(CurrInstr): "
+                   << isSecondInstructionInSequence(CurrInstr) << "\n");
+      if (isFirstInstructionInSequence(PrevInstr) &&
+          isSecondInstructionInSequence(CurrInstr)) {
+        DEBUG(dbgs() << "   ** pattern found at Idx " << Idx << "!\n");
+        Sequences.push_back(CurrInstr);
+      }
+    }
+    if (!CurrInstr->isPseudo())
+      PrevInstr = CurrInstr;
+    ++Idx;
+  }
+
+  DEBUG(dbgs() << "Scan complete, " << Sequences.size()
+               << " occurrences of pattern found.\n");
+
+  // Then update the basic block, inserting nops between the detected sequences.
+  for (auto &MI : Sequences) {
+    Changed = true;
+    insertNopBeforeInstruction(MBB, MI, TII);
+  }
+
+  return Changed;
+}
+
+// Factory function used by AArch64TargetMachine to add the pass to
+// the pass manager.
+FunctionPass *llvm::createAArch64FixCortexA53_835769() {
+  return new AArch64FixCortexA53_835769();
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -274,6 +274,7 @@
 }
 
 bool AArch64PassConfig::addPreEmitPass() {
+  addPass(createAArch64FixCortexA53_835769());
   // Relax conditional branch instructions if they're otherwise out of
   // range of their destination.
   addPass(createAArch64BranchRelaxation());
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -26,6 +26,7 @@
   AArch64DeadRegisterDefinitionsPass.cpp
   AArch64ExpandPseudoInsts.cpp
   AArch64FastISel.cpp
+  AArch64FixCortexA53_835769.cpp
   AArch64FrameLowering.cpp
   AArch64ConditionOptimizer.cpp
   AArch64ISelDAGToDAG.cpp
Index: test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/aarch64-fix-cortex-a53-835769.ll
@@ -0,0 +1,524 @@
+; The regression tests need to check the order of emitted instructions and are
+; therefore fragile and reliant on instruction scheduling. The test cases have
+; been minimized as much as possible, but most of them could still break if the
+; instruction scheduling heuristics for cortex-a53 change.
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=1 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK
+; RUN: llc < %s -mcpu=cortex-a53 -aarch64-fix-cortex-a53-835769=0 -stats 2>&1 \
+; RUN:   | FileCheck %s --check-prefix CHECK-NOWORKAROUND
+; The following RUN lines are just to verify whether or not this pass runs by
+; default for given CPUs. Given the fragility of the tests, this is only run on
+; a test case where the scheduler has no freedom at all to reschedule the
+; instructions, so that potentially very different scheduling heuristics will
+; not break the test case.
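+;
+; As a rough illustration of the pattern the CHECK lines below look for (an
+; assumed, simplified example; register choices are arbitrary): with the
+; workaround enabled, a sequence such as
+;     ldr  x1, [x2]
+;     madd x0, x3, x4, x5
+; is emitted as
+;     ldr  x1, [x2]
+;     nop
+;     madd x0, x3, x4, x5
+; whereas 32-bit destinations (e.g. madd w0, w3, w4, w5) and non-accumulating
+; multiplies (mul) are left unchanged.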
+; RUN: llc < %s -mcpu=generic    | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cortex-a53 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cortex-a57 | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+; RUN: llc < %s -mcpu=cyclone    | FileCheck %s --check-prefix CHECK-BASIC-PASS-DISABLED
+
+target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64--linux-gnu"
+
+define i64 @f_load_madd_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_madd_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_load_madd_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: madd
+; CHECK-BASIC-PASS-DISABLED-LABEL: f_load_madd_64:
+; CHECK-BASIC-PASS-DISABLED: ldr
+; CHECK-BASIC-PASS-DISABLED-NEXT: madd
+
+
+define i32 @f_load_madd_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_load_madd_32:
+; CHECK: ldr
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_load_madd_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i64 @f_load_msub_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_load_msub_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_load_msub_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i32 @f_load_msub_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_load_msub_32:
+; CHECK: ldr
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_load_msub_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i64 @f_load_mul_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_load_mul_64:
+; CHECK: ldr
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_load_mul_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i32 @f_load_mul_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_load_mul_32:
+; CHECK: ldr
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_load_mul_32:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i64 @f_load_mneg_64(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  %mul = sub i64 0, %b
+  %sub = mul i64 %0, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_load_mneg_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_mneg_64:
+; FIXME: only add further checks here once LLVM actually produces
+; neg instructions
+; FIXME-CHECK: ldr
+; FIXME-CHECK-NEXT: nop
+; FIXME-CHECK-NEXT: mneg
+; FIXME-CHECK-NOWORKAROUND: ldr
+; FIXME-CHECK-NOWORKAROUND-NEXT: mneg
+
+
+define i32 @f_load_mneg_32(i32 %a, i32 %b, i32* nocapture readonly %c) #0 {
+entry:
+  %0 = load i32* %c, align 4
+  %mul = sub i32 0, %b
+  %sub = mul i32 %0, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_load_mneg_32:
+; CHECK-NOWORKAROUND-LABEL: f_load_mneg_32:
+; FIXME: only add further checks here once LLVM actually produces
+; neg instructions
+; FIXME-CHECK: ldr
+; FIXME-CHECK-NEXT: mneg
+; FIXME-CHECK-NOWORKAROUND: ldr
+; FIXME-CHECK-NOWORKAROUND-NEXT: mneg
+
+
+define i64 @f_load_smaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %add = add nsw i64 %mul, %a
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %add3 = add nsw i64 %add, %conv2
+  ret i64 %add3
+}
+; CHECK-LABEL: f_load_smaddl:
+; CHECK: ldrsw
+; CHECK-NEXT: nop
+; CHECK-NEXT: smaddl
+; CHECK-NOWORKAROUND-LABEL: f_load_smaddl:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smaddl
+
+
+define i64 @f_load_smsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %sub = sub i64 %a, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %add = add nsw i64 %sub, %conv2
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_smsubl_64:
+; CHECK: ldrsw
+; CHECK-NEXT: nop
+; CHECK-NEXT: smsubl
+; CHECK-NOWORKAROUND-LABEL: f_load_smsubl_64:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smsubl
+
+
+define i64 @f_load_smull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = mul nsw i64 %conv1, %conv
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %div = sdiv i64 %mul, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_smull:
+; CHECK: ldrsw
+; CHECK-NEXT: smull
+; CHECK-NOWORKAROUND-LABEL: f_load_smull:
+; CHECK-NOWORKAROUND: ldrsw
+; CHECK-NOWORKAROUND-NEXT: smull
+
+
+define i64 @f_load_smnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = sext i32 %b to i64
+  %conv1 = sext i32 %c to i64
+  %mul = sub nsw i64 0, %conv
+  %sub = mul i64 %conv1, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = sext i32 %0 to i64
+  %div = sdiv i64 %sub, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_smnegl_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_smnegl_64:
+; FIXME: only add further checks here once LLVM actually produces
+; smnegl instructions
+
+
+define i64 @f_load_umaddl(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %add = add i64 %mul, %a
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %add3 = add i64 %add, %conv2
+  ret i64 %add3
+}
+; CHECK-LABEL: f_load_umaddl:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: umaddl
+; CHECK-NOWORKAROUND-LABEL: f_load_umaddl:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umaddl
+
+
+define i64 @f_load_umsubl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %sub = sub i64 %a, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %add = add i64 %sub, %conv2
+  ret i64 %add
+}
+; CHECK-LABEL: f_load_umsubl_64:
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: umsubl
+; CHECK-NOWORKAROUND-LABEL: f_load_umsubl_64:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umsubl
+
+
+define i64 @f_load_umull(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = mul i64 %conv1, %conv
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %div = udiv i64 %mul, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_umull:
+; CHECK: ldr
+; CHECK-NEXT: umull
+; CHECK-NOWORKAROUND-LABEL: f_load_umull:
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: umull
+
+
+define i64 @f_load_umnegl_64(i64 %a, i32 %b, i32 %c, i32* nocapture readonly %d) #0 {
+entry:
+  %conv = zext i32 %b to i64
+  %conv1 = zext i32 %c to i64
+  %mul = sub nsw i64 0, %conv
+  %sub = mul i64 %conv1, %mul
+  %0 = load i32* %d, align 4
+  %conv2 = zext i32 %0 to i64
+  %div = udiv i64 %sub, %conv2
+  ret i64 %div
+}
+; CHECK-LABEL: f_load_umnegl_64:
+; CHECK-NOWORKAROUND-LABEL: f_load_umnegl_64:
+; FIXME: only add further checks here once LLVM actually produces
+; umnegl instructions
+
+
+define i64 @f_store_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_store_madd_64:
+; CHECK: str
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_store_madd_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i32 @f_store_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_store_madd_32:
+; CHECK: str
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_store_madd_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: madd
+
+
+define i64 @f_store_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_store_msub_64:
+; CHECK: str
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_store_msub_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i32 @f_store_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_store_msub_32:
+; CHECK: str
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_store_msub_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: msub
+
+
+define i64 @f_store_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  store i64 %a, i64* %e, align 8
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_store_mul_64:
+; CHECK: str
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_store_mul_64:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i32 @f_store_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  store i32 %a, i32* %e, align 4
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_store_mul_32:
+; CHECK: str
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_store_mul_32:
+; CHECK-NOWORKAROUND: str
+; CHECK-NOWORKAROUND-NEXT: mul
+
+
+define i64 @f_prefetch_madd_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 0, i32 1)
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  ret i64 %add
+}
+; CHECK-LABEL: f_prefetch_madd_64:
+; CHECK: prfm
+; CHECK-NEXT: nop
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_madd_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: madd
+
+declare void @llvm.prefetch(i8* nocapture, i32, i32, i32) #2
+
+define i32 @f_prefetch_madd_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 0, i32 1)
+  %mul = mul nsw i32 %0, %b
+  %add = add nsw i32 %mul, %a
+  ret i32 %add
+}
+; CHECK-LABEL: f_prefetch_madd_32:
+; CHECK: prfm
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_madd_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: madd
+
+define i64 @f_prefetch_msub_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 1, i32 1)
+  %mul = mul nsw i64 %0, %b
+  %sub = sub nsw i64 %a, %mul
+  ret i64 %sub
+}
+; CHECK-LABEL: f_prefetch_msub_64:
+; CHECK: prfm
+; CHECK-NEXT: nop
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: msub
+
+define i32 @f_prefetch_msub_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 1, i32 1)
+  %mul = mul nsw i32 %0, %b
+  %sub = sub nsw i32 %a, %mul
+  ret i32 %sub
+}
+; CHECK-LABEL: f_prefetch_msub_32:
+; CHECK: prfm
+; CHECK-NEXT: msub
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_msub_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: msub
+
+define i64 @f_prefetch_mul_64(i64 %a, i64 %b, i64* nocapture readonly %cp, i64* nocapture %e) #1 {
+entry:
+  %0 = load i64* %cp, align 8
+  %1 = bitcast i64* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 0, i32 3, i32 1)
+  %mul = mul nsw i64 %0, %b
+  ret i64 %mul
+}
+; CHECK-LABEL: f_prefetch_mul_64:
+; CHECK: prfm
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_64:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: mul
+
+define i32 @f_prefetch_mul_32(i32 %a, i32 %b, i32* nocapture readonly %cp, i32* nocapture %e) #1 {
+entry:
+  %0 = load i32* %cp, align 4
+  %1 = bitcast i32* %e to i8*
+  tail call void @llvm.prefetch(i8* %1, i32 1, i32 3, i32 1)
+  %mul = mul nsw i32 %0, %b
+  ret i32 %mul
+}
+; CHECK-LABEL: f_prefetch_mul_32:
+; CHECK: prfm
+; CHECK-NEXT: mul
+; CHECK-NOWORKAROUND-LABEL: f_prefetch_mul_32:
+; CHECK-NOWORKAROUND: prfm
+; CHECK-NOWORKAROUND-NEXT: mul
+
+define i64 @fall_through(i64 %a, i64 %b, i64* nocapture readonly %c) #0 {
+entry:
+  %0 = load i64* %c, align 8
+  br label %block1
+
+block1:
+  %mul = mul nsw i64 %0, %b
+  %add = add nsw i64 %mul, %a
+  %tmp = ptrtoint i8* blockaddress(@fall_through, %block1) to i64
+  %ret = add nsw i64 %tmp, %add
+  ret i64 %ret
+}
+; CHECK-LABEL: fall_through
+; CHECK: ldr
+; CHECK-NEXT: nop
+; CHECK-NEXT: .Ltmp
+; CHECK-NEXT: BB
+; CHECK-NEXT: madd
+; CHECK-NOWORKAROUND-LABEL: fall_through
+; CHECK-NOWORKAROUND: ldr
+; CHECK-NOWORKAROUND-NEXT: .Ltmp
+; CHECK-NOWORKAROUND-NEXT: BB
+; CHECK-NOWORKAROUND-NEXT: madd
+
+attributes #0 = { nounwind readonly "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind }
+
+
+; CHECK-LABEL: ... Statistics Collected ...
+; CHECK: 11 aarch64-fix-cortex-a53-835769 - Number of Nops added to work around erratum 835769