Index: lib/Target/AArch64/AArch64.h
===================================================================
--- lib/Target/AArch64/AArch64.h
+++ lib/Target/AArch64/AArch64.h
@@ -35,6 +35,7 @@
 FunctionPass *createAArch64StorePairSuppressPass();
 FunctionPass *createAArch64ExpandPseudoPass();
 FunctionPass *createAArch64LoadStoreOptimizationPass();
+FunctionPass *createAArch64LoadStoreInterleavePass();
 ModulePass *createAArch64PromoteConstantPass();
 FunctionPass *createAArch64ConditionOptimizerPass();
 FunctionPass *createAArch64AddressTypePromotionPass();
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -464,8 +464,16 @@
   setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);

   MaxStoresPerMemset = MaxStoresPerMemsetOptSize = 8;
-  MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
   MaxStoresPerMemmove = MaxStoresPerMemmoveOptSize = 4;
+  if (Subtarget->isCyclone()) {
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 4;
+  } else {
+    // It's optimal to use 64-bit registers with load/store pair instructions
+    // for memcpy() inlining, rather than doing the same with regular
+    // load/store instructions operating on 128-bit registers. Allow twice as
+    // many stores as for memmove().
+    MaxStoresPerMemcpy = MaxStoresPerMemcpyOptSize = 8;
+  }

   setStackPointerRegisterToSaveRestore(AArch64::SP);
@@ -6604,17 +6612,24 @@
                                                bool ZeroMemset,
                                                bool MemcpyStrSrc,
                                                MachineFunction &MF) const {
-  // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
-  // instruction to materialize the v2i64 zero and one store (with restrictive
-  // addressing mode). Just do two i64 store of zero-registers.
-  bool Fast;
-  const Function *F = MF.getFunction();
-  if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
-      !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
-                                       Attribute::NoImplicitFloat) &&
-      (memOpAlign(SrcAlign, DstAlign, 16) ||
-       (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
-    return MVT::f128;
+  // In general it's optimal to use 64-bit registers with load/store pair
+  // instructions for memcpy() inlining, rather than doing the same with
+  // regular load/store instructions operating on 128-bit registers. Do not
+  // use 128-bit types.
+
+  if (Subtarget->isCyclone()) {
+    // Don't use AdvSIMD to implement 16-byte memset. It would have taken one
+    // instruction to materialize the v2i64 zero and one store (with
+    // restrictive addressing mode). Just do two i64 stores of zero-registers.
+    bool Fast;
+    const Function *F = MF.getFunction();
+    if (Subtarget->hasFPARMv8() && !IsMemset && Size >= 16 &&
+        !F->getAttributes().hasAttribute(AttributeSet::FunctionIndex,
+                                         Attribute::NoImplicitFloat) &&
+        (memOpAlign(SrcAlign, DstAlign, 16) ||
+         (allowsMisalignedMemoryAccesses(MVT::f128, 0, 1, &Fast) && Fast)))
+      return MVT::f128;
+  }
   return Size >= 8 ? MVT::i64 : MVT::i32;
 }
Index: lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
===================================================================
--- /dev/null
+++ lib/Target/AArch64/AArch64LoadStoreInterleave.cpp
@@ -0,0 +1,346 @@
+//=- AArch64LoadStoreInterleave.cpp - Optimize Load/Store pairs for AArch64 -=//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reorders load/store pair instructions to achieve better
+// performance. The preferred sequence of operations is as follows:
+//
+//   * [1]: load pair of 64-bit registers
+//   * [1]: store pair of 64-bit registers
+//   * [2]: load pair of 64-bit registers
+//   * [2]: store pair of 64-bit registers
+//   * ...
+//
+// Example of transformation:
+//
+//   Before:                  After:
+//
+//   1. load pair  [1]        1. load pair  [1]
+//   2. load pair  [2]        2. store pair [1]
+//   3. load pair  [3]        3. load pair  [2]
+//   4. store pair [1]        4. store pair [2]
+//   5. store pair [2]        5. load pair  [3]
+//   6. store pair [3]        6. store pair [3]
+//   7. ...                   7. ...
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64.h"
+#include "AArch64InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Target/TargetInstrInfo.h"
+#include "llvm/Target/TargetSubtargetInfo.h"
+#include <algorithm>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-ldst-itl"
+
+STATISTIC(NumSequences, "Number of load/store pair sequences updated");
+
+namespace {
+class AArch64LoadStoreInterleave : public MachineFunctionPass {
+  const TargetInstrInfo *TII;
+  const TargetRegisterInfo *TRI;
+
+public:
+  static char ID;
+  AArch64LoadStoreInterleave() : MachineFunctionPass(ID) {}
+
+  bool runOnMachineFunction(MachineFunction &MF) override;
+  bool interleaveMemOp(MachineBasicBlock &MBB);
+  MachineInstr *moveInstruction(MachineInstr *I,
+                                MachineBasicBlock::iterator InsertionPoint);
+  const char *getPassName() const override {
+    return "AArch64 LoadStore Interleave";
+  }
+};
+} // end anonymous namespace
+
+char AArch64LoadStoreInterleave::ID = 0;
+
+FunctionPass *llvm::createAArch64LoadStoreInterleavePass() {
+  return new AArch64LoadStoreInterleave();
+}
+
+// Optimizes every basic block of the function.
+bool AArch64LoadStoreInterleave::runOnMachineFunction(MachineFunction &MF) {
+  DEBUG(dbgs() << "********** AArch64 LoadStore Interleaving **********\n"
+               << "********** Function: " << MF.getName() << '\n');
+
+  const TargetMachine &TM = MF.getTarget();
+  TII = static_cast<const TargetInstrInfo *>(
+      TM.getSubtargetImpl()->getInstrInfo());
+  TRI = TM.getSubtargetImpl()->getRegisterInfo();
+
+  bool Modified = false;
+  for (auto &MBB : MF) {
+    Modified |= interleaveMemOp(MBB);
+  }
+
+  return Modified;
+}
+
+// Gets the size in bytes of the data operated on by a load, store, or
+// load/store pair instruction.
+static int getOperandWidth(int Opcode) {
+  switch (Opcode) {
+  default:
+    llvm_unreachable("Didn't expect anything except load and store pairs.");
+
+  case AArch64::STPWi:
+  case AArch64::LDPWi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+    return 4;
+
+  case AArch64::STPXi:
+  case AArch64::LDPXi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+    return 8;
+
+  case AArch64::STPSi:
+  case AArch64::LDPSi:
+  case AArch64::STRSui:
+  case AArch64::STURSi:
+  case AArch64::LDRSui:
+  case AArch64::LDURSi:
+    return 4;
+
+  case AArch64::STPDi:
+  case AArch64::LDPDi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+    return 8;
+
+  case AArch64::STPQi:
+  case AArch64::LDPQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+    return 16;
+  }
+}
+
+// Checks whether the instruction can safely be moved outside a sequence of
+// load and store pair instructions.
+static bool isSafeInstruction(unsigned LdBase, unsigned StBase,
+                              MachineInstr *I, const TargetRegisterInfo *TRI,
+                              bool SeenStore) {
+  if (I->isDebugValue()) {
+    return true;
+  }
+
+  if (I->isCall() || I->isTerminator() || I->hasUnmodeledSideEffects()) {
+    return false;
+  }
+
+  if (I->mayStore() || (SeenStore && I->mayLoad())) {
+    return false;
+  }
+
+  for (const MachineOperand &MO : I->operands()) {
+    if (!MO.isReg()) {
+      continue;
+    }
+
+    unsigned Reg = MO.getReg();
+    if (MO.isDef() && TRI->regsOverlap(Reg, LdBase)) {
+      return false;
+    }
+    if (SeenStore && MO.isDef() && TRI->regsOverlap(Reg, StBase)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Collects pointers to load and store instructions from the basic block. The
+// return value indicates whether at least one of the instructions is a load
+// or store pair.
+static bool collectLoadAndStores(MachineBasicBlock &MBB,
+                                 SmallVectorImpl<MachineInstr *> &Lds,
+                                 SmallVectorImpl<MachineInstr *> &Sts) {
+  bool SeenPair = false;
+  for (MachineInstr &MI : MBB) {
+    switch (MI.getOpcode()) {
+    default:
+      // Just move on to the next instruction.
+      break;
+
+    case AArch64::STPSi:
+    case AArch64::STPDi:
+    case AArch64::STPQi:
+    case AArch64::STPWi:
+    case AArch64::STPXi:
+      SeenPair = true;
+      // Fall through.
+
+    case AArch64::STRSui:
+    case AArch64::STURSi:
+    case AArch64::STRDui:
+    case AArch64::STURDi:
+    case AArch64::STRQui:
+    case AArch64::STURQi:
+    case AArch64::STRWui:
+    case AArch64::STURWi:
+    case AArch64::STRXui:
+    case AArch64::STURXi:
+      // Loads should go first.
+      if (!Lds.empty()) {
+        Sts.push_back(&MI);
+      }
+      break;
+
+    case AArch64::LDPSi:
+    case AArch64::LDPDi:
+    case AArch64::LDPQi:
+    case AArch64::LDPWi:
+    case AArch64::LDPXi:
+      SeenPair = true;
+      // Fall through.
+
+    case AArch64::LDRSui:
+    case AArch64::LDURSi:
+    case AArch64::LDRDui:
+    case AArch64::LDURDi:
+    case AArch64::LDRQui:
+    case AArch64::LDURQi:
+    case AArch64::LDRWui:
+    case AArch64::LDURWi:
+    case AArch64::LDRXui:
+    case AArch64::LDURXi:
+      Lds.push_back(&MI);
+      break;
+    }
+  }
+
+  return SeenPair;
+}
+
+// Extracts the base address register from the instruction.
+static inline unsigned getBase(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 2 : 1;
+  return I->getOperand(OpNum).getReg();
+}
+
+// Extracts the offset from the instruction.
+static inline int64_t getOffset(const MachineInstr *I) {
+  unsigned OpNum = (I->getNumOperands() == 4) ? 3 : 2;
+  return I->getOperand(OpNum).getImm();
+}
+
+// Checks whether a set of load and store instructions can be safely reordered.
+static bool isSafeToReorder(MachineBasicBlock &MBB,
+                            const SmallVectorImpl<MachineInstr *> &Lds,
+                            const SmallVectorImpl<MachineInstr *> &Sts,
+                            const TargetRegisterInfo *TRI) {
+  if (Sts.empty() || Sts.size() != Lds.size()) {
+    return false;
+  }
+
+  unsigned N = Sts.size();
+
+  // Check that each pair of instructions operates on data of the same width.
+  for (unsigned i = 0; i < N; ++i) {
+    const int LoadWidth = getOperandWidth(Lds[i]->getOpcode());
+    const int StoreWidth = getOperandWidth(Sts[i]->getOpcode());
+    if (LoadWidth != StoreWidth) {
+      return false;
+    }
+  }
+
+  const unsigned LdBase = getBase(Lds[0]);
+  const unsigned StBase = getBase(Sts[0]);
+
+  // Check that all load and store instructions use the same base register and
+  // that each load/store pair has the same offset.
+  for (unsigned i = 0; i < N; ++i) {
+    if (getBase(Lds[i]) != LdBase || getBase(Sts[i]) != StBase) {
+      return false;
+    }
+
+    if (getOffset(Sts[i]) != getOffset(Lds[i])) {
+      return false;
+    }
+  }
+
+  bool SeenStore = false;
+  for (MachineBasicBlock::iterator I = Lds[0], E = Sts[N - 1]; I != E; ++I) {
+    if (std::find(Sts.begin(), Sts.end(), (MachineInstr *)I) != Sts.end()) {
+      SeenStore = true;
+      continue;
+    }
+
+    if (std::find(Lds.begin(), Lds.end(), (MachineInstr *)I) != Lds.end()) {
+      continue;
+    }
+
+    if (!isSafeInstruction(LdBase, StBase, I, TRI, SeenStore)) {
+      return false;
+    }
+  }
+
+  return true;
+}
+
+// Evaluates the possibility of reordering load and store instructions within
+// the basic block and performs the reordering when it is safe.
+bool AArch64LoadStoreInterleave::interleaveMemOp(MachineBasicBlock &MBB) {
+  SmallVector<MachineInstr *, 8> Lds;
+  SmallVector<MachineInstr *, 8> Sts;
+
+  if (!collectLoadAndStores(MBB, Lds, Sts)) {
+    return false;
+  }
+
+  if (!isSafeToReorder(MBB, Lds, Sts, TRI)) {
+    return false;
+  }
+
+  const unsigned N = Sts.size();
+
+  DEBUG(dbgs() << "Interleaving sequence of " << N << " instructions in "
+               << MBB.getName() << "\n");
+
+  MachineBasicBlock::iterator InsertionPoint = Sts[N - 1];
+
+  for (unsigned i = 0; i < N; ++i) {
+    InsertionPoint = moveInstruction(Sts[N - 1 - i], InsertionPoint);
+    InsertionPoint = moveInstruction(Lds[N - 1 - i], InsertionPoint);
+  }
+
+  ++NumSequences;
+
+  return true;
+}
+
+// Moves a load or store instruction before the insertion point and returns
+// the position to use for the next insertion.
+MachineInstr *AArch64LoadStoreInterleave::moveInstruction(
+    MachineInstr *I, MachineBasicBlock::iterator InsertionPoint) {
+  MachineInstr *NewI = BuildMI(*I->getParent(), InsertionPoint,
+                               I->getDebugLoc(), TII->get(I->getOpcode()));
+  for (const MachineOperand &MO : I->operands()) {
+    NewI->addOperand(MO);
+  }
+
+  I->eraseFromParent();
+
+  return NewI;
+}
Index: lib/Target/AArch64/AArch64TargetMachine.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetMachine.cpp
+++ lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -80,6 +80,12 @@
                 cl::desc("Work around Cortex-A53 erratum 835769"),
                 cl::init(false));

+static cl::opt<bool>
+EnableAArch64InterleavedMemOp("aarch64-interleaved-ldstp", cl::Hidden,
+                cl::desc("Allow AArch64 load/store clustering and "
+                         "interleaving"),
+                cl::init(false));
+
 extern "C" void LLVMInitializeAArch64Target() {
   // Register the target.
   RegisterTargetMachine<AArch64leTargetMachine> X(TheAArch64leTarget);
@@ -269,6 +275,10 @@
 }

 bool AArch64PassConfig::addPreEmitPass() {
+  // Reorder load/store pair instructions for better performance.
+  if (TM->getOptLevel() != CodeGenOpt::None && EnableLoadStoreOpt &&
+      EnableAArch64InterleavedMemOp)
+    addPass(createAArch64LoadStoreInterleavePass());
   if (EnableA53Fix835769)
     addPass(createAArch64A53Fix835769());
   // Relax conditional branch instructions if they're otherwise out of
Index: lib/Target/AArch64/CMakeLists.txt
===================================================================
--- lib/Target/AArch64/CMakeLists.txt
+++ lib/Target/AArch64/CMakeLists.txt
@@ -33,6 +33,7 @@
   AArch64ISelLowering.cpp
   AArch64InstrInfo.cpp
   AArch64LoadStoreOptimizer.cpp
+  AArch64LoadStoreInterleave.cpp
   AArch64MCInstLower.cpp
   AArch64PromoteConstant.cpp
   AArch64PBQPRegAlloc.cpp
Index: test/CodeGen/AArch64/arm64-variadic-aapcs.ll
===================================================================
--- test/CodeGen/AArch64/arm64-variadic-aapcs.ll
+++ test/CodeGen/AArch64/arm64-variadic-aapcs.ll
@@ -1,4 +1,4 @@
-; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=arm64-linux-gnu -pre-RA-sched=linearize -enable-misched=false < %s -mcpu=cyclone | FileCheck %s

 %va_list = type {i8*, i8*, i8*, i32, i32}

Index: test/CodeGen/AArch64/arm64-virtual_base.ll
===================================================================
--- test/CodeGen/AArch64/arm64-virtual_base.ll
+++ test/CodeGen/AArch64/arm64-virtual_base.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O3 -march arm64 | FileCheck %s
+; RUN: llc < %s -O3 -march arm64 -mcpu=cyclone | FileCheck %s
 ;

 %struct.Counter_Struct = type { i64, i64 }
Index: test/CodeGen/AArch64/func-calls.ll
===================================================================
--- test/CodeGen/AArch64/func-calls.ll
+++ test/CodeGen/AArch64/func-calls.ll
@@ -1,7 +1,7 @@
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu | FileCheck %s --check-prefix=CHECK
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon | FileCheck --check-prefix=CHECK-NONEON %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 | FileCheck --check-prefix=CHECK-NOFP %s
-; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu | FileCheck --check-prefix=CHECK-BE %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mcpu=cyclone | FileCheck %s --check-prefix=CHECK
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-neon -mcpu=cyclone | FileCheck --check-prefix=CHECK-NONEON %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64-none-linux-gnu -mattr=-fp-armv8 -mcpu=cyclone | FileCheck --check-prefix=CHECK-NOFP %s
+; RUN: llc -verify-machineinstrs < %s -mtriple=aarch64_be-none-linux-gnu -mcpu=cyclone | FileCheck --check-prefix=CHECK-BE %s

 %myStruct = type { i64 , i8, i32 }

@@ -142,7 +142,7 @@
 ; CHECK-LE: movz x2, #{{0x2a|42}}
 ; CHECK-LE: mov x3, xzr
 ; CHECK-BE: movz {{x|w}}3, #{{0x2a|42}}
-; CHECK-BE: mov x2, xzr
+; CHECK-BE: mov{{z?}} x2, {{xzr|#0}}

 ; CHECK: bl check_i128_regalign
   ret void
Index: test/CodeGen/AArch64/memcpy-f128.ll
===================================================================
--- test/CodeGen/AArch64/memcpy-f128.ll
+++ test/CodeGen/AArch64/memcpy-f128.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu | FileCheck %s
+; RUN: llc < %s -march=aarch64 -mtriple=aarch64-linux-gnu -mcpu=cyclone | FileCheck %s

 %structA = type { i128 }
 @stubA = internal unnamed_addr constant %structA zeroinitializer, align 8
Index: test/CodeGen/AArch64/optimal-load-store-pairs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/optimal-load-store-pairs.ll
@@ -0,0 +1,66 @@
+; RUN: llc < %s -mcpu=cortex-a53 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+; RUN: llc < %s -mcpu=cortex-a57 -march=aarch64 -mtriple=aarch64-linux-gnu -aarch64-interleaved-ldstp=1 | FileCheck %s
+
+; Here "optimal" means:
+; - use of 64-bit registers (no 128-bit floating-point registers);
+; - interleaving loads and stores without any instructions in between.
+
+; Marked as external to prevent possible optimizations.
+@a = external global [4 x i32]
+@b = external global [4 x i32]
+
+define void @copy-16-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-16-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldp [[v1:x[0-9]+]], [[v2:x[0-9]+]]
+; CHECK: stp [[v1]], [[v2]]
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 16, i32 8, i1 false)
+  ret void
+}
+
+define void @copy-56-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-56-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ld{{[rp]}} {{x[0-9]+}}
+; CHECK: st{{[rp]}} {{x[0-9]+}}
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 56, i32 8, i1 false)
+  ret void
+}
+
+define void @copy-64-bytes-with-8-byte-registers() {
+; CHECK-LABEL: @copy-64-bytes-with-8-byte-registers
+; CHECK: adrp
+; CHECK: add
+; CHECK: adrp
+; CHECK: add
+; CHECK: ldp [[v1:x[0-9]+]], [[v2:x[0-9]+]]
+; CHECK: stp [[v1]], [[v2]]
+; CHECK: ldp [[v3:x[0-9]+]], [[v4:x[0-9]+]]
+; CHECK: stp [[v3]], [[v4]]
+; CHECK: ldp [[v5:x[0-9]+]], [[v6:x[0-9]+]]
+; CHECK: stp [[v5]], [[v6]]
+; CHECK: ldp [[v7:x[0-9]+]], [[v8:x[0-9]+]]
+; CHECK: stp [[v7]], [[v8]]
+; CHECK: ret
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast ([4 x i32]* @a to i8*), i8* bitcast ([4 x i32]* @b to i8*), i64 64, i32 8, i1 false)
+  ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)