Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -28,8 +28,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -96,11 +99,19 @@
   AliasAnalysis *AA;
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  const TargetSubtargetInfo *STI;
   const AArch64Subtarget *Subtarget;

   // Track which registers have been modified and used.
   BitVector ModifiedRegs, UsedRegs;

+  // Target has a cost model.
+  bool HasCostModel;
+  TargetSchedModel TSM;
+
+  // Function is being optimized for code size.
+  bool OptForMinSize;
+
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -154,6 +165,9 @@
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);

+  // Evaluate whether the new instruction is a better choice than the old ones.
+  bool isProfitable(unsigned New, MachineInstr &MIA, MachineInstr &MIB);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
@@ -650,6 +664,65 @@
   }
 }

+bool AArch64LoadStoreOpt::isProfitable(unsigned New,
+                                       MachineInstr &MIA, MachineInstr &MIB) {
+  // Default to profitable if optimizing for size or
+  // in the absence of a cost model.
+  if (OptForMinSize || !HasCostModel) {
+    DEBUG(dbgs() << "Evaluating instructions: replacement by default.\n");
+    return true;
+  }
+
+  const MCSchedClassDesc
+    *SCN = TSM.getMCSchedModel()->getSchedClassDesc(TII->get(New).getSchedClass()),
+    *SCA = TSM.resolveSchedClass(&MIA),
+    *SCB = TSM.resolveSchedClass(&MIB);
+  // Default to profitable if the new instr is variant or has invalid costs.
+  if (SCN->isVariant() || !SCN->isValid()) {
+    DEBUG(dbgs() << "Evaluating instructions: replacement by default.\n");
+    return true;
+  }
+
+  long LatN = TSM.computeInstrLatency(New),
+       LatA = TSM.computeInstrLatency(&MIA),
+       LatB = TSM.computeInstrLatency(&MIB);
+  long LatDif = LatN - std::max(LatA, LatB);
+  long UopN = SCN->NumMicroOps,
+       UopA = TSM.getNumMicroOps(&MIA, SCA),
+       UopB = TSM.getNumMicroOps(&MIB, SCB);
+  long UopDif = UopN - (UopA + UopB);
+
+  // The new instr is profitable if it is at least as fast when
+  // either it is simple or both other instrs are complex.
+  if (UopN <= 1 || (UopA > 1 && UopB > 1)) {
+    if (LatDif <= 0) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster.\n");
+      return true;
+    }
+  }
+  // The new instr is profitable if it is faster than it is complex when
+  // both other instrs are simple.
+  else if (UopA <= 1 && UopB <= 1) {
+    if (LatDif < -UopDif) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster "
+                      "though more complex.\n");
+      return true;
+    }
+  }
+  // The new instr is profitable if it is at least as fast as it is complex
+  // when either other instr is complex.
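+  // Hypothetical example (the costs below are illustrative, not taken from a
+  // real scheduling model): folding an add of 1 uop and latency 1 into a load
+  // pair of 2 uops and latency 4 to form a pre-indexed load pair of 3 uops
+  // and latency 4 gives LatDif = 0 and UopDif = 0, so the branch below
+  // accepts the replacement.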
+  else if (UopA > 1 || UopB > 1) {
+    if (LatDif <= -UopDif) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster "
+                      "though not more complex.\n");
+      return true;
+    }
+  }
+
+  // It is not profitable.
+  return false;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                            MachineBasicBlock::iterator MergeMI,
@@ -1344,6 +1417,11 @@
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
+
+  // Evaluate whether the new instruction is a better choice than both old ones.
+  if (!isProfitable(NewOpc, *I, *Update))
+    return NextI;
+
   MachineInstrBuilder MIB;
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
@@ -1373,7 +1451,7 @@
     ++NumPostFolded;
     DEBUG(dbgs() << "Creating post-indexed load/store.");
   }
-  DEBUG(dbgs() << "  Replacing instructions:\n    ");
+  DEBUG(dbgs() << "    Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
   DEBUG(Update->print(dbgs()));
@@ -1767,6 +1845,16 @@
   TRI = Subtarget->getRegisterInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

+  OptForMinSize = Fn.getFunction()->optForMinSize();
+
+  const TargetSubtargetInfo &STI = Fn.getSubtarget();
+  TSM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  // TODO: For now, only support targets with a scheduling model. In order to
+  // support a target that has itineraries instead, isProfitable() has to be
+  // modified to calculate the latency and the number of uops from the
+  // itineraries.
+  HasCostModel = TSM.hasInstrSchedModel();
+
   // Resize the modified and used register bitfield trackers. We do this once
   // per function and then clear the bitfield each time we optimize a load or
   // store.
Index: llvm/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS

 ; This file contains tests for the AArch64 load/store optimizer.
@@ -7,8 +8,8 @@
 %s.byte = type { i8, i8 }
 %s.halfword = type { i16, i16 }
 %s.word = type { i32, i32 }
-%s.doubleword = type { i64, i32 }
-%s.quadword = type { fp128, i32 }
+%s.doubleword = type { i64, i64 }
+%s.quadword = type { fp128, fp128 }
 %s.float = type { float, i32 }
 %s.double = type { double, i32 }
 %struct.byte = type { %padding, %s.byte }
@@ -236,8 +237,10 @@
 define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind {
 ; CHECK-LABEL: load-pair-pre-indexed-word
-; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
-; CHECK-NOT: add x0, x0, #32
+; GENERIC: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
+; GENERIC-NOT: add x0, x0, #32
+; EXYNOS: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]{{$}}
+; EXYNOS: add x0, x0, #32
 entry:
   %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0
   %a1 = load i32, i32* %a, align 4
@@ -267,6 +270,41 @@
   ret void
 }

+define void @load-pair-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
+; CHECK-LABEL: load-pair-pre-indexed-doubleword
+; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; GENERIC-NOT: add x0, x0, #32
+; EXYNOS: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]{{$}}
+; EXYNOS: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  %a1 = load i64, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  %b1 = load i64, i64* %b, align 8
+  %add = add i64 %a1, %b1
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %add)
+  ret void
+}
+
+define void @store-pair-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind {
+; CHECK-LABEL: store-pair-pre-indexed-doubleword
+; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  store i64 %val, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  store i64 %val, i64* %b, align 8
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %val)
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; add x8, x8, #16
@@ -1031,7 +1069,6 @@
 define void @store-pair-post-indexed-word() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-word
 ; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { i32, i32 }, align 8
   %dst = alloca { i32, i32 }, align 8
@@ -1050,7 +1087,6 @@
 define void @store-pair-post-indexed-doubleword() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-doubleword
 ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { i64, i64 }, align 8
   %dst = alloca { i64, i64 }, align 8
@@ -1069,7 +1105,6 @@
 define void @store-pair-post-indexed-float() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-float
 ; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { float, float }, align 8
   %dst = alloca { float, float }, align 8
@@ -1088,7 +1123,6 @@
 define void @store-pair-post-indexed-double() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-double
 ; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { double, double }, align 8
   %dst = alloca { double, double }, align 8
@@ -1104,6 +1138,26 @@
   ret void
 }

+define void @store-pair-post-indexed-quadword() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-quadword
+; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64
+; EXYNOS: str q{{[0-9]+}}, [sp]
+; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16]
+  %src = alloca { fp128, fp128 }, align 8
+  %dst = alloca { fp128, fp128 }, align 8
+
+  %src.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 0
+  %src.real = load fp128, fp128* %src.realp
+  %src.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 1
+  %src.imag = load fp128, fp128* %src.imagp
+
+  %dst.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 1
+  store fp128 %src.real, fp128* %dst.realp
+  store fp128 %src.imag, fp128* %dst.imagp
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; (ldr|str) X, [x20]
@@ -1287,7 +1341,8 @@
 define void @post-indexed-paired-min-offset(i64* %a, i64* %b, i64 %count) nounwind {
 ; CHECK-LABEL: post-indexed-paired-min-offset
-; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
+; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
+; EXYNOS: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}]{{$}}
 ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
   br label %for.body
 for.body:
@@ -1340,10 +1395,8 @@
 ; scalar stores which should get merged by AArch64LoadStoreOptimizer.
 define void @merge_zr32(i32* %p) {
 ; CHECK-LABEL: merge_zr32:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1354,11 +1407,9 @@
 ; Same as merge_zr32 but the merged stores should also get paried.
 define void @merge_zr32_2(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1373,13 +1424,11 @@
 ; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
 define void @merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2_offset:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
-; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #504]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; STRICTALIGN: str wzr, [x{{[0-9]+}}, #504]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516]
-; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 126
   store i32 0, i32* %p0
@@ -1397,14 +1446,12 @@
 ; instruction.
 define void @no_merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: no_merge_zr32_2_offset:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
-; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4096]
+; STRICTALIGN: str wzr, [x{{[0-9]+}}, #4096]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108]
-; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 1024
   store i32 0, i32* %p0
@@ -1422,14 +1469,12 @@
 ; err on the side that allows for stp q instruction generation.
 define void @merge_zr32_3(i32* %p) {
 ; CHECK-LABEL: merge_zr32_3:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24]
-; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1452,10 +1497,8 @@
 ; Like merge_zr32, but with 2-vector type.
 define void @merge_zr32_2vec(<2 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_2vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store <2 x i32> zeroinitializer, <2 x i32>* %p
   ret void
@@ -1464,12 +1507,10 @@
 ; Like merge_zr32, but with 3-vector type.
 define void @merge_zr32_3vec(<3 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_3vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
 ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <3 x i32> zeroinitializer, <3 x i32>* %p
   ret void
@@ -1478,11 +1519,9 @@
 ; Like merge_zr32, but with 4-vector type.
 define void @merge_zr32_4vec(<4 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_4vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <4 x i32> zeroinitializer, <4 x i32>* %p
   ret void
@@ -1491,10 +1530,8 @@
 ; Like merge_zr32, but with 2-vector float type.
 define void @merge_zr32_2vecf(<2 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_2vecf:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store <2 x float> zeroinitializer, <2 x float>* %p
   ret void
@@ -1503,11 +1540,9 @@
 ; Like merge_zr32, but with 4-vector float type.
 define void @merge_zr32_4vecf(<4 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_4vecf:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %p
   ret void
@@ -1516,8 +1551,7 @@
 ; Similar to merge_zr32, but for 64-bit values.
 define void @merge_zr64(i64* %p) {
 ; CHECK-LABEL: merge_zr64:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
@@ -1529,8 +1563,7 @@
 ; Similar to merge_zr32, but for 64-bit values and with unaligned stores.
 define void @merge_zr64_unalign(<2 x i64>* %p) {
 ; CHECK-LABEL: merge_zr64_unalign:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
@@ -1547,7 +1580,6 @@
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
-; CHECK-NEXT: ret
 entry:
   store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1
   ret void
@@ -1557,12 +1589,10 @@
 ; vector store since the zero constant vector has multiple uses.
 define void @merge_zr64_2(i64* %p) {
 ; CHECK-LABEL: merge_zr64_2:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1577,9 +1607,7 @@
 ; Like merge_zr64, but with 2-vector double type.
 define void @merge_zr64_2vecd(<2 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_2vecd:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 entry:
   store <2 x double> zeroinitializer, <2 x double>* %p
   ret void
@@ -1588,10 +1616,8 @@
 ; Like merge_zr64, but with 3-vector i64 type.
 define void @merge_zr64_3vec(<3 x i64>* %p) {
 ; CHECK-LABEL: merge_zr64_3vec:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
-; CHECK-NEXT: ret
 entry:
   store <3 x i64> zeroinitializer, <3 x i64>* %p
   ret void
@@ -1600,10 +1626,10 @@
 ; Like merge_zr64_2, but with 4-vector double type.
 define void @merge_zr64_4vecd(<4 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_4vecd:
-; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; CHECK: movi v[[REG:[0-9]]].2d, #0000000000000000
+; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
 entry:
   store <4 x double> zeroinitializer, <4 x double>* %p
   ret void
@@ -1612,15 +1638,13 @@
 ; Verify that non-consecutive merges do not generate q0
 define void @merge_multiple_128bit_stores(i64* %p) {
 ; CHECK-LABEL: merge_multiple_128bit_stores
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: str q0, [x0]
 ; NOSTRICTALIGN-NEXT: stur q0, [x0, #24]
 ; NOSTRICTALIGN-NEXT: str q0, [x0, #48]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN: stp xzr, xzr, [x0]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1639,15 +1663,13 @@
 ; Verify that large stores generate stp q
 define void @merge_multiple_128bit_stores_consec(i64* %p) {
 ; CHECK-LABEL: merge_multiple_128bit_stores_consec
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}, #32]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN: stp xzr, xzr, [x0]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1669,8 +1691,7 @@
 ; Check for bug 34674 where invalid add of xzr was being generated.
 ; CHECK-LABEL: bug34674:
-; CHECK: // %entry
-; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr
+; CHECK: mov [[ZREG:x[0-9]+]], {{#0|xzr}}
 ; CHECK-DAG: stp [[ZREG]], [[ZREG]], [x0]
 ; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1
 define i64 @bug34674(<2 x i64>* %p) {
Index: llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
+++ llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
@@ -95,7 +95,7 @@
   ret void
 }

-attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
+attributes #0 = { optsize minsize noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
Index: llvm/test/CodeGen/AArch64/machine-outliner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-outliner.ll
+++ llvm/test/CodeGen/AArch64/machine-outliner.ll
@@ -61,4 +61,4 @@
 ; CHECK-NEXT: str w8, [sp], #16
 ; CHECK-NEXT: ret

-attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
+attributes #0 = { optsize minsize noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
Index: llvm/test/CodeGen/AArch64/stream-neon.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stream-neon.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -o - | FileCheck %s --check-prefixes=CHECK,CORTEX
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cyclone -o - | FileCheck %s --check-prefixes=CHECK,CYCLON
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -o - | FileCheck %s --check-prefixes=CHECK,EXYNOS
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor -o - | FileCheck %s --check-prefixes=CHECK,FALKOR
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo -o - | FileCheck %s --check-prefixes=CHECK,KRYO
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=thunderx -o - | FileCheck %s --check-prefixes=CHECK,THNDRX
+
+; McCalpin, John D., 1995:
+; "Memory Bandwidth and Machine Balance in Current High Performance Computers",
+; IEEE Computer Society TCCA Newsletter, December 1995.
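+;
+; The STREAM kernels below (copy, scale, add, triad) are unrolled by four
+; <4 x float> elements per iteration, so that the load/store optimizer can
+; form ldp/stp of q registers and, where the scheduling model of the target
+; considers it profitable, also fold the index increment into a post-indexed
+; form.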
+
+define void @copy(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture %c, i64 %length) local_unnamed_addr #0 {
+entry:
+  %cmp53 = icmp eq i64 %length, 0
+  br i1 %cmp53, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.054 = phi i64 [ %add24, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.054
+  %1 = load <4 x float>, <4 x float>* %0, align 16
+  %add2 = add i64 %i.054, 1
+  %2 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %3 = load <4 x float>, <4 x float>* %2, align 16
+  %add6 = add i64 %i.054, 2
+  %4 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %5 = load <4 x float>, <4 x float>* %4, align 16
+  %add10 = add i64 %i.054, 3
+  %6 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %7 = load <4 x float>, <4 x float>* %6, align 16
+
+  %8 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.054
+  store <4 x float> %1, <4 x float>* %8, align 16
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %3, <4 x float>* %9, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %5, <4 x float>* %10, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %7, <4 x float>* %11, align 16
+
+  %add24 = add i64 %i.054, 4
+  %cmp = icmp ult i64 %add24, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: copy:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+}
+
+define void @scale(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture %c, float %scalar, i64 %length) local_unnamed_addr #0 {
+entry:
+  %0 = insertelement <4 x float> undef, float %scalar, i32 0
+  %vecinit3.i = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  %cmp68 = icmp eq i64 %length, 0
+  br i1 %cmp68, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.069 = phi i64 [ %add28, %for.body ], [ 0, %for.body.preheader ]
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.069
+  %2 = load <4 x float>, <4 x float>* %1, align 16
+  %add2 = add i64 %i.069, 1
+  %3 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %4 = load <4 x float>, <4 x float>* %3, align 16
+  %add6 = add i64 %i.069, 2
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %6 = load <4 x float>, <4 x float>* %5, align 16
+  %add10 = add i64 %i.069, 3
+  %7 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %8 = load <4 x float>, <4 x float>* %7, align 16
+
+  %mul.i67 = fmul fast <4 x float> %2, %vecinit3.i
+  %mul.i66 = fmul fast <4 x float> %4, %vecinit3.i
+  %mul.i65 = fmul fast <4 x float> %6, %vecinit3.i
+  %mul.i = fmul fast <4 x float> %8, %vecinit3.i
+
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.069
+  store <4 x float> %mul.i67, <4 x float>* %9, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %mul.i66, <4 x float>* %10, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %mul.i65, <4 x float>* %11, align 16
+  %12 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %mul.i, <4 x float>* %12, align 16
+
+  %add28 = add i64 %i.069, 4
+  %cmp = icmp ult i64 %add28, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: scale:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+}
+
+define void @add(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture readonly %b, <4 x float>* noalias nocapture %c, i64 %length) local_unnamed_addr #0 {
+entry:
+  %cmp94 = icmp eq i64 %length, 0
+  br i1 %cmp94, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.095 = phi i64 [ %add43, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.095
+  %1 = load <4 x float>, <4 x float>* %0, align 16
+  %add2 = add i64 %i.095, 1
+  %2 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %3 = load <4 x float>, <4 x float>* %2, align 16
+  %add6 = add i64 %i.095, 2
+  %4 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %5 = load <4 x float>, <4 x float>* %4, align 16
+  %add10 = add i64 %i.095, 3
+  %6 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %7 = load <4 x float>, <4 x float>* %6, align 16
+
+  %8 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %i.095
+  %9 = load <4 x float>, <4 x float>* %8, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add2
+  %11 = load <4 x float>, <4 x float>* %10, align 16
+  %12 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add6
+  %13 = load <4 x float>, <4 x float>* %12, align 16
+  %14 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add10
+  %15 = load <4 x float>, <4 x float>* %14, align 16
+
+  %add.i = fadd fast <4 x float> %9, %1
+  %add.i93 = fadd fast <4 x float> %11, %3
+  %add.i92 = fadd fast <4 x float> %13, %5
+  %add.i91 = fadd fast <4 x float> %15, %7
+
+  %16 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.095
+  store <4 x float> %add.i, <4 x float>* %16, align 16
+  %17 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %add.i93, <4 x float>* %17, align 16
+  %18 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %add.i92, <4 x float>* %18, align 16
+  %19 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %add.i91, <4 x float>* %19, align 16
+
+  %add43 = add i64 %i.095, 4
+  %cmp = icmp ult i64 %add43, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: add:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+}
+
+define void @triad(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture readonly %b, <4 x float>* noalias nocapture %c, float %scalar, i64 %length) local_unnamed_addr #0 {
+entry:
+  %0 = insertelement <4 x float> undef, float %scalar, i32 0
+  %vecinit3.i = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  %cmp110 = icmp eq i64 %length, 0
+  br i1 %cmp110, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.0111 = phi i64 [ %add48, %for.body ], [ 0, %for.body.preheader ]
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.0111
+  %2 = load <4 x float>, <4 x float>* %1, align 16
+  %add2 = add i64 %i.0111, 1
+  %3 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %4 = load <4 x float>, <4 x float>* %3, align 16
+  %add6 = add i64 %i.0111, 2
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %6 = load <4 x float>, <4 x float>* %5, align 16
+  %add10 = add i64 %i.0111, 3
+  %7 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %8 = load <4 x float>, <4 x float>* %7, align 16
+
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %i.0111
+  %10 = load <4 x float>, <4 x float>* %9, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add2
+  %12 = load <4 x float>, <4 x float>* %11, align 16
+  %13 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add6
+  %14 = load <4 x float>, <4 x float>* %13, align 16
+  %15 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add10
+  %16 = load <4 x float>, <4 x float>* %15, align 16
+
+  %mul.i109 = fmul fast <4 x float> %2, %vecinit3.i
+  %mul.i108 = fmul fast <4 x float> %4, %vecinit3.i
+  %mul.i107 = fmul fast <4 x float> %6, %vecinit3.i
+  %mul.i = fmul fast <4 x float> %8, %vecinit3.i
+
+  %add.i106 = fadd fast <4 x float> %10, %mul.i109
+  %add.i105 = fadd fast <4 x float> %12, %mul.i108
+  %add.i104 = fadd fast <4 x float> %14, %mul.i107
+  %add.i = fadd fast <4 x float> %16, %mul.i
+
+  %17 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.0111
+  store <4 x float> %add.i106, <4 x float>* %17, align 16
+  %18 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %add.i105, <4 x float>* %18, align 16
+  %19 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %add.i104, <4 x float>* %19, align 16
+  %20 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %add.i, <4 x float>* %20, align 16
+
+  %add48 = add i64 %i.0111, 4
+  %cmp = icmp ult i64 %add48, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: triad:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" "target-features"="+neon" }