Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -28,8 +28,11 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -96,11 +99,19 @@
   AliasAnalysis *AA;
   const AArch64InstrInfo *TII;
   const TargetRegisterInfo *TRI;
+  const TargetSubtargetInfo *STI;
   const AArch64Subtarget *Subtarget;

   // Track which registers have been modified and used.
   BitVector ModifiedRegs, UsedRegs;

+  // Target has a cost model.
+  bool HasCostModel;
+  TargetSchedModel TSM;
+
+  // Function is being optimized for code size.
+  bool OptForMinSize;
+
   void getAnalysisUsage(AnalysisUsage &AU) const override {
     AU.addRequired<AAResultsWrapperPass>();
     MachineFunctionPass::getAnalysisUsage(AU);
@@ -154,6 +165,9 @@
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);

+  // Evaluate whether the new instruction is a better choice than the old ones.
+  bool isProfitable(unsigned New, MachineInstr &MIA, MachineInstr &MIB);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
@@ -650,6 +664,65 @@
   }
 }

+bool AArch64LoadStoreOpt::isProfitable(unsigned New,
+                                       MachineInstr &MIA, MachineInstr &MIB) {
+  // Default to profitable if optimizing for size or
+  // in the absence of a cost model.
+  if (OptForMinSize || !HasCostModel) {
+    DEBUG(dbgs() << "Evaluating instructions: replacement by default.\n");
+    return true;
+  }
+
+  const MCSchedClassDesc
+    *SCN = TSM.getMCSchedModel()->getSchedClassDesc(TII->get(New).getSchedClass()),
+    *SCA = TSM.resolveSchedClass(&MIA),
+    *SCB = TSM.resolveSchedClass(&MIB);
+  // Default to profitable if the new instr is variant or has invalid costs.
+  if (SCN->isVariant() || !SCN->isValid()) {
+    DEBUG(dbgs() << "Evaluating instructions: replacement by default.\n");
+    return true;
+  }
+
+  long LatN = TSM.computeInstrLatency(New),
+       LatA = TSM.computeInstrLatency(&MIA),
+       LatB = TSM.computeInstrLatency(&MIB);
+  long LatDif = LatN - std::max(LatA, LatB);
+  long UopN = SCN->NumMicroOps,
+       UopA = TSM.getNumMicroOps(&MIA, SCA),
+       UopB = TSM.getNumMicroOps(&MIB, SCB);
+  long UopDif = UopN - (UopA + UopB);
+
+  // The new instr is profitable if it is at least as fast when
+  // either it is simple or both other instrs are complex.
+  if (UopN <= 1 || (UopA > 1 && UopB > 1)) {
+    if (LatDif <= 0) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster.\n");
+      return true;
+    }
+  }
+  // The new instr is profitable if it is faster than it is complex when
+  // both other instrs are simple.
+  else if (UopA <= 1 && UopB <= 1) {
+    if (LatDif < -UopDif) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster "
+                      "though more complex.\n");
+      return true;
+    }
+  }
+  // The new instr is profitable if it is at least as fast as it is complex
+  // when either other instr is complex.
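+  // Hypothetical example (the costs below are illustrative, not taken from a
+  // real scheduling model): folding an add of 1 uop and latency 1 into a load
+  // pair of 2 uops and latency 4 to form a pre-indexed load pair of 3 uops
+  // and latency 4 gives LatDif = 0 and UopDif = 0, so the branch below
+  // accepts the replacement.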
+  else if (UopA > 1 || UopB > 1) {
+    if (LatDif <= -UopDif) {
+      DEBUG(dbgs() << "Evaluating instructions: replacement is faster "
+                      "though not more complex.\n");
+      return true;
+    }
+  }
+
+  // It is not profitable.
+  return false;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                            MachineBasicBlock::iterator MergeMI,
@@ -1344,6 +1417,11 @@
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
+
+  // Evaluate whether the new instruction is a better choice than both old ones.
+  if (!isProfitable(NewOpc, *I, *Update))
+    return NextI;
+
   MachineInstrBuilder MIB;
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
@@ -1373,7 +1451,7 @@
     ++NumPostFolded;
     DEBUG(dbgs() << "Creating post-indexed load/store.");
   }
-  DEBUG(dbgs() << "  Replacing instructions:\n    ");
+  DEBUG(dbgs() << "    Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
   DEBUG(Update->print(dbgs()));
@@ -1767,6 +1845,16 @@
   TRI = Subtarget->getRegisterInfo();
   AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();

+  OptForMinSize = Fn.getFunction()->optForMinSize();
+
+  const TargetSubtargetInfo &STI = Fn.getSubtarget();
+  TSM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  // TODO: For now, only support targets with a scheduling model. In order to
+  // support a target that has itineraries instead, isProfitable() has to be
+  // modified to calculate the latency and the number of uops from the
+  // itineraries.
+  HasCostModel = TSM.hasInstrSchedModel();
+
   // Resize the modified and used register bitfield trackers. We do this once
   // per function and then clear the bitfield each time we optimize a load or
   // store.
Index: llvm/test/CodeGen/AArch64/ldst-opt.ll
===================================================================
--- llvm/test/CodeGen/AArch64/ldst-opt.ll
+++ llvm/test/CodeGen/AArch64/ldst-opt.ll
@@ -1,5 +1,6 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN
+; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS

 ; This file contains tests for the AArch64 load/store optimizer.
@@ -7,8 +8,8 @@
 %s.byte = type { i8, i8 }
 %s.halfword = type { i16, i16 }
 %s.word = type { i32, i32 }
-%s.doubleword = type { i64, i32 }
-%s.quadword = type { fp128, i32 }
+%s.doubleword = type { i64, i64 }
+%s.quadword = type { fp128, fp128 }
 %s.float = type { float, i32 }
 %s.double = type { double, i32 }
 %struct.byte = type { %padding, %s.byte }
@@ -236,8 +237,10 @@
 define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind {
 ; CHECK-LABEL: load-pair-pre-indexed-word
-; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
-; CHECK-NOT: add x0, x0, #32
+; GENERIC: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]!
+; GENERIC-NOT: add x0, x0, #32
+; EXYNOS: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]{{$}}
+; EXYNOS: add x0, x0, #32
 entry:
   %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0
   %a1 = load i32, i32* %a, align 4
@@ -267,6 +270,41 @@
   ret void
 }

+define void @load-pair-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind {
+; CHECK-LABEL: load-pair-pre-indexed-doubleword
+; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; GENERIC-NOT: add x0, x0, #32
+; EXYNOS: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]{{$}}
+; EXYNOS: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  %a1 = load i64, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  %b1 = load i64, i64* %b, align 8
+  %add = add i64 %a1, %b1
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %add)
+  ret void
+}
+
+define void @store-pair-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind {
+; CHECK-LABEL: store-pair-pre-indexed-doubleword
+; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]!
+; CHECK-NOT: add x0, x0, #32
+entry:
+  %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0
+  store i64 %val, i64* %a, align 8
+  %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1
+  store i64 %val, i64* %b, align 8
+  br label %bar
+bar:
+  %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1
+  tail call void @bar_doubleword(%s.doubleword* %c, i64 %val)
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; add x8, x8, #16
@@ -1031,7 +1069,6 @@
 define void @store-pair-post-indexed-word() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-word
 ; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { i32, i32 }, align 8
   %dst = alloca { i32, i32 }, align 8
@@ -1050,7 +1087,6 @@
 define void @store-pair-post-indexed-doubleword() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-doubleword
 ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { i64, i64 }, align 8
   %dst = alloca { i64, i64 }, align 8
@@ -1069,7 +1105,6 @@
 define void @store-pair-post-indexed-float() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-float
 ; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16
-; CHECK: ret
   %src = alloca { float, float }, align 8
   %dst = alloca { float, float }, align 8
@@ -1088,7 +1123,6 @@
 define void @store-pair-post-indexed-double() nounwind {
 ; CHECK-LABEL: store-pair-post-indexed-double
 ; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32
-; CHECK: ret
   %src = alloca { double, double }, align 8
   %dst = alloca { double, double }, align 8
@@ -1104,6 +1138,26 @@
   ret void
 }

+define void @store-pair-post-indexed-quadword() nounwind {
+; CHECK-LABEL: store-pair-post-indexed-quadword
+; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64
+; EXYNOS: str q{{[0-9]+}}, [sp]
+; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16]
+  %src = alloca { fp128, fp128 }, align 8
+  %dst = alloca { fp128, fp128 }, align 8
+
+  %src.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 0
+  %src.real = load fp128, fp128* %src.realp
+  %src.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 1
+  %src.imag = load fp128, fp128* %src.imagp
+
+  %dst.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 0
+  %dst.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 1
+  store fp128 %src.real, fp128* %dst.realp
+  store fp128 %src.imag, fp128* %dst.imagp
+  ret void
+}
+
 ; Check the following transform:
 ;
 ; (ldr|str) X, [x20]
@@ -1287,7 +1341,8 @@
 define void @post-indexed-paired-min-offset(i64* %a, i64* %b, i64 %count) nounwind {
 ; CHECK-LABEL: post-indexed-paired-min-offset
-; CHECK: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
+; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
+; EXYNOS: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}]{{$}}
 ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [x{{[0-9]+}}], #-512
   br label %for.body
 for.body:
@@ -1340,10 +1395,8 @@
 ; scalar stores which should get merged by AArch64LoadStoreOptimizer.
 define void @merge_zr32(i32* %p) {
 ; CHECK-LABEL: merge_zr32:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1354,11 +1407,9 @@
 ; Same as merge_zr32 but the merged stores should also get paried.
 define void @merge_zr32_2(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1373,13 +1424,11 @@
 ; Like merge_zr32_2, but checking the largest allowed stp immediate offset.
 define void @merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: merge_zr32_2_offset:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504]
-; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #504]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}, #504]
+; STRICTALIGN: str wzr, [x{{[0-9]+}}, #504]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516]
-; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 126
   store i32 0, i32* %p0
@@ -1397,14 +1446,12 @@
 ; instruction.
 define void @no_merge_zr32_2_offset(i32* %p) {
 ; CHECK-LABEL: no_merge_zr32_2_offset:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096]
-; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4096]
+; STRICTALIGN: str wzr, [x{{[0-9]+}}, #4096]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108]
-; CHECK-NEXT: ret
 entry:
   %p0 = getelementptr i32, i32* %p, i32 1024
   store i32 0, i32* %p0
@@ -1422,14 +1469,12 @@
 ; err on the side that allows for stp q instruction generation.
 define void @merge_zr32_3(i32* %p) {
 ; CHECK-LABEL: merge_zr32_3:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24]
-; CHECK-NEXT: ret
 entry:
   store i32 0, i32* %p
   %p1 = getelementptr i32, i32* %p, i32 1
@@ -1452,10 +1497,8 @@
 ; Like merge_zr32, but with 2-vector type.
 define void @merge_zr32_2vec(<2 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_2vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store <2 x i32> zeroinitializer, <2 x i32>* %p
   ret void
@@ -1464,12 +1507,10 @@
 ; Like merge_zr32, but with 3-vector type.
 define void @merge_zr32_3vec(<3 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_3vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
 ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <3 x i32> zeroinitializer, <3 x i32>* %p
   ret void
@@ -1478,11 +1519,9 @@
 ; Like merge_zr32, but with 4-vector type.
 define void @merge_zr32_4vec(<4 x i32>* %p) {
 ; CHECK-LABEL: merge_zr32_4vec:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <4 x i32> zeroinitializer, <4 x i32>* %p
   ret void
@@ -1491,10 +1530,8 @@
 ; Like merge_zr32, but with 2-vector float type.
 define void @merge_zr32_2vecf(<2 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_2vecf:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; NOSTRICTALIGN: str xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 entry:
   store <2 x float> zeroinitializer, <2 x float>* %p
   ret void
@@ -1503,11 +1540,9 @@
 ; Like merge_zr32, but with 4-vector float type.
 define void @merge_zr32_4vecf(<4 x float>* %p) {
 ; CHECK-LABEL: merge_zr32_4vecf:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp wzr, wzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8]
-; CHECK-NEXT: ret
 entry:
   store <4 x float> zeroinitializer, <4 x float>* %p
   ret void
@@ -1516,8 +1551,7 @@
 ; Similar to merge_zr32, but for 64-bit values.
 define void @merge_zr64(i64* %p) {
 ; CHECK-LABEL: merge_zr64:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
@@ -1529,8 +1563,7 @@
 ; Similar to merge_zr32, but for 64-bit values and with unaligned stores.
 define void @merge_zr64_unalign(<2 x i64>* %p) {
 ; CHECK-LABEL: merge_zr64_unalign:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; NOSTRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
@@ -1547,7 +1580,6 @@
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
 ; STRICTALIGN: strb
-; CHECK-NEXT: ret
 entry:
   store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1
   ret void
@@ -1557,12 +1589,10 @@
 ; vector store since the zero constant vector has multiple uses.
 define void @merge_zr64_2(i64* %p) {
 ; CHECK-LABEL: merge_zr64_2:
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; STRICTALIGN: stp xzr, xzr, [x{{[0-9]+}}]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1577,9 +1607,7 @@
 ; Like merge_zr64, but with 2-vector double type.
 define void @merge_zr64_2vecd(<2 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_2vecd:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 entry:
   store <2 x double> zeroinitializer, <2 x double>* %p
   ret void
@@ -1588,10 +1616,8 @@
 ; Like merge_zr64, but with 3-vector i64 type.
 define void @merge_zr64_3vec(<3 x i64>* %p) {
 ; CHECK-LABEL: merge_zr64_3vec:
-; CHECK: // %entry
-; CHECK-NEXT: stp xzr, xzr, [x{{[0-9]+}}]
+; CHECK: stp xzr, xzr, [x{{[0-9]+}}]
 ; CHECK-NEXT: str xzr, [x{{[0-9]+}}, #16]
-; CHECK-NEXT: ret
 entry:
   store <3 x i64> zeroinitializer, <3 x i64>* %p
   ret void
@@ -1600,10 +1626,10 @@
 ; Like merge_zr64_2, but with 4-vector double type.
 define void @merge_zr64_4vecd(<4 x double>* %p) {
 ; CHECK-LABEL: merge_zr64_4vecd:
-; CHECK: // %entry
-; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
-; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
-; CHECK-NEXT: ret
+; CHECK: movi v[[REG:[0-9]]].2d, #0000000000000000
+; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16]
+; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}]
 entry:
   store <4 x double> zeroinitializer, <4 x double>* %p
   ret void
@@ -1612,15 +1638,13 @@
 ; Verify that non-consecutive merges do not generate q0
 define void @merge_multiple_128bit_stores(i64* %p) {
 ; CHECK-LABEL: merge_multiple_128bit_stores
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: str q0, [x0]
 ; NOSTRICTALIGN-NEXT: stur q0, [x0, #24]
 ; NOSTRICTALIGN-NEXT: str q0, [x0, #48]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN: stp xzr, xzr, [x0]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1639,15 +1663,13 @@
 ; Verify that large stores generate stp q
 define void @merge_multiple_128bit_stores_consec(i64* %p) {
 ; CHECK-LABEL: merge_multiple_128bit_stores_consec
-; CHECK: // %entry
-; NOSTRICTALIGN-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000
+; NOSTRICTALIGN: movi v[[REG:[0-9]]].2d, #0000000000000000
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}]
 ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}, #32]
-; STRICTALIGN-NEXT: stp xzr, xzr, [x0]
+; STRICTALIGN: stp xzr, xzr, [x0]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32]
 ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48]
-; CHECK-NEXT: ret
 entry:
   store i64 0, i64* %p
   %p1 = getelementptr i64, i64* %p, i64 1
@@ -1669,8 +1691,7 @@
 ; Check for bug 34674 where invalid add of xzr was being generated.
 ; CHECK-LABEL: bug34674:
-; CHECK: // %entry
-; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr
+; CHECK: mov [[ZREG:x[0-9]+]], {{#0|xzr}}
 ; CHECK-DAG: stp [[ZREG]], [[ZREG]], [x0]
 ; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1
 define i64 @bug34674(<2 x i64>* %p) {
Index: llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
+++ llvm/test/CodeGen/AArch64/machine-outliner-remarks.ll
@@ -95,7 +95,7 @@
   ret void
 }

-attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
+attributes #0 = { optsize minsize noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }

 !llvm.dbg.cu = !{!0}
 !llvm.module.flags = !{!3, !4, !5, !6}
Index: llvm/test/CodeGen/AArch64/machine-outliner.ll
===================================================================
--- llvm/test/CodeGen/AArch64/machine-outliner.ll
+++ llvm/test/CodeGen/AArch64/machine-outliner.ll
@@ -61,4 +61,4 @@
 ; CHECK-NEXT: str w8, [sp], #16
 ; CHECK-NEXT: ret

-attributes #0 = { noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
+attributes #0 = { optsize minsize noredzone nounwind ssp uwtable "no-frame-pointer-elim"="false" "target-cpu"="cyclone" }
Index: llvm/test/CodeGen/AArch64/stream-neon.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/stream-neon.ll
@@ -0,0 +1,288 @@
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cortex-a57 -o - | FileCheck %s --check-prefixes=CHECK,CORTEX
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=cyclone -o - | FileCheck %s --check-prefixes=CHECK,CYCLON
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=exynos-m1 -o - | FileCheck %s --check-prefixes=CHECK,EXYNOS
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=falkor -o - | FileCheck %s --check-prefixes=CHECK,FALKOR
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=kryo -o - | FileCheck %s --check-prefixes=CHECK,KRYO
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mcpu=thunderx -o - | FileCheck %s --check-prefixes=CHECK,THNDRX
+
+; McCalpin, John D., 1995:
+; "Memory Bandwidth and Machine Balance in Current High Performance Computers",
+; IEEE Computer Society TCCA Newsletter, December 1995.
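+;
+; The STREAM kernels below (copy, scale, add, triad) are unrolled by four
+; <4 x float> elements per iteration, so that the load/store optimizer can
+; form ldp/stp of q registers and, where the scheduling model of the target
+; considers it profitable, also fold the index increment into a post-indexed
+; form.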
+
+define void @copy(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture %c, i64 %length) local_unnamed_addr #0 {
+entry:
+  %cmp53 = icmp eq i64 %length, 0
+  br i1 %cmp53, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.054 = phi i64 [ %add24, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.054
+  %1 = load <4 x float>, <4 x float>* %0, align 16
+  %add2 = add i64 %i.054, 1
+  %2 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %3 = load <4 x float>, <4 x float>* %2, align 16
+  %add6 = add i64 %i.054, 2
+  %4 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %5 = load <4 x float>, <4 x float>* %4, align 16
+  %add10 = add i64 %i.054, 3
+  %6 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %7 = load <4 x float>, <4 x float>* %6, align 16
+
+  %8 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.054
+  store <4 x float> %1, <4 x float>* %8, align 16
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %3, <4 x float>* %9, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %5, <4 x float>* %10, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %7, <4 x float>* %11, align 16
+
+  %add24 = add i64 %i.054, 4
+  %cmp = icmp ult i64 %add24, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: copy:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+}
+
+define void @scale(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture %c, float %scalar, i64 %length) local_unnamed_addr #0 {
+entry:
+  %0 = insertelement <4 x float> undef, float %scalar, i32 0
+  %vecinit3.i = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  %cmp68 = icmp eq i64 %length, 0
+  br i1 %cmp68, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.069 = phi i64 [ %add28, %for.body ], [ 0, %for.body.preheader ]
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.069
+  %2 = load <4 x float>, <4 x float>* %1, align 16
+  %add2 = add i64 %i.069, 1
+  %3 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %4 = load <4 x float>, <4 x float>* %3, align 16
+  %add6 = add i64 %i.069, 2
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %6 = load <4 x float>, <4 x float>* %5, align 16
+  %add10 = add i64 %i.069, 3
+  %7 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %8 = load <4 x float>, <4 x float>* %7, align 16
+
+  %mul.i67 = fmul fast <4 x float> %2, %vecinit3.i
+  %mul.i66 = fmul fast <4 x float> %4, %vecinit3.i
+  %mul.i65 = fmul fast <4 x float> %6, %vecinit3.i
+  %mul.i = fmul fast <4 x float> %8, %vecinit3.i
+
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.069
+  store <4 x float> %mul.i67, <4 x float>* %9, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %mul.i66, <4 x float>* %10, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %mul.i65, <4 x float>* %11, align 16
+  %12 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %mul.i, <4 x float>* %12, align 16
+
+  %add28 = add i64 %i.069, 4
+  %cmp = icmp ult i64 %add28, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: scale:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+}
+
+define void @add(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture readonly %b, <4 x float>* noalias nocapture %c, i64 %length) local_unnamed_addr #0 {
+entry:
+  %cmp94 = icmp eq i64 %length, 0
+  br i1 %cmp94, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.095 = phi i64 [ %add43, %for.body ], [ 0, %for.body.preheader ]
+  %0 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.095
+  %1 = load <4 x float>, <4 x float>* %0, align 16
+  %add2 = add i64 %i.095, 1
+  %2 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %3 = load <4 x float>, <4 x float>* %2, align 16
+  %add6 = add i64 %i.095, 2
+  %4 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %5 = load <4 x float>, <4 x float>* %4, align 16
+  %add10 = add i64 %i.095, 3
+  %6 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %7 = load <4 x float>, <4 x float>* %6, align 16
+
+  %8 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %i.095
+  %9 = load <4 x float>, <4 x float>* %8, align 16
+  %10 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add2
+  %11 = load <4 x float>, <4 x float>* %10, align 16
+  %12 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add6
+  %13 = load <4 x float>, <4 x float>* %12, align 16
+  %14 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add10
+  %15 = load <4 x float>, <4 x float>* %14, align 16
+
+  %add.i = fadd fast <4 x float> %9, %1
+  %add.i93 = fadd fast <4 x float> %11, %3
+  %add.i92 = fadd fast <4 x float> %13, %5
+  %add.i91 = fadd fast <4 x float> %15, %7
+
+  %16 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.095
+  store <4 x float> %add.i, <4 x float>* %16, align 16
+  %17 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %add.i93, <4 x float>* %17, align 16
+  %18 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %add.i92, <4 x float>* %18, align 16
+  %19 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %add.i91, <4 x float>* %19, align 16
+
+  %add43 = add i64 %i.095, 4
+  %cmp = icmp ult i64 %add43, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: add:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+}
+
+define void @triad(<4 x float>* noalias nocapture readonly %a, <4 x float>* noalias nocapture readonly %b, <4 x float>* noalias nocapture %c, float %scalar, i64 %length) local_unnamed_addr #0 {
+entry:
+  %0 = insertelement <4 x float> undef, float %scalar, i32 0
+  %vecinit3.i = shufflevector <4 x float> %0, <4 x float> undef, <4 x i32> zeroinitializer
+  %cmp110 = icmp eq i64 %length, 0
+  br i1 %cmp110, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader:                               ; preds = %entry
+  br label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %i.0111 = phi i64 [ %add48, %for.body ], [ 0, %for.body.preheader ]
+  %1 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %i.0111
+  %2 = load <4 x float>, <4 x float>* %1, align 16
+  %add2 = add i64 %i.0111, 1
+  %3 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add2
+  %4 = load <4 x float>, <4 x float>* %3, align 16
+  %add6 = add i64 %i.0111, 2
+  %5 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add6
+  %6 = load <4 x float>, <4 x float>* %5, align 16
+  %add10 = add i64 %i.0111, 3
+  %7 = getelementptr inbounds <4 x float>, <4 x float>* %a, i64 %add10
+  %8 = load <4 x float>, <4 x float>* %7, align 16
+
+  %9 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %i.0111
+  %10 = load <4 x float>, <4 x float>* %9, align 16
+  %11 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add2
+  %12 = load <4 x float>, <4 x float>* %11, align 16
+  %13 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add6
+  %14 = load <4 x float>, <4 x float>* %13, align 16
+  %15 = getelementptr inbounds <4 x float>, <4 x float>* %b, i64 %add10
+  %16 = load <4 x float>, <4 x float>* %15, align 16
+
+  %mul.i109 = fmul fast <4 x float> %2, %vecinit3.i
+  %mul.i108 = fmul fast <4 x float> %4, %vecinit3.i
+  %mul.i107 = fmul fast <4 x float> %6, %vecinit3.i
+  %mul.i = fmul fast <4 x float> %8, %vecinit3.i
+
+  %add.i106 = fadd fast <4 x float> %10, %mul.i109
+  %add.i105 = fadd fast <4 x float> %12, %mul.i108
+  %add.i104 = fadd fast <4 x float> %14, %mul.i107
+  %add.i = fadd fast <4 x float> %16, %mul.i
+
+  %17 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %i.0111
+  store <4 x float> %add.i106, <4 x float>* %17, align 16
+  %18 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add2
+  store <4 x float> %add.i105, <4 x float>* %18, align 16
+  %19 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add6
+  store <4 x float> %add.i104, <4 x float>* %19, align 16
+  %20 = getelementptr inbounds <4 x float>, <4 x float>* %c, i64 %add10
+  store <4 x float> %add.i, <4 x float>* %20, align 16
+
+  %add48 = add i64 %i.0111, 4
+  %cmp = icmp ult i64 %add48, %length
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+; CHECK-LABEL: triad:
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA:[0-9]+]], #-32]{{$}}
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CHECK: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB:[0-9]+]], #-32]{{$}}
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; CORTEX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CYCLON: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; EXYNOS: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]]{{$}}
+; KRYO: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]]{{$}}
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RA]]], #64
+; THNDRX: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; CHECK: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC:[0-9]+]], #-32]{{$}}
+; CORTEX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]], #64
+; CYCLON: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; EXYNOS: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; FALKOR: ldp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RB]]], #64
+; FALKOR: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; KRYO: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+; THNDRX: stp q{{[0-9]+}}, q{{[0-9]+}}, [x[[RC]]]{{$}}
+}
+
+attributes #0 = { norecurse nounwind "correctly-rounded-divide-sqrt-fp-math"="false" "less-precise-fpmad"="false" "no-infs-fp-math"="true" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "unsafe-fp-math"="true" "use-soft-float"="false" "target-features"="+neon" }