Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -27,8 +27,11 @@
 #include "llvm/CodeGen/MachineInstr.h"
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetSchedule.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCInstrDesc.h"
 #include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSchedule.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -154,6 +157,10 @@
   bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
                             unsigned BaseReg, int Offset);
 
+  // Evaluate whether the new instruction is a better choice than the old ones.
+  bool isProfitableMergeUpdate(unsigned New,
+                               MachineInstr &OldA, MachineInstr &OldB);
+
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
   mergeUpdateInsn(MachineBasicBlock::iterator I,
@@ -168,6 +175,9 @@
   // Find and promote load instructions which read directly from store.
   bool tryToPromoteLoadFromStore(MachineBasicBlock::iterator &MBBI);
 
+  // Find and merge base register updates before or after a ld/st instruction.
+  bool tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI);
+
   bool optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt);
 
   bool runOnMachineFunction(MachineFunction &Fn) override;
@@ -578,6 +588,101 @@
          getLdStRegOp(MI).getReg() == AArch64::WZR;
 }
 
+static bool isMergeableLdStUpdate(MachineInstr &MI) {
+  // Opcodes eligible for update merging. It's simpler to keep this separate
+  // from the other switches, though not strictly necessary.
+  unsigned Opc = MI.getOpcode();
+  switch (Opc) {
+  default:
+    return false;
+  // Scaled instructions.
+  case AArch64::STRSui:
+  case AArch64::STRDui:
+  case AArch64::STRQui:
+  case AArch64::STRXui:
+  case AArch64::STRWui:
+  case AArch64::STRHHui:
+  case AArch64::STRBBui:
+  case AArch64::LDRSui:
+  case AArch64::LDRDui:
+  case AArch64::LDRQui:
+  case AArch64::LDRXui:
+  case AArch64::LDRWui:
+  case AArch64::LDRHHui:
+  case AArch64::LDRBBui:
+  // Unscaled instructions.
+  case AArch64::STURSi:
+  case AArch64::STURDi:
+  case AArch64::STURQi:
+  case AArch64::STURWi:
+  case AArch64::STURXi:
+  case AArch64::LDURSi:
+  case AArch64::LDURDi:
+  case AArch64::LDURQi:
+  case AArch64::LDURWi:
+  case AArch64::LDURXi:
+  // Paired instructions.
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+    // Make sure this is a reg+imm (as opposed to an address reloc).
+    if (!getLdStOffsetOp(MI).isImm())
+      return false;
+
+    return true;
+  }
+}
+
+bool AArch64LoadStoreOpt::isProfitableMergeUpdate(unsigned New,
+                                                  MachineInstr &OldA,
+                                                  MachineInstr &OldB) {
+  const MachineFunction *MF = OldA.getMF();
+  // Default as profitable if optimizing for size.
+  if (MF->getFunction()->optForSize())
+    return true;
+
+  TargetSchedModel SM;
+  const TargetSubtargetInfo &STI = MF->getSubtarget();
+  SM.init(STI.getSchedModel(), &STI, STI.getInstrInfo());
+  // Default as profitable in the absence of an instruction scheduling model.
+  if (!SM.hasInstrSchedModel())
+    return true;
+
+  const MCInstrDesc &NewID = TII->get(New),
+                    &OldAID = TII->get(OldA.getOpcode()),
+                    &OldBID = TII->get(OldB.getOpcode());
+  const MCSchedClassDesc
+    *NewSD = SM.getMCSchedModel()->getSchedClassDesc(NewID.getSchedClass()),
+    *OldASD = SM.getMCSchedModel()->getSchedClassDesc(OldAID.getSchedClass()),
+    *OldBSD = SM.getMCSchedModel()->getSchedClassDesc(OldBID.getSchedClass());
+  // Default as profitable without a valid cost model or with variant classes.
+  if (!NewSD->isValid() || NewSD->isVariant() ||
+      !OldASD->isValid() || OldASD->isVariant() ||
+      !OldBSD->isValid() || OldBSD->isVariant())
+    return true;
+
+  unsigned NewLat = SM.computeInstrLatency(New),
+           OldALat = SM.computeInstrLatency(&OldA),
+           OldBLat = SM.computeInstrLatency(&OldB);
+  // It is profitable if faster than the two old instructions combined.
+  if (NewLat < (OldALat + OldBLat))
+    return true;
+  // If just as fast, profitable only with fewer uops than the two combined.
+  else if (NewLat == (OldALat + OldBLat))
+    return (NewSD->NumMicroOps < (OldASD->NumMicroOps + OldBSD->NumMicroOps));
+  // It is not profitable.
+  else
+    return false;
+}
+
 MachineBasicBlock::iterator
 AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
                                            MachineBasicBlock::iterator MergeMI,
@@ -1272,6 +1377,9 @@
 
   unsigned NewOpc = IsPreIdx ? getPreIndexedOpcode(I->getOpcode())
                              : getPostIndexedOpcode(I->getOpcode());
+  if (!isProfitableMergeUpdate(NewOpc, *I, *Update))
+    return ++I;
+
   MachineInstrBuilder MIB;
   if (!isPairedLdSt(*I)) {
     // Non-paired instruction.
@@ -1294,10 +1402,14 @@
   }
   (void)MIB;
 
-  if (IsPreIdx)
+  if (IsPreIdx) {
+    ++NumPreFolded;
     DEBUG(dbgs() << "Creating pre-indexed load/store.");
-  else
+  }
+  else {
+    ++NumPostFolded;
     DEBUG(dbgs() << "Creating post-indexed load/store.");
+  }
   DEBUG(dbgs() << "    Replacing instructions:\n    ");
   DEBUG(I->print(dbgs()));
   DEBUG(dbgs() << "    ");
@@ -1558,6 +1670,60 @@
   return false;
 }
 
+bool
+AArch64LoadStoreOpt::tryToMergeLdStUpdate(MachineBasicBlock::iterator &MBBI) {
+  MachineInstr &MI = *MBBI;
+  MachineBasicBlock::iterator E = MI.getParent()->end();
+  MachineBasicBlock::iterator Update;
+
+  // Look forward to try to form a post-index instruction. For example,
+  // ldr x0, [x20]
+  // add x20, x20, #32
+  //   merged into:
+  // ldr x0, [x20], #32
+  Update = findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit);
+  if (Update != E) {
+    // Merge the update into the ld/st.
+    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false);
+    return true;
+  }
+
+  // Don't know how to handle unscaled pre/post-index versions below, so bail.
+  if (TII->isUnscaledLdSt(MI.getOpcode()))
+    return false;
+
+  // Look back to try to find a pre-index instruction. For example,
+  // add x0, x0, #8
+  // ldr x1, [x0]
+  //   merged into:
+  // ldr x1, [x0, #8]!
+  Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit);
+  if (Update != E) {
+    // Merge the update into the ld/st.
+    MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true);
+    return true;
+  }
+
+  // The immediate in the load/store is scaled by the size of the memory
+  // operation. The immediate in the add we're looking for,
+  // however, is not, so adjust here.
+  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+
+  // Look forward to try to find a pre-index instruction. For example,
+  // ldr x1, [x0, #64]
+  // add x0, x0, #64
+  //   merged into:
+  // ldr x1, [x0, #64]!
+ Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); + if (Update != E) { + // Merge the update into the ld/st. + MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); + return true; + } + + return false; +} + bool AArch64LoadStoreOpt::optimizeBlock(MachineBasicBlock &MBB, bool EnableNarrowZeroStOpt) { bool Modified = false; @@ -1618,7 +1784,6 @@ } else ++MBBI; } - // 3) Find loads and stores that can be merged into a single load or store // pair instruction. // e.g., @@ -1641,119 +1806,15 @@ // ; becomes // ldr x0, [x2], #4 for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end(); - MBBI != E;) { - MachineInstr &MI = *MBBI; - // Do update merging. It's simpler to keep this separate from the above - // switchs, though not strictly necessary. - unsigned Opc = MI.getOpcode(); - switch (Opc) { - default: - // Just move on to the next instruction. - ++MBBI; - break; - // Scaled instructions. - case AArch64::STRSui: - case AArch64::STRDui: - case AArch64::STRQui: - case AArch64::STRXui: - case AArch64::STRWui: - case AArch64::STRHHui: - case AArch64::STRBBui: - case AArch64::LDRSui: - case AArch64::LDRDui: - case AArch64::LDRQui: - case AArch64::LDRXui: - case AArch64::LDRWui: - case AArch64::LDRHHui: - case AArch64::LDRBBui: - // Unscaled instructions. - case AArch64::STURSi: - case AArch64::STURDi: - case AArch64::STURQi: - case AArch64::STURWi: - case AArch64::STURXi: - case AArch64::LDURSi: - case AArch64::LDURDi: - case AArch64::LDURQi: - case AArch64::LDURWi: - case AArch64::LDURXi: - // Paired instructions. - case AArch64::LDPSi: - case AArch64::LDPSWi: - case AArch64::LDPDi: - case AArch64::LDPQi: - case AArch64::LDPWi: - case AArch64::LDPXi: - case AArch64::STPSi: - case AArch64::STPDi: - case AArch64::STPQi: - case AArch64::STPWi: - case AArch64::STPXi: { - // Make sure this is a reg+imm (as opposed to an address reloc). - if (!getLdStOffsetOp(MI).isImm()) { - ++MBBI; - break; - } - // Look forward to try to form a post-index instruction. For example, - // ldr x0, [x20] - // add x20, x20, #32 - // merged into: - // ldr x0, [x20], #32 - MachineBasicBlock::iterator Update = - findMatchingUpdateInsnForward(MBBI, 0, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/false); + MBBI != E;) + if (isMergeableLdStUpdate(*MBBI)) { + if (tryToMergeLdStUpdate(MBBI)) Modified = true; - ++NumPostFolded; - break; - } - - // Don't know how to handle unscaled pre/post-index versions below, so - // move to the next instruction. - if (TII->isUnscaledLdSt(Opc)) { + else ++MBBI; - break; - } - - // Look back to try to find a pre-index instruction. For example, - // add x0, x0, #8 - // ldr x1, [x0] - // merged into: - // ldr x1, [x0, #8]! - Update = findMatchingUpdateInsnBackward(MBBI, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. - MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - // The immediate in the load/store is scaled by the size of the memory - // operation. The immediate in the add we're looking for, - // however, is not, so adjust here. - int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI); - - // Look forward to try to find a post-index instruction. For example, - // ldr x1, [x0, #64] - // add x0, x0, #64 - // merged into: - // ldr x1, [x0, #64]! - Update = findMatchingUpdateInsnForward(MBBI, UnscaledOffset, UpdateLimit); - if (Update != E) { - // Merge the update into the ld/st. 
- MBBI = mergeUpdateInsn(MBBI, Update, /*IsPreIdx=*/true); - Modified = true; - ++NumPreFolded; - break; - } - - // Nothing found. Just move to the next instruction. - ++MBBI; - break; - } } - } + else + ++MBBI; return Modified; } Index: llvm/test/CodeGen/AArch64/ldst-opt.ll =================================================================== --- llvm/test/CodeGen/AArch64/ldst-opt.ll +++ llvm/test/CodeGen/AArch64/ldst-opt.ll @@ -1,5 +1,6 @@ -; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=NOSTRICTALIGN %s -; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+strict-align -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck --check-prefix=CHECK --check-prefix=STRICTALIGN %s +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,NOSTRICTALIGN +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=0 -disable-lsr -verify-machineinstrs -mattr=+strict-align -o - %s | FileCheck %s --check-prefixes=CHECK,GENERIC,STRICTALIGN +; RUN: llc -mtriple=aarch64-linux-gnu -aarch64-enable-atomic-cfg-tidy=1 -disable-lsr -verify-machineinstrs -mcpu=exynos-m1 -o - %s | FileCheck %s --check-prefixes=CHECK,EXYNOS ; This file contains tests for the AArch64 load/store optimizer. @@ -7,8 +8,8 @@ %s.byte = type { i8, i8 } %s.halfword = type { i16, i16 } %s.word = type { i32, i32 } -%s.doubleword = type { i64, i32 } -%s.quadword = type { fp128, i32 } +%s.doubleword = type { i64, i64 } +%s.quadword = type { fp128, fp128 } %s.float = type { float, i32 } %s.double = type { double, i32 } %struct.byte = type { %padding, %s.byte } @@ -145,7 +146,9 @@ define void @load-pre-indexed-quadword(%struct.quadword* %ptr) nounwind { ; CHECK-LABEL: load-pre-indexed-quadword -; CHECK: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; GENERIC: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; EXYNOS: ldr q{{[0-9]+}}, [x{{[0-9]+}}, #32] +; EXYNOS-NEXT: add x{{[0-9]+}}, x{{[0-9]+}}, #32 entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 %add = load fp128, fp128* %a, align 16 @@ -158,7 +161,9 @@ define void @store-pre-indexed-quadword(%struct.quadword* %ptr, fp128 %val) nounwind { ; CHECK-LABEL: store-pre-indexed-quadword -; CHECK: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; GENERIC: str q{{[0-9]+}}, [x{{[0-9]+}}, #32]! +; EXYNOS: add x{{[0-9]+}}, x{{[0-9]+}}, #32 +; EXYNOS-NEXT: str q{{[0-9]+}}, [x{{[0-9]+}}, #32] entry: %a = getelementptr inbounds %struct.quadword, %struct.quadword* %ptr, i64 0, i32 1, i32 0 store fp128 %val, fp128* %a, align 16 @@ -236,7 +241,9 @@ define void @load-pair-pre-indexed-word(%struct.word* %ptr) nounwind { ; CHECK-LABEL: load-pair-pre-indexed-word -; CHECK: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; GENERIC: ldp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; EXYNOS: ldr w{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: ldr w{{[0-9]+}}, [x0, #4] ; CHECK-NOT: add x0, x0, #32 entry: %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 @@ -253,7 +260,9 @@ define void @store-pair-pre-indexed-word(%struct.word* %ptr, i32 %val) nounwind { ; CHECK-LABEL: store-pair-pre-indexed-word -; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; GENERIC: stp w{{[0-9]+}}, w{{[0-9]+}}, [x0, #32]! +; EXYNOS: str w{{[0-9]+}}, [x0, #32]! 
+; EXYNOS-NEXT: str w{{[0-9]+}}, [x0, #4] ; CHECK-NOT: add x0, x0, #32 entry: %a = getelementptr inbounds %struct.word, %struct.word* %ptr, i64 0, i32 1, i32 0 @@ -267,6 +276,43 @@ ret void } +define void @load-pair-pre-indexed-doubleword(%struct.doubleword* %ptr) nounwind { +; CHECK-LABEL: load-pair-pre-indexed-doubleword +; GENERIC: ldp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]! +; EXYNOS: ldr x{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: ldr x{{[0-9]+}}, [x0, #8] +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + %a1 = load i64, i64* %a, align 8 + %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1 + %b1 = load i64, i64* %b, align 8 + %add = add i64 %a1, %b1 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %add) + ret void +} + +define void @store-pair-pre-indexed-doubleword(%struct.doubleword* %ptr, i64 %val) nounwind { +; CHECK-LABEL: store-pair-pre-indexed-doubleword +; GENERIC: stp x{{[0-9]+}}, x{{[0-9]+}}, [x0, #32]! +; EXYNOS: str x{{[0-9]+}}, [x0, #32]! +; EXYNOS-NEXT: str x{{[0-9]+}}, [x0, #8] +; CHECK-NOT: add x0, x0, #32 +entry: + %a = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 0 + store i64 %val, i64* %a, align 8 + %b = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1, i32 1 + store i64 %val, i64* %b, align 8 + br label %bar +bar: + %c = getelementptr inbounds %struct.doubleword, %struct.doubleword* %ptr, i64 0, i32 1 + tail call void @bar_doubleword(%s.doubleword* %c, i64 %val) + ret void +} + ; Check the following transform: ; ; add x8, x8, #16 @@ -1031,7 +1077,6 @@ define void @store-pair-post-indexed-word() nounwind { ; CHECK-LABEL: store-pair-post-indexed-word ; CHECK: stp w{{[0-9]+}}, w{{[0-9]+}}, [sp], #16 -; CHECK: ret %src = alloca { i32, i32 }, align 8 %dst = alloca { i32, i32 }, align 8 @@ -1050,7 +1095,6 @@ define void @store-pair-post-indexed-doubleword() nounwind { ; CHECK-LABEL: store-pair-post-indexed-doubleword ; CHECK: stp x{{[0-9]+}}, x{{[0-9]+}}, [sp], #32 -; CHECK: ret %src = alloca { i64, i64 }, align 8 %dst = alloca { i64, i64 }, align 8 @@ -1069,7 +1113,6 @@ define void @store-pair-post-indexed-float() nounwind { ; CHECK-LABEL: store-pair-post-indexed-float ; CHECK: stp s{{[0-9]+}}, s{{[0-9]+}}, [sp], #16 -; CHECK: ret %src = alloca { float, float }, align 8 %dst = alloca { float, float }, align 8 @@ -1088,7 +1131,6 @@ define void @store-pair-post-indexed-double() nounwind { ; CHECK-LABEL: store-pair-post-indexed-double ; CHECK: stp d{{[0-9]+}}, d{{[0-9]+}}, [sp], #32 -; CHECK: ret %src = alloca { double, double }, align 8 %dst = alloca { double, double }, align 8 @@ -1104,6 +1146,27 @@ ret void } +define void @store-pair-post-indexed-quadword() nounwind { +; CHECK-LABEL: store-pair-post-indexed-quadword +; GENERIC: stp q{{[0-9]+}}, q{{[0-9]+}}, [sp], #64 +; EXYNOS: str q{{[0-9]+}}, [sp] +; EXYNOS-NEXT: str q{{[0-9]+}}, [sp, #16] +; EXYNOS-NEXT: add sp, sp, #64 + %src = alloca { fp128, fp128 }, align 8 + %dst = alloca { fp128, fp128 }, align 8 + + %src.realp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 0 + %src.real = load fp128, fp128* %src.realp + %src.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %src, i32 0, i32 1 + %src.imag = load fp128, fp128* %src.imagp + + %dst.realp = getelementptr 
inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 0 + %dst.imagp = getelementptr inbounds { fp128, fp128 }, { fp128, fp128 }* %dst, i32 0, i32 1 + store fp128 %src.real, fp128* %dst.realp + store fp128 %src.imag, fp128* %dst.imagp + ret void +} + ; Check the following transform: ; ; (ldr|str) X, [x20] @@ -1343,6 +1406,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1358,6 +1422,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1379,6 +1444,7 @@ ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #508] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #512] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #516] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #504] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 126 @@ -1404,6 +1470,8 @@ ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4100] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4104] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #4108] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #4096] ; CHECK-NEXT: ret entry: %p0 = getelementptr i32, i32* %p, i32 1024 @@ -1429,6 +1497,9 @@ ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #16] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #24] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] ; CHECK-NEXT: ret entry: store i32 0, i32* %p @@ -1455,6 +1526,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x i32> zeroinitializer, <2 x i32>* %p @@ -1469,6 +1541,8 @@ ; NOSTRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: str wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str wzr, [x{{[0-9]+}}, #8] ; CHECK-NEXT: ret entry: store <3 x i32> zeroinitializer, <3 x i32>* %p @@ -1482,6 +1556,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x i32> zeroinitializer, <4 x i32>* %p @@ -1494,6 +1569,7 @@ ; CHECK: // %entry ; NOSTRICTALIGN-NEXT: str xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] +; EXYNOS-NEXT: str xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x float> zeroinitializer, <2 x float>* %p @@ -1507,6 +1583,7 @@ ; NOSTRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp wzr, wzr, [x{{[0-9]+}}, #8] +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x float> zeroinitializer, <4 x float>* %p @@ -1547,6 +1624,7 @@ ; STRICTALIGN: strb ; STRICTALIGN: strb ; STRICTALIGN: strb +; EXYNOS-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <2 x i64> zeroinitializer, <2 x i64>* %p, align 1 @@ -1562,6 +1640,9 @@ ; NOSTRICTALIGN-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] ; STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}] ; 
STRICTALIGN-NEXT: stp xzr, xzr, [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1601,8 +1682,11 @@ define void @merge_zr64_4vecd(<4 x double>* %p) { ; CHECK-LABEL: merge_zr64_4vecd: ; CHECK: // %entry -; CHECK-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 -; CHECK-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; GENERIC-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; GENERIC-NEXT: stp q[[REG]], q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] ; CHECK-NEXT: ret entry: store <4 x double> zeroinitializer, <4 x double>* %p @@ -1620,6 +1704,10 @@ ; STRICTALIGN-NEXT: stp xzr, xzr, [x0] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #24] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q0, [x0] +; EXYNOS-NEXT: stur q0, [x0, #24] +; EXYNOS-NEXT: str q0, [x0, #48] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1647,6 +1735,11 @@ ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #16] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #32] ; STRICTALIGN-NEXT: stp xzr, xzr, [x0, #48] +; EXYNOS-NEXT: movi v[[REG:[0-9]]].2d, #0000000000000000 +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #16] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #32] +; EXYNOS-NEXT: str q[[REG]], [x{{[0-9]+}}, #48] ; CHECK-NEXT: ret entry: store i64 0, i64* %p @@ -1670,7 +1763,7 @@ ; Check for bug 34674 where invalid add of xzr was being generated. ; CHECK-LABEL: bug34674: ; CHECK: // %entry -; CHECK-NEXT: mov [[ZREG:x[0-9]+]], xzr +; CHECK-NEXT: mov [[ZREG:x[0-9]+]], {{#0|xzr}} ; CHECK-DAG: stp [[ZREG]], [[ZREG]], [x0] ; CHECK-DAG: add x{{[0-9]+}}, [[ZREG]], #1 define i64 @bug34674(<2 x i64>* %p) {
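
Reviewer note (not part of the patch): the decision rule in isProfitableMergeUpdate reduces to the comparison sketched below. This is a minimal standalone illustration; the instruction names and the latency/micro-op figures are made-up placeholders, not values taken from the Exynos M1 or any other scheduling model.

#include <iostream>

struct InstrCost {
  unsigned Latency;  // latency in cycles, per the scheduling model
  unsigned MicroOps; // number of micro-ops issued
};

// The merged pre/post-indexed instruction replaces a plain load/store plus a
// separate base-register update (e.g. an ADD): it must be strictly faster than
// the two combined, or just as fast while issuing fewer micro-ops.
static bool isProfitable(InstrCost New, InstrCost OldA, InstrCost OldB) {
  unsigned OldLat = OldA.Latency + OldB.Latency;
  unsigned OldUops = OldA.MicroOps + OldB.MicroOps;
  if (New.Latency < OldLat)
    return true;                   // strictly faster: always a win
  if (New.Latency == OldLat)
    return New.MicroOps < OldUops; // tie on latency: decide on micro-ops
  return false;                    // slower: keep the separate instructions
}

int main() {
  // Hypothetical costs only: a pre-indexed 128-bit store that cracks into
  // three micro-ops is not merged, because STR Q + ADD is just as fast while
  // issuing fewer micro-ops, which is the behavior the EXYNOS checks expect.
  InstrCost PreIdxStrQ{2, 3}; // STR Qt, [Xn, #imm]!  (merged form)
  InstrCost StrQ{1, 1};       // STR Qt, [Xn, #imm]
  InstrCost AddImm{1, 1};     // ADD Xn, Xn, #imm
  std::cout << std::boolalpha << isProfitable(PreIdxStrQ, StrQ, AddImm)
            << '\n'; // prints "false"
}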