Index: llvm/lib/Target/AArch64/AArch64InstrInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -103,6 +103,9 @@
   /// Returns whether the instruction is a pre-indexed load/store.
   static bool isPreLdSt(const MachineInstr &MI);
 
+  /// Returns whether the instruction is a paired load/store.
+  static bool isPairedLdSt(const MachineInstr &MI);
+
   /// Returns whether the instruction is FP or NEON.
   static bool isFpOrNEON(const MachineInstr &MI);
 
@@ -492,6 +495,11 @@
 /// Return opcode to be used for indirect calls.
 unsigned getBLRCallOpcode(const MachineFunction &MF);
 
+/// Returns the base register operand of a load/store.
+const MachineOperand &getLdStBaseOp(const MachineInstr &MI);
+
+/// Returns the immediate offset operand of a load/store.
+const MachineOperand &getLdStOffsetOp(const MachineInstr &MI);
 
 // struct TSFlags {
 #define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
Index: llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -3152,6 +3152,26 @@
   return isPreLd(MI) || isPreSt(MI);
 }
 
+bool AArch64InstrInfo::isPairedLdSt(const MachineInstr &MI) {
+  switch (MI.getOpcode()) {
+  default:
+    return false;
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPDi:
+  case AArch64::LDPQi:
+  case AArch64::LDPWi:
+  case AArch64::LDPXi:
+  case AArch64::STPSi:
+  case AArch64::STPDi:
+  case AArch64::STPQi:
+  case AArch64::STPWi:
+  case AArch64::STPXi:
+  case AArch64::STGPi:
+    return true;
+  }
+}
+
 static const TargetRegisterClass *getRegClass(const MachineInstr &MI,
                                               Register Reg) {
   if (MI.getParent() == nullptr)
@@ -8004,6 +8024,22 @@
   return AArch64::BLR;
 }
 
+namespace llvm {
+const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2
+                                                                            : 1;
+  return MI.getOperand(Idx);
+}
+
+const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3
+                                                                            : 2;
+  return MI.getOperand(Idx);
+}
+} // namespace llvm
+
 #define GET_INSTRINFO_HELPERS
 #define GET_INSTRMAP_INFO
 #include "AArch64GenInstrInfo.inc"
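[Review note, not part of the patch] The helpers moved into AArch64InstrInfo above encode the AArch64 load/store operand order: data register(s) first, then the base register, then the immediate offset, with paired and pre-indexed forms carrying one extra leading operand (the second data register or the write-back base). That is why the base/offset indices are 2/3 for those forms and 1/2 otherwise. A minimal caller sketch built only on hooks that appear in this patch; byteOffsetOf is a hypothetical name:

  #include "AArch64InstrInfo.h"
  using namespace llvm;

  // Sketch: scaled forms (e.g. LDPQi, STRQui) keep their offset in units of
  // the access size, while unscaled forms (e.g. STURQi) keep it in bytes.
  static int64_t byteOffsetOf(const MachineInstr &MI) {
    int64_t Imm = getLdStOffsetOp(MI).getImm();
    return AArch64InstrInfo::hasUnscaledLdStOffset(MI.getOpcode())
               ? Imm
               : Imm * AArch64InstrInfo::getMemScale(MI);
  }
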
Index: llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -556,26 +556,6 @@
   }
 }
 
-static bool isPairedLdSt(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    return false;
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPDi:
-  case AArch64::LDPQi:
-  case AArch64::LDPWi:
-  case AArch64::LDPXi:
-  case AArch64::STPSi:
-  case AArch64::STPDi:
-  case AArch64::STPQi:
-  case AArch64::STPWi:
-  case AArch64::STPXi:
-  case AArch64::STGPi:
-    return true;
-  }
-}
-
 static bool isPreLdStPairCandidate(MachineInstr &FirstMI, MachineInstr &MI) {
 
   unsigned OpcA = FirstMI.getOpcode();
@@ -610,7 +590,7 @@
 // Returns the scale and offset range of pre/post indexed variants of MI.
 static void getPrePostIndexedMemOpInfo(const MachineInstr &MI, int &Scale,
                                        int &MinOffset, int &MaxOffset) {
-  bool IsPaired = isPairedLdSt(MI);
+  bool IsPaired = AArch64InstrInfo::isPairedLdSt(MI);
   bool IsTagStore = isTagStore(MI);
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
@@ -632,17 +612,8 @@
   bool IsPreLdSt = AArch64InstrInfo::isPreLdSt(MI);
   if (IsPreLdSt)
     PairedRegOp += 1;
-  unsigned Idx = isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStBaseOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 2 : 1;
-  return MI.getOperand(Idx);
-}
-
-static const MachineOperand &getLdStOffsetOp(const MachineInstr &MI) {
-  unsigned Idx = isPairedLdSt(MI) || AArch64InstrInfo::isPreLdSt(MI) ? 3 : 2;
+  unsigned Idx =
+      AArch64InstrInfo::isPairedLdSt(MI) || IsPreLdSt ? PairedRegOp : 0;
   return MI.getOperand(Idx);
 }
 
@@ -1823,7 +1794,7 @@
   MachineInstrBuilder MIB;
   int Scale, MinOffset, MaxOffset;
   getPrePostIndexedMemOpInfo(*I, Scale, MinOffset, MaxOffset);
-  if (!isPairedLdSt(*I)) {
+  if (!AArch64InstrInfo::isPairedLdSt(*I)) {
     // Non-paired instruction.
     MIB = BuildMI(*I->getParent(), I, I->getDebugLoc(), TII->get(NewOpc))
               .add(getLdStRegOp(*Update))
@@ -1944,7 +1915,7 @@
   // behavior in this case unlike normal stores, and always performs writeback
   // after reading the source register value.
   if (!isTagStore(MemMI) && MemMI.getOpcode() != AArch64::STGPi) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
@@ -2015,7 +1986,7 @@
   // If the base register overlaps a destination register, we can't
   // merge the update.
   if (!isTagStore(MemMI)) {
-    bool IsPairedInsn = isPairedLdSt(MemMI);
+    bool IsPairedInsn = AArch64InstrInfo::isPairedLdSt(MemMI);
     for (unsigned i = 0, e = IsPairedInsn ? 2 : 1; i != e; ++i) {
       Register DestReg = getLdStRegOp(MemMI, i).getReg();
       if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
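[Review note, not part of the patch] The last two hunks keep write-back folding legal: merging "add x0, x0, #16" into "ldp x0, x1, [x0]" would produce "ldp x0, x1, [x0], #16", in which x0 is both a load destination and the write-back base, a CONSTRAINED UNPREDICTABLE form. A sketch of that legality test, mirroring the loop in the pass; baseOverlapsDest is a hypothetical name and getLdStRegOp stands for the pass-local helper of the same name:

  // Assumes the surrounding context of AArch64LoadStoreOptimizer.cpp.
  static bool baseOverlapsDest(MachineInstr &MemMI, Register BaseReg,
                               const TargetRegisterInfo *TRI) {
    // Paired instructions carry two data registers; check each one against
    // the base register whose update we want to fold in.
    unsigned NumDataRegs = AArch64InstrInfo::isPairedLdSt(MemMI) ? 2 : 1;
    for (unsigned I = 0; I != NumDataRegs; ++I) {
      Register DestReg = getLdStRegOp(MemMI, I).getReg();
      if (DestReg == BaseReg || TRI->isSubRegister(BaseReg, DestReg))
        return true;
    }
    return false;
  }
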
Index: llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
+++ llvm/lib/Target/AArch64/AArch64MachineScheduler.cpp
@@ -7,10 +7,54 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64MachineScheduler.h"
+#include "AArch64InstrInfo.h"
 #include "MCTargetDesc/AArch64MCTargetDesc.h"
 
 using namespace llvm;
 
+// Returns true if MI is a Q-register store with an immediate offset.
+static bool needReorderStoreMI(const MachineInstr *MI) {
+  if (!MI)
+    return false;
+
+  switch (MI->getOpcode()) {
+  default:
+    return false;
+  case AArch64::STPQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+    return getLdStOffsetOp(*MI).isImm();
+  }
+
+  return false;
+}
+
+// Returns true if two stores with the same base address may overlap.
+static bool mayOverlapWrite(const MachineInstr &MI0, const MachineInstr &MI1,
+                            int64_t &Off0, int64_t &Off1) {
+  const MachineOperand &Base0 = getLdStBaseOp(MI0);
+  const MachineOperand &Base1 = getLdStBaseOp(MI1);
+
+  // If the two stores do not share a base register, assume they may overlap.
+  if (!Base0.isIdenticalTo(Base1))
+    return true;
+
+  int StoreSize0 = AArch64InstrInfo::getMemScale(MI0);
+  int StoreSize1 = AArch64InstrInfo::getMemScale(MI1);
+  Off0 = AArch64InstrInfo::hasUnscaledLdStOffset(MI0.getOpcode())
+             ? getLdStOffsetOp(MI0).getImm()
+             : getLdStOffsetOp(MI0).getImm() * StoreSize0;
+  Off1 = AArch64InstrInfo::hasUnscaledLdStOffset(MI1.getOpcode())
+             ? getLdStOffsetOp(MI1).getImm()
+             : getLdStOffsetOp(MI1).getImm() * StoreSize1;
+
+  const MachineInstr &MI = (Off0 < Off1) ? MI0 : MI1;
+  int Multiples = AArch64InstrInfo::isPairedLdSt(MI) ? 2 : 1;
+  int StoreSize = AArch64InstrInfo::getMemScale(MI) * Multiples;
+
+  return llabs(Off0 - Off1) < StoreSize;
+}
+
 bool AArch64PostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
                                               SchedCandidate &TryCand) {
   bool OriginalResult = PostGenericScheduler::tryCandidate(Cand, TryCand);
@@ -18,20 +62,16 @@
   if (Cand.isValid()) {
     MachineInstr *Instr0 = TryCand.SU->getInstr();
     MachineInstr *Instr1 = Cand.SU->getInstr();
-    // When dealing with two STPqi's.
-    if (Instr0 && Instr1 && Instr0->getOpcode() == Instr1->getOpcode () &&
-        Instr0->getOpcode() == AArch64::STPQi)
-    {
-      MachineOperand &Base0 = Instr0->getOperand(2);
-      MachineOperand &Base1 = Instr1->getOperand(2);
-      int64_t Off0 = Instr0->getOperand(3).getImm();
-      int64_t Off1 = Instr1->getOperand(3).getImm();
-      // With the same base address and non-overlapping writes.
-      if (Base0.isIdenticalTo(Base1) && llabs (Off0 - Off1) >= 2) {
-        TryCand.Reason = NodeOrder;
-        // Order them by ascending offsets.
-        return Off0 < Off1;
-      }
+
+    if (!needReorderStoreMI(Instr0) || !needReorderStoreMI(Instr1))
+      return OriginalResult;
+
+    int64_t Off0, Off1;
+    // With the same base address and non-overlapping writes.
+    if (!mayOverlapWrite(*Instr0, *Instr1, Off0, Off1)) {
+      TryCand.Reason = NodeOrder;
+      // Order them by ascending offsets.
+      return Off0 < Off1;
     }
   }
 
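[Review note, not part of the patch] mayOverlapWrite reduces to an interval test once both offsets are in bytes: only the store at the lower address can run into the other, so its width (32 bytes for a Q-register pair, 16 for a single Q store) bounds the allowed distance. A self-contained sketch; writesMayOverlap is a hypothetical name:

  #include <cstdint>
  #include <cstdlib>

  // Off0/Off1 are byte offsets from the common base register; Size0/Size1
  // are the byte widths of the two stores.
  static bool writesMayOverlap(int64_t Off0, int64_t Size0, int64_t Off1,
                               int64_t Size1) {
    int64_t LowerSize = Off0 < Off1 ? Size0 : Size1;
    return std::llabs(Off0 - Off1) < LowerSize;
  }

  // Worked example: "stp q0, q1, [x0, #32]" writes [32, 64) and
  // "str q2, [x0, #64]" writes [64, 80); std::llabs(32 - 64) == 32 is not
  // less than 32, so the writes are disjoint and tryCandidate may order
  // the two stores by ascending offset.

This generalizes the removed check, which only compared two STPQi whose scaled offsets differed by at least 2 (that is, by 32 bytes), to any mix of STPQi, STRQui and STURQi stores.
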
Index: llvm/test/CodeGen/AArch64/aarch64-mops.ll
===================================================================
--- llvm/test/CodeGen/AArch64/aarch64-mops.ll
+++ llvm/test/CodeGen/AArch64/aarch64-mops.ll
@@ -1477,8 +1477,8 @@
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    add x8, x0, #284
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ldr q3, [x1, #240]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str q0, [x8]
-; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q1, [x0, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    str q3, [x0, #240]
+; SDAG-WITHOUT-MOPS-O2-NEXT:    stp q2, q1, [x0, #256]
 ; SDAG-WITHOUT-MOPS-O2-NEXT:    ret
 ;
 ; SDAG-MOPS-O2-LABEL: memcpy_inline_300:
Index: llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
===================================================================
--- llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
+++ llvm/test/CodeGen/AArch64/argument-blocks-array-of-struct.ll
@@ -232,9 +232,9 @@
 ; CHECK-LABEL: array_of_struct_in_memory:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    stp q0, q0, [x8, #16]
 ; CHECK-NEXT:    stp q0, q0, [x8, #48]
-; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   ret [ 5 x %T_STRUCT_SAMEM ] zeroinitializer
 }
@@ -350,9 +350,9 @@
 ; CHECK-LABEL: array_of_struct_nested_same_field_types_2:
 ; CHECK:       // %bb.0:
 ; CHECK-NEXT:    movi v0.2d, #0000000000000000
+; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    stp q0, q0, [x8, #16]
 ; CHECK-NEXT:    stp q0, q0, [x8, #48]
-; CHECK-NEXT:    str q0, [x8]
 ; CHECK-NEXT:    ret
   ret [ 2 x %T_NESTED_STRUCT_SAMEM ] zeroinitializer
 }
Index: llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -365,11 +365,11 @@
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
+; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp q0, q2, [sp, #48] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #16] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp, #32] // 16-byte Folded Reload
@@ -455,10 +455,10 @@
 ; CHECK-NEXT:    .cfi_offset w22, -32
 ; CHECK-NEXT:    .cfi_offset w30, -48
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
-; CHECK-NEXT:    stp q2, q3, [sp, #64] // 32-byte Folded Spill
-; CHECK-NEXT:    mov v2.16b, v1.16b
 ; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    str q0, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT:    stp q2, q3, [sp, #64] // 32-byte Folded Spill
+; CHECK-NEXT:    mov v2.16b, v1.16b
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
 ; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
Index: llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
===================================================================
--- llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -339,11 +339,11 @@
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    adrp x8, .LCPI16_0
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp q0, q2, [sp, #48] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI16_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #32] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
@@ -407,12 +407,12 @@
 ; CHECK-NEXT:    .cfi_offset w20, -16
 ; CHECK-NEXT:    .cfi_offset w30, -32
 ; CHECK-NEXT:    adrp x8, .LCPI17_0
+; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
 ; CHECK-NEXT:    stp q0, q2, [sp, #16] // 32-byte Folded Spill
 ; CHECK-NEXT:    mov v2.16b, v1.16b
-; CHECK-NEXT:    str q1, [sp] // 16-byte Folded Spill
-; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q3, [sp, #80] // 16-byte Folded Spill
 ; CHECK-NEXT:    ldr q1, [x8, :lo12:.LCPI17_0]
+; CHECK-NEXT:    mov v0.16b, v2.16b
 ; CHECK-NEXT:    str q1, [sp, #64] // 16-byte Folded Spill
 ; CHECK-NEXT:    bl __getf2
 ; CHECK-NEXT:    ldr q0, [sp] // 16-byte Folded Reload
Index: llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
===================================================================
--- llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
+++ llvm/test/CodeGen/AArch64/no-quad-ldp-stp.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=aarch64-eabi -mcpu=exynos-m3 -verify-machineinstrs -asm-verbose=false | FileCheck %s --check-prefixes=CHECK,FAST
 
 ; CHECK-LABEL: test_nopair_st
-; SLOW: str
 ; SLOW: stur
+; SLOW: str
 ; SLOW-NOT: stp
 ; FAST: stp
 define void @test_nopair_st(double* %ptr, <2 x double> %v1, <2 x double> %v2) {