Index: lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
===================================================================
--- lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -16,6 +16,7 @@
 #include "AArch64Subtarget.h"
 #include "MCTargetDesc/AArch64AddressingModes.h"
 #include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/CodeGen/MachineBasicBlock.h"
 #include "llvm/CodeGen/MachineFunctionPass.h"
@@ -483,6 +484,30 @@
   return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
 }
 
+static bool mayAlias(MachineInstr *MIa, MachineInstr *MIb,
+                     const AArch64InstrInfo *TII) {
+  // One of the instructions must modify memory.
+  if (!MIa->mayStore() && !MIb->mayStore())
+    return false;
+
+  // Both instructions must be memory operations.
+  if (!((MIa->mayLoad() || MIa->mayStore()) &&
+        (MIb->mayLoad() || MIb->mayStore())))
+    return false;
+
+  return !TII->areMemAccessesTriviallyDisjoint(MIa, MIb);
+}
+
+static bool mayAlias(MachineInstr *MIa,
+                     SmallVectorImpl<MachineInstr *> &MemInsns,
+                     const AArch64InstrInfo *TII) {
+  for (auto &MIb : MemInsns)
+    if (mayAlias(MIa, MIb, TII))
+      return true;
+
+  return false;
+}
+
 /// findMatchingInsn - Scan the instructions looking for a load/store that can
 /// be combined with the current instruction into a load/store pair.
 MachineBasicBlock::iterator
@@ -518,6 +543,10 @@
   BitVector ModifiedRegs, UsedRegs;
   ModifiedRegs.resize(TRI->getNumRegs());
   UsedRegs.resize(TRI->getNumRegs());
+
+  // Remember any instructions that read/write memory between FirstMI and MI.
+  SmallVector<MachineInstr *, 4> MemInsns;
+
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr *MI = MBBI;
     // Skip DBG_VALUE instructions. Otherwise debug info can affect the
@@ -566,6 +595,8 @@
       bool MIIsUnscaled = isUnscaledLdst(MI->getOpcode());
       if (!inBoundsForPair(MIIsUnscaled, MinOffset, OffsetStride)) {
         trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+        if (MI->mayLoad() || MI->mayStore())
+          MemInsns.push_back(MI);
         continue;
       }
       // If the alignment requirements of the paired (scaled) instruction
@@ -574,6 +605,8 @@
       if (IsUnscaled && EnableAArch64UnscaledMemOp &&
          (alignTo(MinOffset, OffsetStride) != MinOffset)) {
         trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+        if (MI->mayLoad() || MI->mayStore())
+          MemInsns.push_back(MI);
         continue;
       }
       // If the destination register of the loads is the same register, bail
@@ -581,22 +614,29 @@
      // registers the same is UNPREDICTABLE and will result in an exception.
       if (MayLoad && Reg == MI->getOperand(0).getReg()) {
         trackRegDefsUses(MI, ModifiedRegs, UsedRegs, TRI);
+        if (MI->mayLoad() || MI->mayStore())
+          MemInsns.push_back(MI);
         continue;
       }
 
       // If the Rt of the second instruction was not modified or used between
-      // the two instructions, we can combine the second into the first.
+      // the two instructions and none of the instructions between the second
+      // and first alias with the second, we can combine the second into the
+      // first.
       if (!ModifiedRegs[MI->getOperand(0).getReg()] &&
-          !UsedRegs[MI->getOperand(0).getReg()]) {
+          !UsedRegs[MI->getOperand(0).getReg()] &&
+          !mayAlias(MI, MemInsns, TII)) {
         MergeForward = false;
         return MBBI;
       }
 
       // Likewise, if the Rt of the first instruction is not modified or used
-      // between the two instructions, we can combine the first into the
-      // second.
+      // between the two instructions and none of the instructions between the
+      // first and the second alias with the first, we can combine the first
+      // into the second.
       if (!ModifiedRegs[FirstMI->getOperand(0).getReg()] &&
-          !UsedRegs[FirstMI->getOperand(0).getReg()]) {
+          !UsedRegs[FirstMI->getOperand(0).getReg()] &&
+          !mayAlias(FirstMI, MemInsns, TII)) {
         MergeForward = true;
         return MBBI;
       }
@@ -605,21 +645,9 @@
       }
     }
 
-    // If the instruction wasn't a matching load or store, but does (or can)
-    // modify memory, stop searching, as we don't have alias analysis or
-    // anything like that to tell us whether the access is tromping on the
-    // locations we care about. The big one we want to catch is calls.
-    //
-    // FIXME: Theoretically, we can do better than that for SP and FP based
-    // references since we can effectively know where those are touching. It's
-    // unclear if it's worth the extra code, though. Most paired instructions
-    // will be sequential, perhaps with a few intervening non-memory related
-    // instructions.
-    if (MI->mayStore() || MI->isCall())
-      return E;
-    // Likewise, if we're matching a store instruction, we don't want to
-    // move across a load, as it may be reading the same location.
-    if (FirstMI->mayStore() && MI->mayLoad())
+    // The instruction wasn't a matching load or store. Stop searching if we
+    // encounter a call instruction that might modify memory.
+    if (MI->isCall())
       return E;
 
     // Update modified / uses register lists.
@@ -629,6 +657,10 @@
     // return early.
     if (ModifiedRegs[BaseReg])
      return E;
+
+    // Update list of instructions that read/write memory.
+    if (MI->mayLoad() || MI->mayStore())
+      MemInsns.push_back(MI);
   }
   return E;
 }
Index: test/CodeGen/AArch64/arm64-ldp-aa.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-ldp-aa.ll
@@ -0,0 +1,60 @@
+; RUN: llc < %s -march=arm64 -enable-misched=false -verify-machineinstrs | FileCheck %s
+
+; The next set of tests makes sure we can combine the second instruction into
+; the first.
+
+; CHECK-LABEL: ldp_int_aa
+; CHECK: ldp w8, w9, [x1]
+; CHECK: str w0, [x1, #8]
+; CHECK: ret
+define i32 @ldp_int_aa(i32 %a, i32* %p) nounwind {
+  %tmp = load i32, i32* %p, align 4
+  %str.ptr = getelementptr inbounds i32, i32* %p, i64 2
+  store i32 %a, i32* %str.ptr, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp1 = load i32, i32* %add.ptr, align 4
+  %add = add nsw i32 %tmp1, %tmp
+  ret i32 %add
+}
+
+; CHECK-LABEL: ldp_long_aa
+; CHECK: ldp x8, x9, [x1]
+; CHECK: str x0, [x1, #16]
+; CHECK: ret
+define i64 @ldp_long_aa(i64 %a, i64* %p) nounwind {
+  %tmp = load i64, i64* %p, align 8
+  %str.ptr = getelementptr inbounds i64, i64* %p, i64 2
+  store i64 %a, i64* %str.ptr, align 4
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 1
+  %tmp1 = load i64, i64* %add.ptr, align 8
+  %add = add nsw i64 %tmp1, %tmp
+  ret i64 %add
+}
+
+; CHECK-LABEL: ldp_float_aa
+; CHECK: str s0, [x0, #8]
+; CHECK: ldp s1, s0, [x0]
+; CHECK: ret
+define float @ldp_float_aa(float %a, float* %p) nounwind {
+  %tmp = load float, float* %p, align 4
+  %str.ptr = getelementptr inbounds float, float* %p, i64 2
+  store float %a, float* %str.ptr, align 4
+  %add.ptr = getelementptr inbounds float, float* %p, i64 1
+  %tmp1 = load float, float* %add.ptr, align 4
+  %add = fadd float %tmp, %tmp1
+  ret float %add
+}
+
+; CHECK-LABEL: ldp_double_aa
+; CHECK: str d0, [x0, #16]
+; CHECK: ldp d1, d0, [x0]
+; CHECK: ret
+define double @ldp_double_aa(double %a, double* %p) nounwind {
+  %tmp = load double, double* %p, align 8
+  %str.ptr = getelementptr inbounds double, double* %p, i64 2
+  store double %a, double* %str.ptr, align 4
+  %add.ptr = getelementptr inbounds double, double* %p, i64 1
+  %tmp1 = load double, double* %add.ptr, align 8
+  %add = fadd double %tmp, %tmp1
+  ret double %add
+}
Index: test/CodeGen/AArch64/arm64-stp-aa.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AArch64/arm64-stp-aa.ll
@@ -0,0 +1,111 @@
+; RUN: llc < %s -march=arm64 -enable-misched=false -aarch64-stp-suppress=false -verify-machineinstrs | FileCheck %s
+
+; The next set of tests makes sure we can combine the second instruction into
+; the first.
+
+; CHECK-LABEL: stp_int_aa
+; CHECK: stp w0, w1, [x2]
+; CHECK: ldr w0, [x2, #8]
+; CHECK: ret
+define i32 @stp_int_aa(i32 %a, i32 %b, i32* nocapture %p) nounwind {
+  store i32 %a, i32* %p, align 4
+  %ld.ptr = getelementptr inbounds i32, i32* %p, i64 2
+  %tmp = load i32, i32* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %b, i32* %add.ptr, align 4
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: stp_long_aa
+; CHECK: stp x0, x1, [x2]
+; CHECK: ldr x0, [x2, #16]
+; CHECK: ret
+define i64 @stp_long_aa(i64 %a, i64 %b, i64* nocapture %p) nounwind {
+  store i64 %a, i64* %p, align 8
+  %ld.ptr = getelementptr inbounds i64, i64* %p, i64 2
+  %tmp = load i64, i64* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 1
+  store i64 %b, i64* %add.ptr, align 8
+  ret i64 %tmp
+}
+
+; CHECK-LABEL: stp_float_aa
+; CHECK: stp s0, s1, [x0]
+; CHECK: ldr s0, [x0, #8]
+; CHECK: ret
+define float @stp_float_aa(float %a, float %b, float* nocapture %p) nounwind {
+  store float %a, float* %p, align 4
+  %ld.ptr = getelementptr inbounds float, float* %p, i64 2
+  %tmp = load float, float* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds float, float* %p, i64 1
+  store float %b, float* %add.ptr, align 4
+  ret float %tmp
+}
+
+; CHECK-LABEL: stp_double_aa
+; CHECK: stp d0, d1, [x0]
+; CHECK: ldr d0, [x0, #16]
+; CHECK: ret
+define double @stp_double_aa(double %a, double %b, double* nocapture %p) nounwind {
+  store double %a, double* %p, align 8
+  %ld.ptr = getelementptr inbounds double, double* %p, i64 2
+  %tmp = load double, double* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds double, double* %p, i64 1
+  store double %b, double* %add.ptr, align 8
+  ret double %tmp
+}
+
+; The next set of tests makes sure we can combine the first instruction into
+; the second.
+
+; CHECK-LABEL: stp_int_aa_after
+; CHECK: ldr w0, [x3, #4]
+; CHECK: stp w1, w2, [x3]
+; CHECK: ret
+define i32 @stp_int_aa_after(i32 %w0, i32 %a, i32 %b, i32* nocapture %p) nounwind {
+  store i32 %a, i32* %p, align 4
+  %ld.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  %tmp = load i32, i32* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds i32, i32* %p, i64 1
+  store i32 %b, i32* %add.ptr, align 4
+  ret i32 %tmp
+}
+
+; CHECK-LABEL: stp_long_aa_after
+; CHECK: ldr x0, [x3, #8]
+; CHECK: stp x1, x2, [x3]
+; CHECK: ret
+define i64 @stp_long_aa_after(i64 %x0, i64 %a, i64 %b, i64* nocapture %p) nounwind {
+  store i64 %a, i64* %p, align 8
+  %ld.ptr = getelementptr inbounds i64, i64* %p, i64 1
+  %tmp = load i64, i64* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds i64, i64* %p, i64 1
+  store i64 %b, i64* %add.ptr, align 8
+  ret i64 %tmp
+}
+
+; CHECK-LABEL: stp_float_aa_after
+; CHECK: ldr s0, [x0, #4]
+; CHECK: stp s1, s2, [x0]
+; CHECK: ret
+define float @stp_float_aa_after(float %s0, float %a, float %b, float* nocapture %p) nounwind {
+  store float %a, float* %p, align 4
+  %ld.ptr = getelementptr inbounds float, float* %p, i64 1
+  %tmp = load float, float* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds float, float* %p, i64 1
+  store float %b, float* %add.ptr, align 4
+  ret float %tmp
+}
+
+; CHECK-LABEL: stp_double_aa_after
+; CHECK: ldr d0, [x0, #8]
+; CHECK: stp d1, d2, [x0]
+; CHECK: ret
+define double @stp_double_aa_after(double %d0, double %a, double %b, double* nocapture %p) nounwind {
+  store double %a, double* %p, align 8
+  %ld.ptr = getelementptr inbounds double, double* %p, i64 1
+  %tmp = load double, double* %ld.ptr, align 4
+  %add.ptr = getelementptr inbounds double, double* %p, i64 1
+  store double %b, double* %add.ptr, align 8
+  ret double %tmp
+}