diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1498,7 +1498,7 @@
                  : BaseOp->getIndex() < RHS.BaseOp->getIndex();
 
       if (Offset != RHS.Offset)
-        return StackGrowsDown ? Offset > RHS.Offset : Offset < RHS.Offset;
+        return Offset < RHS.Offset;
 
       return SU->NodeNum < RHS.SU->NodeNum;
     }
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -89,6 +89,12 @@
   /// if there is a corresponding unscaled variant available.
   static Optional<unsigned> getUnscaledLdSt(unsigned Opc);
 
+  /// Scaling factor for (scaled or unscaled) load or store.
+  static int getMemScale(unsigned Opc);
+  static int getMemScale(const MachineInstr &MI) {
+    return getMemScale(MI.getOpcode());
+  }
+
   /// Returns the index for the immediate for a given instruction.
   static unsigned getLoadStoreImmIdx(unsigned Opc);
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -2228,54 +2228,82 @@
   return true;
 }
 
-static unsigned getOffsetStride(unsigned Opc) {
+// Scaling factor for (scaled or unscaled) load or store.
+int AArch64InstrInfo::getMemScale(unsigned Opc) {
   switch (Opc) {
   default:
-    return 0;
-  case AArch64::LDURQi:
-  case AArch64::STURQi:
-    return 16;
-  case AArch64::LDURXi:
-  case AArch64::LDURDi:
-  case AArch64::STURXi:
-  case AArch64::STURDi:
-    return 8;
-  case AArch64::LDURWi:
+    llvm_unreachable("Opcode has unknown scale!");
+  case AArch64::LDRBBui:
+  case AArch64::LDURBBi:
+  case AArch64::LDRSBWui:
+  case AArch64::LDURSBWi:
+  case AArch64::STRBBui:
+  case AArch64::STURBBi:
+    return 1;
+  case AArch64::LDRHHui:
+  case AArch64::LDURHHi:
+  case AArch64::LDRSHWui:
+  case AArch64::LDURSHWi:
+  case AArch64::STRHHui:
+  case AArch64::STURHHi:
+    return 2;
+  case AArch64::LDRSui:
   case AArch64::LDURSi:
+  case AArch64::LDRSWui:
   case AArch64::LDURSWi:
-  case AArch64::STURWi:
+  case AArch64::LDRWui:
+  case AArch64::LDURWi:
+  case AArch64::STRSui:
   case AArch64::STURSi:
+  case AArch64::STRWui:
+  case AArch64::STURWi:
+  case AArch64::LDPSi:
+  case AArch64::LDPSWi:
+  case AArch64::LDPWi:
+  case AArch64::STPSi:
+  case AArch64::STPWi:
     return 4;
+  case AArch64::LDRDui:
+  case AArch64::LDURDi:
+  case AArch64::LDRXui:
+  case AArch64::LDURXi:
+  case AArch64::STRDui:
+  case AArch64::STURDi:
+  case AArch64::STRXui:
+  case AArch64::STURXi:
+  case AArch64::LDPDi:
+  case AArch64::LDPXi:
+  case AArch64::STPDi:
+  case AArch64::STPXi:
+    return 8;
+  case AArch64::LDRQui:
+  case AArch64::LDURQi:
+  case AArch64::STRQui:
+  case AArch64::STURQi:
+  case AArch64::LDPQi:
+  case AArch64::STPQi:
+  case AArch64::STGOffset:
+  case AArch64::STZGOffset:
+  case AArch64::ST2GOffset:
+  case AArch64::STZ2GOffset:
+  case AArch64::STGPi:
+    return 16;
   }
 }
 
 // Scale the unscaled offsets. Returns false if the unscaled offset can't be
 // scaled.
 static bool scaleOffset(unsigned Opc, int64_t &Offset) {
-  unsigned OffsetStride = getOffsetStride(Opc);
-  if (OffsetStride == 0)
-    return false;
+  int Scale = AArch64InstrInfo::getMemScale(Opc);
+
   // If the byte-offset isn't a multiple of the stride, we can't scale this
   // offset.
-  if (Offset % OffsetStride != 0)
+  if (Offset % Scale != 0)
     return false;
 
   // Convert the byte-offset used by unscaled into an "element" offset used
   // by the scaled pair load/store instructions.
-  Offset /= OffsetStride;
-  return true;
-}
-
-// Unscale the scaled offsets. Returns false if the scaled offset can't be
-// unscaled.
-static bool unscaleOffset(unsigned Opc, int64_t &Offset) {
-  unsigned OffsetStride = getOffsetStride(Opc);
-  if (OffsetStride == 0)
-    return false;
-
-  // Convert the "element" offset used by scaled pair load/store instructions
-  // into the byte-offset used by unscaled.
-  Offset *= OffsetStride;
+  Offset /= Scale;
   return true;
 }
 
@@ -2306,15 +2334,17 @@
   int64_t ObjectOffset1 = MFI.getObjectOffset(FI1);
   int64_t ObjectOffset2 = MFI.getObjectOffset(FI2);
   assert(ObjectOffset1 <= ObjectOffset2 && "Object offsets are not ordered.");
-  // Get the byte-offset from the object offset.
-  if (!unscaleOffset(Opcode1, Offset1) || !unscaleOffset(Opcode2, Offset2))
+  // Convert to scaled object offsets.
+  int Scale1 = AArch64InstrInfo::getMemScale(Opcode1);
+  if (ObjectOffset1 % Scale1 != 0)
     return false;
+  ObjectOffset1 /= Scale1;
+  int Scale2 = AArch64InstrInfo::getMemScale(Opcode2);
+  if (ObjectOffset2 % Scale2 != 0)
+    return false;
+  ObjectOffset2 /= Scale2;
   ObjectOffset1 += Offset1;
   ObjectOffset2 += Offset2;
-  // Get the "element" index in the object.
-  if (!scaleOffset(Opcode1, ObjectOffset1) ||
-      !scaleOffset(Opcode2, ObjectOffset2))
-    return false;
   return ObjectOffset1 + 1 == ObjectOffset2;
 }
 
@@ -2374,7 +2404,7 @@
   // The caller should already have ordered First/SecondLdSt by offset.
   // Note: except for non-equal frame index bases
   if (BaseOp1.isFI()) {
-    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 >= Offset2) &&
+    assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
            "Caller should have ordered offsets.");
 
     const MachineFrameInfo &MFI =
@@ -2383,8 +2413,7 @@
                             BaseOp2.getIndex(), Offset2, SecondOpc);
   }
 
-  assert((!BaseOp1.isIdenticalTo(BaseOp2) || Offset1 <= Offset2) &&
-         "Caller should have ordered offsets.");
+  assert(Offset1 <= Offset2 && "Caller should have ordered offsets.");
 
   return Offset1 + 1 == Offset2;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -215,69 +215,6 @@
   }
 }
 
-// Scaling factor for unscaled load or store.
-static int getMemScale(const MachineInstr &MI) {
-  switch (MI.getOpcode()) {
-  default:
-    llvm_unreachable("Opcode has unknown scale!");
-  case AArch64::LDRBBui:
-  case AArch64::LDURBBi:
-  case AArch64::LDRSBWui:
-  case AArch64::LDURSBWi:
-  case AArch64::STRBBui:
-  case AArch64::STURBBi:
-    return 1;
-  case AArch64::LDRHHui:
-  case AArch64::LDURHHi:
-  case AArch64::LDRSHWui:
-  case AArch64::LDURSHWi:
-  case AArch64::STRHHui:
-  case AArch64::STURHHi:
-    return 2;
-  case AArch64::LDRSui:
-  case AArch64::LDURSi:
-  case AArch64::LDRSWui:
-  case AArch64::LDURSWi:
-  case AArch64::LDRWui:
-  case AArch64::LDURWi:
-  case AArch64::STRSui:
-  case AArch64::STURSi:
-  case AArch64::STRWui:
-  case AArch64::STURWi:
-  case AArch64::LDPSi:
-  case AArch64::LDPSWi:
-  case AArch64::LDPWi:
-  case AArch64::STPSi:
-  case AArch64::STPWi:
-    return 4;
-  case AArch64::LDRDui:
-  case AArch64::LDURDi:
-  case AArch64::LDRXui:
-  case AArch64::LDURXi:
-  case AArch64::STRDui:
-  case AArch64::STURDi:
-  case AArch64::STRXui:
-  case AArch64::STURXi:
-  case AArch64::LDPDi:
-  case AArch64::LDPXi:
-  case AArch64::STPDi:
-  case AArch64::STPXi:
-    return 8;
-  case AArch64::LDRQui:
-  case AArch64::LDURQi:
-  case AArch64::STRQui:
-  case AArch64::STURQi:
-  case AArch64::LDPQi:
-  case AArch64::STPQi:
-  case AArch64::STGOffset:
-  case AArch64::STZGOffset:
-  case AArch64::ST2GOffset:
-  case AArch64::STZ2GOffset:
-  case AArch64::STGPi:
-    return 16;
-  }
-}
-
 static unsigned getMatchingNonSExtOpcode(unsigned Opc,
                                          bool *IsValidLdStrOpc = nullptr) {
   if (IsValidLdStrOpc)
@@ -588,7 +525,7 @@
   // ST*G and all paired ldst have the same scale in pre/post-indexed variants
   // as in the "unsigned offset" variant.
   // All other pre/post indexed ldst instructions are unscaled.
-  Scale = (IsTagStore || IsPaired) ? getMemScale(MI) : 1;
+  Scale = (IsTagStore || IsPaired) ? AArch64InstrInfo::getMemScale(MI) : 1;
 
   if (IsPaired) {
     MinOffset = -64;
@@ -620,8 +557,8 @@
                                   MachineInstr &StoreInst,
                                   const AArch64InstrInfo *TII) {
   assert(isMatchingStore(LoadInst, StoreInst) && "Expect only matched ld/st.");
-  int LoadSize = getMemScale(LoadInst);
-  int StoreSize = getMemScale(StoreInst);
+  int LoadSize = TII->getMemScale(LoadInst);
+  int StoreSize = TII->getMemScale(StoreInst);
   int UnscaledStOffset = TII->isUnscaledLdSt(StoreInst)
                              ? getLdStOffsetOp(StoreInst).getImm()
                              : getLdStOffsetOp(StoreInst).getImm() * StoreSize;
@@ -731,7 +668,7 @@
 
   unsigned Opc = I->getOpcode();
   bool IsScaled = !TII->isUnscaledLdSt(Opc);
-  int OffsetStride = IsScaled ? 1 : getMemScale(*I);
+  int OffsetStride = IsScaled ? 1 : TII->getMemScale(*I);
 
   bool MergeForward = Flags.getMergeForward();
   // Insert our new paired instruction after whichever of the paired
@@ -800,7 +737,7 @@
   unsigned Opc =
       SExtIdx == -1 ? I->getOpcode() : getMatchingNonSExtOpcode(I->getOpcode());
   bool IsUnscaled = TII->isUnscaledLdSt(Opc);
-  int OffsetStride = IsUnscaled ? getMemScale(*I) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(*I) : 1;
 
   bool MergeForward = Flags.getMergeForward();
   // Insert our new paired instruction after whichever of the paired
@@ -818,11 +755,11 @@
   // We're trying to pair instructions that differ in how they are scaled. If
   // I is scaled then scale the offset of Paired accordingly. Otherwise, do
   // the opposite (i.e., make Paired's offset unscaled).
-  int MemSize = getMemScale(*Paired);
+  int MemSize = TII->getMemScale(*Paired);
   if (PairedIsUnscaled) {
     // If the unscaled offset isn't a multiple of the MemSize, we can't
     // pair the operations together.
-    assert(!(PairedOffset % getMemScale(*Paired)) &&
+    assert(!(PairedOffset % TII->getMemScale(*Paired)) &&
            "Offset should be a multiple of the stride!");
     PairedOffset /= MemSize;
   } else {
@@ -847,9 +784,9 @@
   int OffsetImm = getLdStOffsetOp(*RtMI).getImm();
   // Scale the immediate offset, if necessary.
   if (TII->isUnscaledLdSt(RtMI->getOpcode())) {
-    assert(!(OffsetImm % getMemScale(*RtMI)) &&
+    assert(!(OffsetImm % TII->getMemScale(*RtMI)) &&
            "Unscaled offset cannot be scaled.");
-    OffsetImm /= getMemScale(*RtMI);
+    OffsetImm /= TII->getMemScale(*RtMI);
   }
 
   // Construct the new instruction.
@@ -944,8 +881,8 @@
   MachineBasicBlock::iterator NextI = LoadI;
   ++NextI;
 
-  int LoadSize = getMemScale(*LoadI);
-  int StoreSize = getMemScale(*StoreI);
+  int LoadSize = TII->getMemScale(*LoadI);
+  int StoreSize = TII->getMemScale(*StoreI);
   Register LdRt = getLdStRegOp(*LoadI).getReg();
   const MachineOperand &StMO = getLdStRegOp(*StoreI);
   Register StRt = getLdStRegOp(*StoreI).getReg();
@@ -1223,7 +1160,7 @@
   Register Reg = getLdStRegOp(FirstMI).getReg();
   Register BaseReg = getLdStBaseOp(FirstMI).getReg();
   int Offset = getLdStOffsetOp(FirstMI).getImm();
-  int OffsetStride = IsUnscaled ? getMemScale(FirstMI) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(FirstMI) : 1;
   bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
 
   // Track which register units have been modified and used between the first
@@ -1259,7 +1196,7 @@
       // We're trying to pair instructions that differ in how they are scaled.
       // If FirstMI is scaled then scale the offset of MI accordingly.
      // Otherwise, do the opposite (i.e., make MI's offset unscaled).
-      int MemSize = getMemScale(MI);
+      int MemSize = TII->getMemScale(MI);
       if (MIIsUnscaled) {
         // If the unscaled offset isn't a multiple of the MemSize, we can't
         // pair the operations together: bail and keep looking.
@@ -1492,7 +1429,7 @@
   MachineBasicBlock::iterator MBBI = I;
 
   Register BaseReg = getLdStBaseOp(MemMI).getReg();
-  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * getMemScale(MemMI);
+  int MIUnscaledOffset = getLdStOffsetOp(MemMI).getImm() * TII->getMemScale(MemMI);
 
   // Scan forward looking for post-index opportunities. Updating instructions
   // can't be formed if the memory instruction doesn't have the offset we're
@@ -1663,7 +1600,7 @@
   // with Offset-1)
   bool IsUnscaled = TII->isUnscaledLdSt(MI);
   int Offset = getLdStOffsetOp(MI).getImm();
-  int OffsetStride = IsUnscaled ? getMemScale(MI) : 1;
+  int OffsetStride = IsUnscaled ? TII->getMemScale(MI) : 1;
   // Allow one more for offset.
   if (Offset > 0)
     Offset -= OffsetStride;
@@ -1723,7 +1660,7 @@
   // The immediate in the load/store is scaled by the size of the memory
   // operation. The immediate in the add we're looking for,
   // however, is not, so adjust here.
-  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * getMemScale(MI);
+  int UnscaledOffset = getLdStOffsetOp(MI).getImm() * TII->getMemScale(MI);
 
   // Look forward to try to find a pre-index instruction. For example,
  // ldr x1, [x0, #64]
diff --git a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
--- a/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-memset-inline.ll
@@ -113,9 +113,9 @@
 define void @bzero_26_stack() {
 ; CHECK-LABEL: bzero_26_stack:
-; CHECK: stp xzr, xzr, [sp, #8]
-; CHECK-NEXT: str xzr, [sp]
+; CHECK: stp xzr, xzr, [sp]
 ; CHECK-NEXT: strh wzr, [sp, #24]
+; CHECK-NEXT: str xzr, [sp, #16]
 ; CHECK-NEXT: bl something
   %buf = alloca [26 x i8], align 1
   %cast = bitcast [26 x i8]* %buf to i8*
@@ -259,9 +259,9 @@
 define void @memset_16_stack() {
 ; CHECK-LABEL: memset_16_stack:
 ; CHECK: mov x8, #-6148914691236517206
+; CHECK-NEXT: str x8, [sp, #-32]!
 ; CHECK-NEXT: mov x0, sp
 ; CHECK-NEXT: stp x8, x30, [sp, #8]
-; CHECK-NEXT: str x8, [sp]
 ; CHECK-NEXT: bl something
   %buf = alloca [16 x i8], align 1
   %cast = bitcast [16 x i8]* %buf to i8*
diff --git a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
--- a/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
+++ b/llvm/test/CodeGen/AArch64/cluster-frame-index.mir
@@ -1,11 +1,10 @@
 #RUN: llc -mtriple=aarch64-- -mcpu=cyclone -run-pass machine-scheduler -o - %s | FileCheck %s
-...
 ---
 name: merge_stack
 # CHECK-LABEL: name: merge_stack
 tracksRegLiveness: true
 stack:
-  - { id: 0, size: 64, alignment: 8 }
+  - { id: 0, size: 16, alignment: 8 }
 body: |
   bb.0:
     liveins: $w0, $w1
@@ -25,3 +24,30 @@
     ; CHECK-NEXT: STRXui
     ; CHECK-NEXT: STRXui
     ; CHECK-NEXT: RET
+...
+---
+name: merge_fixedstack
+# CHECK-LABEL: name: merge_fixedstack
+tracksRegLiveness: true
+fixedStack:
+  - { id: 0, size: 16, alignment: 8, offset: -16 }
+body: |
+  bb.0:
+    liveins: $w0, $w1
+
+    %0:gpr32 = COPY $w0
+    %1:gpr32 = COPY $w1
+    undef %3.sub_32:gpr64 = ORRWrs $wzr, %0, 0
+    STRXui %3, %fixed-stack.0, 0 :: (store 8)
+    undef %5.sub_32:gpr64 = ORRWrs $wzr, %1, 0
+    STRXui %5, %fixed-stack.0, 1 :: (store 8)
+    RET_ReallyLR
+
+    ; CHECK: COPY
+    ; CHECK-NEXT: COPY
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: ORRWrs
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: STRXui
+    ; CHECK-NEXT: RET
+...
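The two MIR cases above are the direct regression tests for the shouldClusterFI change. As a rough standalone sketch of the adjacency check they exercise (illustrative only: the names and signature below are simplified stand-ins, not the actual AArch64InstrInfo code, which works from MachineFrameInfo and real opcodes):

#include <cassert>
#include <cstdint>

// Decide whether two accesses to the same stack object should be clustered.
// Scale is the access size in bytes (what AArch64InstrInfo::getMemScale
// returns), ObjectOffset is the byte offset of the stack object, and Offset
// is the instruction's scaled immediate.
static bool shouldClusterScaled(int64_t ObjectOffset1, int64_t Offset1, int Scale1,
                                int64_t ObjectOffset2, int64_t Offset2, int Scale2) {
  // Convert the objects' byte offsets into scaled ("element") units; bail if
  // an object isn't aligned to its access size.
  if (ObjectOffset1 % Scale1 != 0 || ObjectOffset2 % Scale2 != 0)
    return false;
  ObjectOffset1 = ObjectOffset1 / Scale1 + Offset1;
  ObjectOffset2 = ObjectOffset2 / Scale2 + Offset2;
  // Cluster only if the second access is exactly one element after the first.
  return ObjectOffset1 + 1 == ObjectOffset2;
}

int main() {
  // Mirrors merge_fixedstack: two 8-byte stores at scaled offsets 0 and 1
  // of a 16-byte object placed at byte offset -16.
  assert(shouldClusterScaled(-16, 0, 8, -16, 1, 8));
  // Elements 0 and 2 are not adjacent, so they should not be clustered.
  assert(!shouldClusterScaled(-16, 0, 8, -16, 2, 8));
  return 0;
}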
diff --git a/llvm/test/CodeGen/AArch64/expand-select.ll b/llvm/test/CodeGen/AArch64/expand-select.ll
--- a/llvm/test/CodeGen/AArch64/expand-select.ll
+++ b/llvm/test/CodeGen/AArch64/expand-select.ll
@@ -4,20 +4,20 @@
 define void @foo(i32 %In1, <2 x i128> %In2, <2 x i128> %In3, <2 x i128> *%Out) {
 ; CHECK-LABEL: foo:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w9, w0, #0x1
+; CHECK-NEXT: and w8, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: ldp x10, x8, [sp, #8]
-; CHECK-NEXT: fmov s1, w9
-; CHECK-NEXT: ldr x9, [sp]
+; CHECK-NEXT: ldp x10, x9, [sp, #8]
+; CHECK-NEXT: fmov s1, w8
+; CHECK-NEXT: ldr x8, [sp]
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: fmov w11, s0
 ; CHECK-NEXT: tst w11, #0x1
 ; CHECK-NEXT: csel x11, x2, x6, ne
 ; CHECK-NEXT: csel x12, x3, x7, ne
-; CHECK-NEXT: csel x9, x4, x9, ne
+; CHECK-NEXT: csel x8, x4, x8, ne
 ; CHECK-NEXT: csel x10, x5, x10, ne
-; CHECK-NEXT: stp x9, x10, [x8, #16]
-; CHECK-NEXT: stp x11, x12, [x8]
+; CHECK-NEXT: stp x8, x10, [x9, #16]
+; CHECK-NEXT: stp x11, x12, [x9]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
   %cbool = icmp eq i32 %cond, 0
@@ -31,25 +31,25 @@
 define void @bar(i32 %In1, <2 x i96> %In2, <2 x i96> %In3, <2 x i96> *%Out) {
 ; CHECK-LABEL: bar:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: and w10, w0, #0x1
+; CHECK-NEXT: and w9, w0, #0x1
 ; CHECK-NEXT: fmov s0, wzr
-; CHECK-NEXT: fmov s1, w10
+; CHECK-NEXT: fmov s1, w9
 ; CHECK-NEXT: cmeq v0.4s, v1.4s, v0.4s
 ; CHECK-NEXT: ldp x11, x8, [sp, #8]
-; CHECK-NEXT: ldr x9, [sp]
+; CHECK-NEXT: ldr x10, [sp]
 ; CHECK-NEXT: dup v1.4s, v0.s[0]
-; CHECK-NEXT: mov x10, v1.d[1]
-; CHECK-NEXT: lsr x10, x10, #32
-; CHECK-NEXT: tst w10, #0x1
-; CHECK-NEXT: fmov w10, s0
+; CHECK-NEXT: mov x9, v1.d[1]
+; CHECK-NEXT: lsr x9, x9, #32
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: fmov w9, s0
 ; CHECK-NEXT: csel x11, x5, x11, ne
-; CHECK-NEXT: csel x9, x4, x9, ne
-; CHECK-NEXT: tst w10, #0x1
-; CHECK-NEXT: csel x10, x3, x7, ne
+; CHECK-NEXT: csel x10, x4, x10, ne
+; CHECK-NEXT: tst w9, #0x1
+; CHECK-NEXT: csel x9, x3, x7, ne
 ; CHECK-NEXT: csel x12, x2, x6, ne
-; CHECK-NEXT: stur x9, [x8, #12]
+; CHECK-NEXT: stur x10, [x8, #12]
 ; CHECK-NEXT: str x12, [x8]
-; CHECK-NEXT: str w10, [x8, #8]
+; CHECK-NEXT: str w9, [x8, #8]
 ; CHECK-NEXT: str w11, [x8, #20]
 ; CHECK-NEXT: ret
   %cond = and i32 %In1, 1
diff --git a/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll b/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
--- a/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
+++ b/llvm/test/CodeGen/AArch64/tailcall_misched_graph.ll
@@ -39,8 +39,8 @@
 ; Make sure that there is an dependence edge between fi#-2 and fi#-4.
 ; Without this edge the scheduler would be free to move the store accross the load.
 
-; COMMON: SU({{.*}}): [[VRB]]:gpr64 = LDRXui %fixed-stack.2
-; COMMON-NOT: SU
+; COMMON: {{^SU(.*)}}: [[VRB]]:gpr64 = LDRXui %fixed-stack.2
+; COMMON-NOT: {{^SU(.*)}}:
 ; COMMON: Successors:
 ; COMMON: SU([[DEPSTOREB:.*]]): Ord Latency=0
 ; COMMON: SU([[DEPSTOREA:.*]]): Ord Latency=0
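For context on the first hunk: the MachineScheduler comparator used to invert the offset order on stack-grows-down targets, while shouldClusterMemOps/shouldClusterFI assume Offset1 <= Offset2 (see the tightened asserts above). Below is a minimal standalone sketch of the ordering after the change; the struct and function names are stand-ins for the scheduler's MemOpInfo, not the actual LLVM code, and only the offset comparison is modeled (in the patch the frame-index comparison above it still depends on stack growth direction).

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Stand-in for the scheduler's per-memory-operation record.
struct MemOp {
  int BaseIndex;    // frame index (or register) of the base operand
  int64_t Offset;   // immediate offset, already scaled
  unsigned NodeNum; // SUnit number, used as the final tie-breaker
};

// After the patch the offset comparison is unconditional: same-base memory
// ops are always sorted by ascending offset, which is the precondition the
// target hook's "Offset1 + 1 == Offset2" adjacency test relies on.
static bool memOpLess(const MemOp &L, const MemOp &R) {
  if (L.BaseIndex != R.BaseIndex)
    return L.BaseIndex < R.BaseIndex;
  if (L.Offset != R.Offset)
    return L.Offset < R.Offset;
  return L.NodeNum < R.NodeNum;
}

int main() {
  std::vector<MemOp> Ops = {{0, 1, 4}, {0, 0, 7}, {1, -2, 2}};
  std::sort(Ops.begin(), Ops.end(), memOpLess);
  // Same-base ops come out in ascending offset order.
  assert(Ops[0].BaseIndex == 0 && Ops[0].Offset == 0);
  assert(Ops[1].BaseIndex == 0 && Ops[1].Offset == 1);
  assert(Ops[2].BaseIndex == 1 && Ops[2].Offset == -2);
  return 0;
}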