diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -169,9 +170,9 @@
   findMatchingUpdateInsnBackward(MachineBasicBlock::iterator I, unsigned Limit);
 
   // Find an instruction that updates the base register of the ld/st
-  // instruction.
-  bool isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
-                            unsigned BaseReg, int Offset);
+  // instruction. Return the update Offset if found, or 0 otherwise.
+  int isMatchingUpdateInsn(MachineInstr &MemMI, MachineInstr &MI,
+                           unsigned BaseReg, int Offset);
 
   // Merge a pre- or post-index base register update into a ld/st instruction.
   MachineBasicBlock::iterator
@@ -1698,9 +1699,9 @@
   return NextI;
 }
 
-bool AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
-                                               MachineInstr &MI,
-                                               unsigned BaseReg, int Offset) {
+int AArch64LoadStoreOpt::isMatchingUpdateInsn(MachineInstr &MemMI,
+                                              MachineInstr &MI,
+                                              unsigned BaseReg, int Offset) {
   switch (MI.getOpcode()) {
   default:
     break;
@@ -1739,10 +1740,10 @@
     // If we have a non-zero Offset, we check that it matches the amount
     // we're adding to the register.
     if (!Offset || Offset == UpdateOffset)
-      return true;
+      return UpdateOffset;
     break;
   }
-  return false;
+  return 0;
 }
 
 MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
@@ -1780,6 +1781,23 @@
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
   ++MBBI;
+
+  // We can't post-increment the stack pointer if any instruction between
+  // the memory access (I) and the increment (MBBI) can access the memory
+  // region defined by [SP, MBBI]; doing so would pop the stack prematurely.
+  const bool BaseRegSP = BaseReg == AArch64::SP;
+
+  if (BaseRegSP) {
+    // FIXME: For now, we always block the optimization over SP on Windows
+    // targets, as it would require adjusting the unwind/debug info; getting
+    // the unwind info wrong can actually cause a miscompile.
+    const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+    if (MAI->usesWindowsCFI() &&
+        I->getMF()->getFunction().needsUnwindTableEntry())
+      return E;
+  }
+
+  bool MayAccessStack = false;
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr &MI = *MBBI;
 
@@ -1789,8 +1807,13 @@
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset))
+    const int Offset = isMatchingUpdateInsn(*I, MI, BaseReg, UnscaledOffset);
+    if (Offset) {
+      // Don't fold a stack pop past instructions that may access the stack.
+      if (MayAccessStack && Offset > 0)
+        continue;
       return MBBI;
+    }
 
     // Update the status of what the instruction clobbered and used.
     LiveRegUnits::accumulateUsedDefed(MI, ModifiedRegUnits, UsedRegUnits, TRI);
@@ -1800,6 +1823,25 @@
     if (!ModifiedRegUnits.available(BaseReg) ||
         !UsedRegUnits.available(BaseReg))
       return E;
+
+    // FIXME: For now we conservatively treat any instruction that might access
+    // the stack as a hazard. We should check that the access lies within the
+    // region delimited by the two memory operands (e.g. [SP, MBBI]), but that
+    // requires a new may-alias API. Other registers may also point to the stack.
+    if (!MayAccessStack && BaseRegSP && MBBI->mayLoadOrStore()) {
+      if (MBBI->memoperands_empty()) {
+        MayAccessStack = true;
+        continue;
+      }
+      MachineFrameInfo *MFI = &(I->getMF()->getFrameInfo());
+      assert(MFI);
+      for (MachineMemOperand *MMO : MBBI->memoperands()) {
+        const PseudoSourceValue *PSVa = MMO->getPseudoValue();
+        MayAccessStack = (!PSVa) || PSVa->mayAlias(MFI);
+        if (MayAccessStack)
+          break;
+      }
+    }
   }
   return E;
 }
@@ -1844,7 +1886,7 @@
     ++Count;
 
     // If we found a match, return it.
-    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset))
+    if (isMatchingUpdateInsn(*I, MI, BaseReg, Offset) != 0)
       return MBBI;
 
     // Update the status of what the instruction clobbered and used.
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
@@ -0,0 +1,85 @@
+# RUN: llc -start-before=aarch64-ldst-opt -o - %s | FileCheck %s
+# CHECK-NOT: stp xzr, xzr, [sp], #16
+# CHECK: add sp, sp, #16
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-arm-none-eabi"
+
+  define hidden i32 @foo(i32 %0) {
+    %2 = alloca [4 x i32], align 4
+    %3 = bitcast [4 x i32]* %2 to i8*
+    call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %3, i8 0, i64 16, i1 false)
+    %4 = sext i32 %0 to i64
+    %5 = getelementptr inbounds [4 x i32], [4 x i32]* %2, i64 0, i64 %4
+    %6 = load i32, i32* %5, align 4
+    ret i32 %6
+  }
+
+  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!"clang version 11.0.0 "}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C++ TBAA"}
+
+...
+---
+name: foo
+alignment: 8
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 16
+  offsetAdjustment: 0
+  maxAlignment: 8
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 16
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, type: default, offset: -16, size: 16,
+      alignment: 8, stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0 (%ir-block.1):
+    liveins: $w0
+
+    $sp = frame-setup SUBXri $sp, 16, 0
+    $x8 = ADDXri $sp, 0, 0
+    STRXui $xzr, $sp, 1 :: (store 8 into %ir.3 + 8)
+    STRXui $xzr, $sp, 0 :: (store 8 into %ir.3)
+    renamable $w0 = LDRWroW killed renamable $x8, killed renamable $w0, 1, 1 :: (load 4 from %ir.5, !tbaa !2)
+    $sp = frame-destroy ADDXri $sp, 16, 0
+    RET undef $lr, implicit $w0
+
+...
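The hazard check above is easier to follow in isolation. Below is a minimal, self-contained C++ sketch, not part of the patch itself, of the conservative "might this instruction touch the stack?" test that the new code performs inline in findMatchingUpdateInsnForward; the free-standing helper and its name mayAccessStack are invented for illustration:

#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineMemOperand.h"
#include "llvm/CodeGen/PseudoSourceValue.h"

using namespace llvm;

// Conservatively decide whether MI might load from or store to the stack.
static bool mayAccessStack(const MachineInstr &MI,
                           const MachineFrameInfo &MFI) {
  if (!MI.mayLoadOrStore())
    return false;
  // No memory operands means we know nothing about the access, so we must
  // assume the worst.
  if (MI.memoperands_empty())
    return true;
  for (const MachineMemOperand *MMO : MI.memoperands()) {
    const PseudoSourceValue *PSV = MMO->getPseudoValue();
    // A plain IR pointer (no PseudoSourceValue) may point anywhere,
    // including the stack; otherwise, ask the PSV whether it can alias a
    // frame object.
    if (!PSV || PSV->mayAlias(&MFI))
      return true;
  }
  return false;
}

An instruction without memory operands yields no PseudoSourceValue to query, which is why both the patch and this sketch treat it as a potential stack access; that conservatism is exactly what the FIXME about a proper may-alias API refers to.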
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,31 +1,41 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
 
-; CHECK-LABEL: _test:
-; CHECK-DAG: fmov.2d v0, #2.00000000
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test(float * %p1, i32 %v1) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    fmov.2d v0, #2.00000000
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
   store float %v2, float* %p1, align 4
   ret void
 }
 
-; CHECK-LABEL: _test2
-; CHECK: movi.16b v0, #63
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test2(float * %p1, i32 %v1) {
+; CHECK-LABEL: test2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    movi.16b v0, #63
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000>, i32 %v1
   store float %v2, float* %p1, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -24,10 +24,12 @@
 %struct.S2 = type { i32, i32, i32, i32 }
 
 define dso_local [2 x i64] @"?f2"() {
 entry:
+; FIXME: Missed optimization: the entire SP push/pop could be removed.
 ; CHECK-LABEL: f2
-; CHECK: stp xzr, xzr, [sp], #16
-; CHECK: mov x0, xzr
-; CHECK: mov x1, xzr
+; CHECK: stp xzr, xzr, [sp, #-16]!
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: add sp, sp, #16
   %retval = alloca %struct.S2, align 4
   %a = getelementptr inbounds %struct.S2, %struct.S2* %retval, i32 0, i32 0
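The arm64-windows-calls.ll change shows the Windows side of the fix: the post-index pop is no longer formed, leaving the explicit add sp, sp, #16. The guard that produces this behaviour amounts to the following standalone C++ sketch; it is not patch code, the helper name shouldBlockSPFoldForWinCFI is invented, and the patch performs the same check inline before the forward scan:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Target/TargetMachine.h"

using namespace llvm;

// Refuse to fold SP updates into loads/stores whenever the function must
// carry Windows unwind info: the folded form would no longer match the
// emitted SEH unwind opcodes.
static bool shouldBlockSPFoldForWinCFI(const MachineInstr &MemMI) {
  const MachineFunction &MF = *MemMI.getMF();
  const MCAsmInfo *MAI = MF.getTarget().getMCAsmInfo();
  return MAI->usesWindowsCFI() &&
         MF.getFunction().needsUnwindTableEntry();
}

Blocking the fold is the conservative choice: merging the SP adjustment changes the prologue/epilogue shape that the Windows unwind info describes, so until that info can be rewritten alongside the fold, keeping the separate SP adjustment is the safe behaviour the updated CHECK lines pin down.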