diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
 #include "llvm/CodeGen/MachineRegisterInfo.h"
 #include "llvm/CodeGen/TargetRegisterInfo.h"
 #include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
 #include "llvm/MC/MCRegisterInfo.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/CommandLine.h"
@@ -1780,6 +1781,21 @@
 
   ModifiedRegUnits.clear();
   UsedRegUnits.clear();
   ++MBBI;
+
+  // We can't fold an SP increment into a post-indexed access if any
+  // instruction between the memory access (I) and the increment (MBBI) may
+  // access the stack region being deallocated, i.e. [SP, SP + offset).
+  const bool BaseRegSP = BaseReg == AArch64::SP;
+  if (BaseRegSP) {
+    // FIXME: For now, we always block the optimization over SP on Windows
+    // targets, as it would require adjusting the unwind/debug info; getting
+    // the unwind info wrong can actually cause a miscompile.
+    const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+    if (MAI->usesWindowsCFI() &&
+        I->getMF()->getFunction().needsUnwindTableEntry())
+      return E;
+  }
+
   for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
     MachineInstr &MI = *MBBI;
@@ -1797,8 +1813,11 @@
 
     // Otherwise, if the base register is used or modified, we have no match, so
     // return early.
+    // When the base register is SP, also reject any instruction that may load
+    // or store in between the memory access and the folded SP update.
     if (!ModifiedRegUnits.available(BaseReg) ||
-        !UsedRegUnits.available(BaseReg))
+        !UsedRegUnits.available(BaseReg) ||
+        (BaseRegSP && MBBI->mayLoadOrStore()))
       return E;
   }
   return E;
diff --git a/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/aarch64-ldst-no-premature-sp-pop.mir
@@ -0,0 +1,85 @@
+# RUN: llc -start-before=aarch64-ldst-opt -o - %s | FileCheck %s
+# CHECK-NOT: stp xzr, xzr, [sp], #16
+# CHECK: add sp, sp, #16
+--- |
+  target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+  target triple = "aarch64-arm-none-eabi"
+
+  define hidden i32 @foo(i32 %0) {
+    %2 = alloca [4 x i32], align 4
+    %3 = bitcast [4 x i32]* %2 to i8*
+    call void @llvm.memset.p0i8.i64(i8* nonnull align 4 dereferenceable(16) %3, i8 0, i64 16, i1 false)
+    %4 = sext i32 %0 to i64
+    %5 = getelementptr inbounds [4 x i32], [4 x i32]* %2, i64 0, i64 %4
+    %6 = load i32, i32* %5, align 4
+    ret i32 %6
+  }
+
+  declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #2
+  declare void @llvm.stackprotector(i8*, i8**) #3
+
+  !llvm.module.flags = !{!0}
+  !llvm.ident = !{!1}
+
+  !0 = !{i32 1, !"wchar_size", i32 4}
+  !1 = !{!"clang version 11.0.0 "}
+  !2 = !{!3, !3, i64 0}
+  !3 = !{!"int", !4, i64 0}
+  !4 = !{!"omnipotent char", !5, i64 0}
+  !5 = !{!"Simple C++ TBAA"}
+
+...
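+# The machine function below zeroes the 16-byte frame with two stores, reads
+# a[w0] through $x8 (a copy of SP), and only then frees the frame. Folding the
+# frame-destroy ADDXri into the paired stores as "stp xzr, xzr, [sp], #16"
+# would pop SP before the LDRWroW executes, making the load read from stack
+# memory that has already been deallocated.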
+---
+name: foo
+alignment: 8
+exposesReturnsTwice: false
+legalized: false
+regBankSelected: false
+selected: false
+failedISel: false
+tracksRegLiveness: true
+hasWinCFI: false
+registers: []
+liveins:
+  - { reg: '$w0', virtual-reg: '' }
+frameInfo:
+  isFrameAddressTaken: false
+  isReturnAddressTaken: false
+  hasStackMap: false
+  hasPatchPoint: false
+  stackSize: 16
+  offsetAdjustment: 0
+  maxAlignment: 8
+  adjustsStack: false
+  hasCalls: false
+  stackProtector: ''
+  maxCallFrameSize: 0
+  cvBytesOfCalleeSavedRegisters: 0
+  hasOpaqueSPAdjustment: false
+  hasVAStart: false
+  hasMustTailInVarArgFunc: false
+  localFrameSize: 16
+  savePoint: ''
+  restorePoint: ''
+fixedStack: []
+stack:
+  - { id: 0, type: default, offset: -16, size: 16, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      local-offset: -16, debug-info-variable: '', debug-info-expression: '',
+      debug-info-location: '' }
+callSites: []
+constants: []
+machineFunctionInfo: {}
+body: |
+  bb.0 (%ir-block.1):
+    liveins: $w0
+
+    $sp = frame-setup SUBXri $sp, 16, 0
+    $x8 = ADDXri $sp, 0, 0
+    STRXui $xzr, $sp, 1 :: (store 8 into %ir.3 + 8)
+    STRXui $xzr, $sp, 0 :: (store 8 into %ir.3)
+    renamable $w0 = LDRWroW killed renamable $x8, killed renamable $w0, 1, 1 :: (load 4 from %ir.5, !tbaa !2)
+    $sp = frame-destroy ADDXri $sp, 16, 0
+    RET undef $lr, implicit $w0
+
+...
diff --git a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
--- a/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-nvcast.ll
@@ -1,31 +1,41 @@
 ; RUN: llc < %s -mtriple=arm64-apple-ios | FileCheck %s
 
-; CHECK-LABEL: _test:
-; CHECK-DAG: fmov.2d v0, #2.00000000
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test(float * %p1, i32 %v1) {
+; CHECK-LABEL: test:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    fmov.2d v0, #2.00000000
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0.000000e+00, float 2.000000e+00, float 0.000000e+00>, i32 %v1
   store float %v2, float* %p1, align 4
   ret void
 }
 
-; CHECK-LABEL: _test2
-; CHECK: movi.16b v0, #63
-; CHECK-DAG: and [[MASK_IDX:x[0-9]+]], x1, #0x3
-; CHECK-DAG: str q0, [sp], #16
-; CHECK-DAG: mov x9, sp
-; CHECK-DAG: bfi [[PTR:x[0-9]+]], [[MASK_IDX]], #2, #2
-; CHECK: ldr s0, {{\[}}[[PTR]]{{\]}}
-; CHECK: str s0, [x0]
-
 define void @test2(float * %p1, i32 %v1) {
+; CHECK-LABEL: test2:
+; CHECK:       ; %bb.0: ; %entry
+; CHECK-NEXT:    sub sp, sp, #16 ; =16
+; CHECK-NEXT:    .cfi_def_cfa_offset 16
+; CHECK-NEXT:    ; kill: def $w1 killed $w1 def $x1
+; CHECK-NEXT:    movi.16b v0, #63
+; CHECK-NEXT:    and x8, x1, #0x3
+; CHECK-NEXT:    mov x9, sp
+; CHECK-NEXT:    str q0, [sp]
+; CHECK-NEXT:    bfi x9, x8, #2, #2
+; CHECK-NEXT:    ldr s0, [x9]
+; CHECK-NEXT:    str s0, [x0]
+; CHECK-NEXT:    add sp, sp, #16 ; =16
+; CHECK-NEXT:    ret
 entry:
   %v2 = extractelement <3 x float> <float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000, float 0x3FE7E7E7E0000000>, i32 %v1
   store float %v2, float* %p1, align 4
diff --git a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
--- a/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-windows-calls.ll
@@ -24,10 +24,12 @@
 %struct.S2 = type { i32, i32, i32, i32 }
 
 define dso_local [2 x i64] @"?f2"() {
 entry:
+; FIXME: Missed optimization: the entire SP push/pop could be removed.
 ; CHECK-LABEL: f2
-; CHECK: stp xzr, xzr, [sp], #16
-; CHECK: mov x0, xzr
-; CHECK: mov x1, xzr
+; CHECK: stp xzr, xzr, [sp, #-16]!
+; CHECK-NEXT: mov x0, xzr
+; CHECK-NEXT: mov x1, xzr
+; CHECK-NEXT: add sp, sp, #16
   %retval = alloca %struct.S2, align 4
   %a = getelementptr inbounds %struct.S2, %struct.S2* %retval, i32 0, i32 0
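
For context, the failure mode this patch guards against can be sketched at the source level. The C function below is a hypothetical reconstruction of @foo from the new MIR test, and the assembly in the comments is illustrative, inferred from the test's CHECK lines rather than taken verbatim from compiler output:

/* Hypothetical reconstruction of @foo in aarch64-ldst-no-premature-sp-pop.mir. */
int foo(int i) {
  int a[4] = {0, 0, 0, 0}; /* lowered to two 64-bit zero stores into the frame */
  return a[i];             /* loaded through a register copy of SP */
}

/* Correct code, as enforced by the CHECK lines:
 *
 *   sub sp, sp, #16             ; allocate the frame
 *   stp xzr, xzr, [sp]          ; zero a[]
 *   ldr w0, [x8, w0, sxtw #2]   ; read a[i]; x8 holds a copy of SP
 *   add sp, sp, #16             ; frame freed only after the load
 *
 * Without the mayLoadOrStore() guard, the pass could fold the SP update into
 * the zeroing stores:
 *
 *   stp xzr, xzr, [sp], #16     ; zero a[] and pop the frame early
 *   ldr w0, [x8, w0, sxtw #2]   ; reads below SP; the slot may be clobbered
 *                               ; (e.g. by a signal handler) before the load
 */

The separate early return for Windows is stricter because any SP fold also changes the epilogue shape that the emitted unwind info describes; as the FIXME notes, mismatched unwind info can itself cause a miscompile.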