diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -338,7 +338,7 @@ // Emit the second SP adjustment after saving callee saved registers. if (FirstSPAdjustAmount) { - uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + int64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); adjustReg(MBB, MBBI, DL, SPReg, SPReg, -SecondSPAdjustAmount, @@ -444,7 +444,7 @@ uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF); if (FirstSPAdjustAmount) { - uint64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; + int64_t SecondSPAdjustAmount = MFI.getStackSize() - FirstSPAdjustAmount; assert(SecondSPAdjustAmount > 0 && "SecondSPAdjustAmount should be greater than zero"); @@ -632,24 +632,55 @@ const std::vector &CSI = MFI.getCalleeSavedInfo(); uint64_t StackSize = MFI.getStackSize(); + const uint64_t DoNotSplitSPAdjustment = 0; + // Disable SplitSPAdjust if save-restore libcall used. The callee saved // registers will be pushed by the save-restore libcalls, so we don't have to // split the SP adjustment in this case. if (RVFI->getLibCallStackSize()) - return 0; - - // Return the FirstSPAdjustAmount if the StackSize can not fit in signed - // 12-bit and there exists a callee saved register need to be pushed. - if (!isInt<12>(StackSize) && (CSI.size() > 0)) { - // FirstSPAdjustAmount is choosed as (2048 - StackAlign) - // because 2048 will cause sp = sp + 2048 in epilogue split into - // multi-instructions. The offset smaller than 2048 can fit in signle - // load/store instruction and we have to stick with the stack alignment. - // 2048 is 16-byte alignment. The stack alignment for RV32 and RV64 is 16, - // for RV32E is 4. So (2048 - StackAlign) will satisfy the stack alignment. 
- return 2048 - getStackAlign().value(); + return DoNotSplitSPAdjustment; + + // If we have any Callee-saved registers, we want to ensure they are saved + // using minimal instructions, which will require splitting the stack + // adjustment. + if (CSI.size() > 0) { + const RISCVSubtarget &STI = MF.getSubtarget(); + + // We want to split if the stack size is over a certain threshold. This + // threshold is set by which instructions are available, as we'd prefer to + // use the smallest instructions available. + uint32_t OffsetAddressableLimit; + if (STI.hasStdExtC() && !STI.is64Bit()) { + // On RV32*C, we want to use c.lwsp and c.swsp. These take a 6-bit offset + // scaled by 4, so it must be a multiple of 4 and at most 252. + OffsetAddressableLimit = 252; + } else if (STI.hasStdExtC() && STI.is64Bit()) { + // On RV64*C, we want to use c.ldsp and c.sdsp. These take a 6-bit offset + // scaled by 8, so it must be a multiple of 8 and at most 504. + OffsetAddressableLimit = 504; + } else { + // If we don't have compressed instructions, we want to use the offset in + // l{d,w} or s{d,w}, which has a 12-bit limit, so 2048. + // + // However, we also want to ensure that we can do both "first" stack + // adjustments in one single instruction, preferably `addi`. In the + // prolog, this will be `addi sp, sp, -2048` which will fit a limit of + // 2048, but undoing this in the epilog (`addi sp, sp, 2048`) does not fit + // into a single instruction. + // + // So, in the end we need to choose a value less than 2048, to fit into + // the limit. We would most prefer the offsets remained as aligned as the + // stack is, so we choose 2048 - StackAlign. + OffsetAddressableLimit = 2048 - getStackAlign().value(); + } + + // There's only a point in splitting if the stack size is over the threshold + // we found. Otherwise we'll have no problem addressing these offsets. 
+ if (StackSize > OffsetAddressableLimit) + return OffsetAddressableLimit; } - return 0; + + return DoNotSplitSPAdjustment; } bool RISCVFrameLowering::spillCalleeSavedRegisters( diff --git a/llvm/test/CodeGen/RISCV/split-sp-adjust.ll b/llvm/test/CodeGen/RISCV/split-sp-adjust.ll --- a/llvm/test/CodeGen/RISCV/split-sp-adjust.ll +++ b/llvm/test/CodeGen/RISCV/split-sp-adjust.ll @@ -1,45 +1,283 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -mtriple=riscv32 -verify-machineinstrs < %s \ ; RUN: | FileCheck %s -check-prefix=RV32I +; RUN: llc -mtriple=riscv32 -mattr=+c -verify-machineinstrs -riscv-no-aliases < %s \ +; RUN: | FileCheck %s -check-prefix=RV32IC +; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s -check-prefix=RV64I +; RUN: llc -mtriple=riscv64 -mattr=+c -verify-machineinstrs -riscv-no-aliases < %s \ +; RUN: | FileCheck %s -check-prefix=RV64IC -; The stack size is 2048 and the SP adjustment will be split. -define i32 @SplitSP() nounwind { -; RV32I-LABEL: SplitSP: +; These tests check split stack pointer adjustment. +; +; Instead of adjusting the stack pointer once, if the stack is larger than a +; threshold, we adjust it twice: once before saving the CSRs, and once after. +; The intention here is to ensure that the offsets when saving the CSRs fit into +; the offset field in the CSR load/store instructions, so the threshold depends +; on which instructions are available. +; +; We show which instructions are compressed, because the important thing is that +; the thresholds allow the stack offsets to fit into c.{l,s}{d,w}sp if +; compressed instructions are available, and `{l,s}{d,w}` if not. 
+ +declare i32 @use_pointer(i8*) + +; stack size is 128 and stack pointer adjustment will not be split +define i32 @stack_never_split() nounwind { +; RV32I-LABEL: stack_never_split: ; RV32I: # %bb.0: # %entry -; RV32I-NEXT: addi sp, sp, -2032 -; RV32I-NEXT: sw ra, 2028(sp) -; RV32I-NEXT: addi sp, sp, -16 -; RV32I-NEXT: addi a0, sp, 16 -; RV32I-NEXT: call foo -; RV32I-NEXT: mv a0, zero -; RV32I-NEXT: addi sp, sp, 16 -; RV32I-NEXT: lw ra, 2028(sp) -; RV32I-NEXT: addi sp, sp, 2032 +; RV32I-NEXT: addi sp, sp, -128 +; RV32I-NEXT: sw ra, 124(sp) +; RV32I-NEXT: addi a0, sp, 4 +; RV32I-NEXT: call use_pointer +; RV32I-NEXT: lw ra, 124(sp) +; RV32I-NEXT: addi sp, sp, 128 ; RV32I-NEXT: ret +; +; RV32IC-LABEL: stack_never_split: +; RV32IC: # %bb.0: # %entry +; RV32IC-NEXT: c.addi16sp sp, -128 +; RV32IC-NEXT: c.swsp ra, 124(sp) +; RV32IC-NEXT: c.addi4spn a0, sp, 4 +; RV32IC-NEXT: call use_pointer +; RV32IC-NEXT: c.lwsp ra, 124(sp) +; RV32IC-NEXT: c.addi16sp sp, 128 +; RV32IC-NEXT: c.jr ra +; +; RV64I-LABEL: stack_never_split: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -128 +; RV64I-NEXT: sd ra, 120(sp) +; RV64I-NEXT: mv a0, sp +; RV64I-NEXT: call use_pointer +; RV64I-NEXT: ld ra, 120(sp) +; RV64I-NEXT: addi sp, sp, 128 +; RV64I-NEXT: ret +; +; RV64IC-LABEL: stack_never_split: +; RV64IC: # %bb.0: # %entry +; RV64IC-NEXT: c.addi16sp sp, -128 +; RV64IC-NEXT: c.sdsp ra, 120(sp) +; RV64IC-NEXT: c.mv a0, sp +; RV64IC-NEXT: call use_pointer +; RV64IC-NEXT: c.ldsp ra, 120(sp) +; RV64IC-NEXT: c.addi16sp sp, 128 +; RV64IC-NEXT: c.jr ra entry: - %xx = alloca [2028 x i8], align 1 - %0 = getelementptr inbounds [2028 x i8], [2028 x i8]* %xx, i32 0, i32 0 - %call = call i32 @foo(i8* nonnull %0) - ret i32 0 + %xx = alloca [120 x i8], align 1 + %0 = getelementptr inbounds [120 x i8], [120 x i8]* %xx, i32 0, i32 0 + %call = call i32 @use_pointer(i8* nonnull %0) + ret i32 %call +} + +; stack size is 240 (RV32) or 256 (RV64) and stack pointer adjustment will not +; be split +define 
i32 @stack_never_split_2() nounwind { +; RV32I-LABEL: stack_never_split_2: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -240 +; RV32I-NEXT: sw ra, 236(sp) +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: call use_pointer +; RV32I-NEXT: lw ra, 236(sp) +; RV32I-NEXT: addi sp, sp, 240 +; RV32I-NEXT: ret +; +; RV32IC-LABEL: stack_never_split_2: +; RV32IC: # %bb.0: # %entry +; RV32IC-NEXT: c.addi16sp sp, -240 +; RV32IC-NEXT: c.swsp ra, 236(sp) +; RV32IC-NEXT: c.mv a0, sp +; RV32IC-NEXT: call use_pointer +; RV32IC-NEXT: c.lwsp ra, 236(sp) +; RV32IC-NEXT: c.addi16sp sp, 240 +; RV32IC-NEXT: c.jr ra +; +; RV64I-LABEL: stack_never_split_2: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -256 +; RV64I-NEXT: sd ra, 248(sp) +; RV64I-NEXT: addi a0, sp, 12 +; RV64I-NEXT: call use_pointer +; RV64I-NEXT: ld ra, 248(sp) +; RV64I-NEXT: addi sp, sp, 256 +; RV64I-NEXT: ret +; +; RV64IC-LABEL: stack_never_split_2: +; RV64IC: # %bb.0: # %entry +; RV64IC-NEXT: c.addi16sp sp, -256 +; RV64IC-NEXT: c.sdsp ra, 248(sp) +; RV64IC-NEXT: c.addi4spn a0, sp, 12 +; RV64IC-NEXT: call use_pointer +; RV64IC-NEXT: c.ldsp ra, 248(sp) +; RV64IC-NEXT: c.addi16sp sp, 256 +; RV64IC-NEXT: c.jr ra +entry: + %xx = alloca [236 x i8], align 1 + %0 = getelementptr inbounds [236 x i8], [236 x i8]* %xx, i32 0, i32 0 + %call = call i32 @use_pointer(i8* nonnull %0) + ret i32 %call +} + +; stack size is 256 and stack pointer adjustment will be split on RV32*C +define i32 @stack_split_rv32_c() nounwind { +; RV32I-LABEL: stack_split_rv32_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -256 +; RV32I-NEXT: sw ra, 252(sp) +; RV32I-NEXT: addi a0, sp, 8 +; RV32I-NEXT: call use_pointer +; RV32I-NEXT: lw ra, 252(sp) +; RV32I-NEXT: addi sp, sp, 256 +; RV32I-NEXT: ret +; +; RV32IC-LABEL: stack_split_rv32_c: +; RV32IC: # %bb.0: # %entry +; RV32IC-NEXT: addi sp, sp, -252 +; RV32IC-NEXT: c.swsp ra, 248(sp) +; RV32IC-NEXT: c.addi sp, -4 +; RV32IC-NEXT: c.addi4spn a0, sp, 8 +; RV32IC-NEXT: call use_pointer +; 
RV32IC-NEXT: c.addi sp, 4 +; RV32IC-NEXT: c.lwsp ra, 248(sp) +; RV32IC-NEXT: addi sp, sp, 252 +; RV32IC-NEXT: c.jr ra +; +; RV64I-LABEL: stack_split_rv32_c: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -256 +; RV64I-NEXT: sd ra, 248(sp) +; RV64I-NEXT: addi a0, sp, 4 +; RV64I-NEXT: call use_pointer +; RV64I-NEXT: ld ra, 248(sp) +; RV64I-NEXT: addi sp, sp, 256 +; RV64I-NEXT: ret +; +; RV64IC-LABEL: stack_split_rv32_c: +; RV64IC: # %bb.0: # %entry +; RV64IC-NEXT: c.addi16sp sp, -256 +; RV64IC-NEXT: c.sdsp ra, 248(sp) +; RV64IC-NEXT: c.addi4spn a0, sp, 4 +; RV64IC-NEXT: call use_pointer +; RV64IC-NEXT: c.ldsp ra, 248(sp) +; RV64IC-NEXT: c.addi16sp sp, 256 +; RV64IC-NEXT: c.jr ra +entry: + %xx = alloca [244 x i8], align 1 + %0 = getelementptr inbounds [244 x i8], [244 x i8]* %xx, i32 0, i32 0 + %call = call i32 @use_pointer(i8* nonnull %0) + ret i32 %call +} + +; stack size is 496 (RV32) or 512 (RV64) and stack pointer adjustment will be +; split on RV32C and RV64C +define i32 @stack_split_rv64_c() nounwind { +; RV32I-LABEL: stack_split_rv64_c: +; RV32I: # %bb.0: # %entry +; RV32I-NEXT: addi sp, sp, -496 +; RV32I-NEXT: sw ra, 492(sp) +; RV32I-NEXT: mv a0, sp +; RV32I-NEXT: call use_pointer +; RV32I-NEXT: lw ra, 492(sp) +; RV32I-NEXT: addi sp, sp, 496 +; RV32I-NEXT: ret +; +; RV32IC-LABEL: stack_split_rv64_c: +; RV32IC: # %bb.0: # %entry +; RV32IC-NEXT: addi sp, sp, -252 +; RV32IC-NEXT: c.swsp ra, 248(sp) +; RV32IC-NEXT: addi sp, sp, -244 +; RV32IC-NEXT: c.mv a0, sp +; RV32IC-NEXT: call use_pointer +; RV32IC-NEXT: addi sp, sp, 244 +; RV32IC-NEXT: c.lwsp ra, 248(sp) +; RV32IC-NEXT: addi sp, sp, 252 +; RV32IC-NEXT: c.jr ra +; +; RV64I-LABEL: stack_split_rv64_c: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -512 +; RV64I-NEXT: sd ra, 504(sp) +; RV64I-NEXT: addi a0, sp, 12 +; RV64I-NEXT: call use_pointer +; RV64I-NEXT: ld ra, 504(sp) +; RV64I-NEXT: addi sp, sp, 512 +; RV64I-NEXT: ret +; +; RV64IC-LABEL: stack_split_rv64_c: +; RV64IC: # %bb.0: # %entry +; 
RV64IC-NEXT: addi sp, sp, -504 +; RV64IC-NEXT: c.sdsp ra, 496(sp) +; RV64IC-NEXT: c.addi sp, -8 +; RV64IC-NEXT: c.addi4spn a0, sp, 12 +; RV64IC-NEXT: call use_pointer +; RV64IC-NEXT: c.addi sp, 8 +; RV64IC-NEXT: c.ldsp ra, 496(sp) +; RV64IC-NEXT: addi sp, sp, 504 +; RV64IC-NEXT: c.jr ra +entry: + %xx = alloca [492 x i8], align 1 + %0 = getelementptr inbounds [492 x i8], [492 x i8]* %xx, i32 0, i32 0 + %call = call i32 @use_pointer(i8* nonnull %0) + ret i32 %call } -; The stack size is 2032 and the SP adjustment will not be split. -define i32 @NoSplitSP() nounwind { -; RV32I-LABEL: NoSplitSP: +; The stack size is 2048 and the SP adjustment will always be split +define i32 @stack_split_always() nounwind { +; RV32I-LABEL: stack_split_always: ; RV32I: # %bb.0: # %entry ; RV32I-NEXT: addi sp, sp, -2032 ; RV32I-NEXT: sw ra, 2028(sp) -; RV32I-NEXT: addi a0, sp, 4 -; RV32I-NEXT: call foo +; RV32I-NEXT: addi sp, sp, -16 +; RV32I-NEXT: addi a0, sp, 16 +; RV32I-NEXT: call use_pointer ; RV32I-NEXT: mv a0, zero +; RV32I-NEXT: addi sp, sp, 16 ; RV32I-NEXT: lw ra, 2028(sp) ; RV32I-NEXT: addi sp, sp, 2032 ; RV32I-NEXT: ret +; +; RV32IC-LABEL: stack_split_always: +; RV32IC: # %bb.0: # %entry +; RV32IC-NEXT: addi sp, sp, -252 +; RV32IC-NEXT: c.swsp ra, 248(sp) +; RV32IC-NEXT: addi sp, sp, -1796 +; RV32IC-NEXT: c.addi4spn a0, sp, 16 +; RV32IC-NEXT: call use_pointer +; RV32IC-NEXT: c.li a0, 0 +; RV32IC-NEXT: addi sp, sp, 1796 +; RV32IC-NEXT: c.lwsp ra, 248(sp) +; RV32IC-NEXT: addi sp, sp, 252 +; RV32IC-NEXT: c.jr ra +; +; RV64I-LABEL: stack_split_always: +; RV64I: # %bb.0: # %entry +; RV64I-NEXT: addi sp, sp, -2032 +; RV64I-NEXT: sd ra, 2024(sp) +; RV64I-NEXT: addi sp, sp, -16 +; RV64I-NEXT: addi a0, sp, 12 +; RV64I-NEXT: call use_pointer +; RV64I-NEXT: mv a0, zero +; RV64I-NEXT: addi sp, sp, 16 +; RV64I-NEXT: ld ra, 2024(sp) +; RV64I-NEXT: addi sp, sp, 2032 +; RV64I-NEXT: ret +; +; RV64IC-LABEL: stack_split_always: +; RV64IC: # %bb.0: # %entry +; RV64IC-NEXT: addi sp, sp, -504 +; 
RV64IC-NEXT: c.sdsp ra, 496(sp) +; RV64IC-NEXT: addi sp, sp, -1544 +; RV64IC-NEXT: c.addi4spn a0, sp, 12 +; RV64IC-NEXT: call use_pointer +; RV64IC-NEXT: c.li a0, 0 +; RV64IC-NEXT: addi sp, sp, 1544 +; RV64IC-NEXT: c.ldsp ra, 496(sp) +; RV64IC-NEXT: addi sp, sp, 504 +; RV64IC-NEXT: c.jr ra entry: - %xx = alloca [2024 x i8], align 1 - %0 = getelementptr inbounds [2024 x i8], [2024 x i8]* %xx, i32 0, i32 0 - %call = call i32 @foo(i8* nonnull %0) + %xx = alloca [2028 x i8], align 1 + %0 = getelementptr inbounds [2028 x i8], [2028 x i8]* %xx, i32 0, i32 0 + %call = call i32 @use_pointer(i8* nonnull %0) ret i32 0 } - -declare i32 @foo(i8*)