diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp --- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp @@ -128,6 +128,19 @@ const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) { MachineFunction &MF = *MBB.getParent(); const ARMSubtarget &ST = MF.getSubtarget(); + + // Use a single sp-relative add if the immediate is small enough. + if (BaseReg == ARM::SP && + (DestReg.isVirtual() || isARMLowRegister(DestReg)) && NumBytes >= 0 && + NumBytes <= 1020 && (NumBytes % 4) == 0) { + BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDrSPi), DestReg) + .addReg(ARM::SP) + .addImm(NumBytes / 4) + .add(predOps(ARMCC::AL)) + .setMIFlags(MIFlags); + return; + } + bool isHigh = !isARMLowRegister(DestReg) || (BaseReg != 0 && !isARMLowRegister(BaseReg)); bool isSub = false; @@ -422,19 +435,33 @@ return true; } + // The offset doesn't fit, but we may be able to put some of the offset into + // the ldr to simplify the generation of the rest of it. NumBits = 5; Mask = (1 << NumBits) - 1; - - // If this is a thumb spill / restore, we will be using a constpool load to - // materialize the offset. - if (Opcode == ARM::tLDRspi || Opcode == ARM::tSTRspi) { - ImmOp.ChangeToImmediate(0); - } else { - // Otherwise, it didn't fit. Pull in what we can to simplify the immed. - ImmedOffset = ImmedOffset & Mask; - ImmOp.ChangeToImmediate(ImmedOffset); - Offset &= ~(Mask * Scale); + InstrOffs = 0; + auto &ST = MF.getSubtarget(); + // If using the maximum ldr offset will put the rest into the range of a + // single sp-relative add then do so. + if (FrameReg == ARM::SP && Offset - (Mask * Scale) <= 1020) { + InstrOffs = Mask; + } else if (ST.genExecuteOnly()) { + // With execute-only the offset is generated either with movw+movt or an + // add+lsl sequence. If subtracting an offset will make the top half zero + // then that saves a movt or lsl+add. Otherwise if we don't have movw then + // we may be able to subtract a value such that it makes the bottom byte + // zero, saving an add. + unsigned BottomBits = (Offset / Scale) & Mask; + bool CanMakeBottomByteZero = ((Offset - BottomBits * Scale) & 0xff) == 0; + bool TopHalfZero = (Offset & 0xffff0000) == 0; + bool CanMakeTopHalfZero = ((Offset - Mask * Scale) & 0xffff0000) == 0; + if (!TopHalfZero && CanMakeTopHalfZero) + InstrOffs = Mask; + else if (!ST.useMovt() && CanMakeBottomByteZero) + InstrOffs = BottomBits; } + ImmOp.ChangeToImmediate(InstrOffs); + Offset -= InstrOffs * Scale; } return Offset == 0; diff --git a/llvm/test/CodeGen/ARM/execute-only-split-offset.ll b/llvm/test/CodeGen/ARM/execute-only-split-offset.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/ARM/execute-only-split-offset.ll @@ -0,0 +1,141 @@ +; RUN: llc -mtriple=thumbv8m.base-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-MOVW %s +; RUN: llc -mtriple=thumbv6m-eabi -mattr=+execute-only %s -o - | FileCheck --check-prefixes=CHECK,CHECK-NOMOVW %s + +; Largest offset that fits into sp-relative ldr +; CHECK-LABEL: ldr_range_end: +; CHECK: ldr {{r[0-9]+}}, [sp, #1020] +define i32 @ldr_range_end() { +entry: + %var = alloca i32, align 4 + %arr = alloca [1020 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Smallest offset that fits into add+ldr +; CHECK-LABEL: add_ldr_range_start: +; CHECK: add [[REG:r[0-9]+]], sp, #900 +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @add_ldr_range_start() { +entry: + %var = alloca i32, align 4 + %arr = alloca [1024 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Largest offset that fits into add+ldr +; CHECK-LABEL: add_ldr_range_end: +; CHECK: add [[REG:r[0-9]+]], sp, #1020 +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @add_ldr_range_end() { +entry: + %var = alloca i32, align 4 + %arr = alloca [1144 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Smallest offset where we start using mov32. If we don't have movw then using +; an ldr offset means we save an add. +; CHECK-LABEL: mov32_range_start: +; CHECK-MOVW: movw [[REG:r[0-9]+]], #1148 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4 +; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8 +; CHECK-NEXT: add [[REG]], sp +; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]] +; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @mov32_range_start() { +entry: + %var = alloca i32, align 4 + %arr = alloca [1148 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Here using an ldr offset doesn't save an add so we shouldn't do it. +; CHECK-LABEL: mov32_range_next: +; CHECK-MOVW: movw [[REG:r[0-9]+]], #1152 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #4 +; CHECK-NOMOVW-NEXT: lsls [[REG]], [[REG]], #8 +; CHECK-NOMOVW-NEXT: adds [[REG]], #128 +; CHECK-NEXT: add [[REG]], sp +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]] +define i32 @mov32_range_next() { +entry: + %var = alloca i32, align 4 + %arr = alloca [1152 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Smallest offset where using an ldr offset prevents needing a movt or lsl+add +; CHECK-LABEL: can_clear_top_byte_start: +; CHECK: add sp, {{r[0-9]+}} +; CHECK-MOVW: movw [[REG:r[0-9]+]], #65412 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255 +; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8 +; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #132 +; CHECK-NEXT: add [[REG]], sp +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @can_clear_top_byte_start() { +entry: + %var = alloca i32, align 4 + %arr = alloca [65536 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Largest offset where using an ldr offset prevents needing a movt or lsl+add +; CHECK-LABEL: can_clear_top_byte_end: +; CHECK: add sp, {{r[0-9]+}} +; CHECK-MOVW: movw [[REG:r[0-9]+]], #65532 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #255 +; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #8 +; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #252 +; CHECK-NEXT: add [[REG]], sp +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @can_clear_top_byte_end() { +entry: + %var = alloca i32, align 4 + %arr = alloca [65656 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; Smallest offset where using an ldr offset doesn't clear the top byte, though +; we can use an ldr offset if not using movt to save an add of the low byte. +; CHECK-LABEL: cant_clear_top_byte_start: +; CHECK: add sp, {{r[0-9]+}} +; CHECK-MOVW: movw [[REG:r[0-9]+]], #124 +; CHECK-MOVW-NEXT: movt [[REG:r[0-9]+]], #1 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1 +; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16 +; CHECK-NEXT: add [[REG]], sp +; CHECK-MOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]]] +; CHECK-NOMOVW-NEXT: ldr {{r[0-9]+}}, [[[REG]], #124] +define i32 @cant_clear_top_byte_start() { +entry: + %var = alloca i32, align 4 + %arr = alloca [65660 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} + +; An ldr offset doesn't help for anything, so we shouldn't do it. +; CHECK-LABEL: cant_clear_top_byte_next: +; CHECK: add sp, {{r[0-9]+}} +; CHECK-MOVW: movw [[REG:r[0-9]+]], #128 +; CHECK-MOVW: movt [[REG:r[0-9]+]], #1 +; CHECK-NOMOVW: movs [[REG:r[0-9]+]], #1 +; CHECK-NOMOVW-NEXT: lsls [[REG:r[0-9]+]], [[REG:r[0-9]+]], #16 +; CHECK-NOMOVW-NEXT: adds [[REG:r[0-9]+]], #128 +; CHECK-NEXT: add [[REG]], sp +; CHECK-NEXT: ldr {{r[0-9]+}}, [[[REG]]] +define i32 @cant_clear_top_byte_next() { +entry: + %var = alloca i32, align 4 + %arr = alloca [65664 x i8], align 4 + %0 = load i32, ptr %var, align 4 + ret i32 %0 +} diff --git a/llvm/test/CodeGen/ARM/large-stack.ll b/llvm/test/CodeGen/ARM/large-stack.ll --- a/llvm/test/CodeGen/ARM/large-stack.ll +++ b/llvm/test/CodeGen/ARM/large-stack.ll @@ -44,7 +44,8 @@ ;; are we choosing correct store/tSTRspi pattern for execute-only ; CHECK: movs [[REG:r[0-9]+]], #0x30 ; CHECK-NEXT: lsls [[REG]], [[REG]], #0x18 -; CHECK-NEXT: adds [[REG]], #0x8 +; CHECK-NEXT: add [[REG]], sp +; CHECK-NEXT: str {{r[0-9]+}}, [[[REG]], #0x8] %tmp1 = load i32, ptr %tmp ret i32 %tmp1 } diff --git a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll --- a/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll +++ b/llvm/test/CodeGen/Thumb/emergency-spill-slot.ll @@ -176,18 +176,13 @@ ; CHECK-NEXT: @APP ; CHECK-NEXT: @NO_APP ; CHECK-NEXT: str r0, [sp] -; CHECK-NEXT: ldr r0, .LCPI3_0 -; CHECK-NEXT: add r0, sp -; CHECK-NEXT: str r5, [r0] +; CHECK-NEXT: add r0, sp, #904 +; CHECK-NEXT: str r5, [r0, #124] ; CHECK-NEXT: ldr r0, [sp] ; CHECK-NEXT: @APP ; CHECK-NEXT: @NO_APP ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop {r4, r5, r6, r7, pc} -; CHECK-NEXT: .p2align 2 -; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 1028 @ 0x404 entry: %asm1 = call { i32, i32, i32, i32, i32, i32, i32, i32 } asm "", "={r0},={r1},={r2},={r3},={r4},={r5},={r6},={r7},0,1,2,3,4,5,6,7"(ptr %p, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef) %asmresult = extractvalue { i32, i32, i32, i32, i32, i32, i32, i32 } %asm1, 0 diff --git a/llvm/test/CodeGen/Thumb/stack-access.ll b/llvm/test/CodeGen/Thumb/stack-access.ll --- a/llvm/test/CodeGen/Thumb/stack-access.ll +++ b/llvm/test/CodeGen/Thumb/stack-access.ll @@ -110,19 +110,18 @@ store i32 1, ptr %arr2, align 4 ; %arr2 is in range, but this element of it is not -; CHECK-DAG: ldr [[RA:r[0-9]+]], .LCPI7_2 -; CHECK-DAG: add [[RA]], sp -; CHECK-DAG: str [[REG]], [{{r[0-9]+}}] +; CHECK-DAG: add [[RA:r[0-9]+]], sp, #900 +; CHECK-DAG: str [[REG]], [{{r[0-9]+}}, #124] %arr2idx2 = getelementptr inbounds [224 x i32], ptr %arr2, i32 0, i32 32 store i32 1, ptr %arr2idx2, align 4 ; %arr3 is not in range -; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_3 +; CHECK-DAG: ldr [[RB:r[0-9]+]], .LCPI7_2 ; CHECK-DAG: add [[RB]], sp ; CHECK-DAG: str [[REG]], [{{r[0-9]+}}] store i32 1, ptr %arr3, align 4 -; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_4 +; CHECK-DAG: ldr [[RC:r[0-9]+]], .LCPI7_3 ; CHECK-DAG: add [[RC]], sp ; CHECK-DAG: str [[REG]], [{{r[0-9]+}}] %arr3idx2 = getelementptr inbounds [224 x i32], ptr %arr3, i32 0, i32 32 diff --git a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll --- a/llvm/test/CodeGen/Thumb/stack_guard_remat.ll +++ b/llvm/test/CodeGen/Thumb/stack_guard_remat.ll @@ -3,9 +3,8 @@ ; RUN: llc < %s -mtriple=thumb-apple-darwin -relocation-model=dynamic-no-pic -no-integrated-as | FileCheck %s -check-prefix=NO-PIC -check-prefix=DYNAMIC-NO-PIC ;PIC: foo2 -;PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]] -;PIC-NEXT: add [[SAVED_GUARD]], sp -;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]] +;PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904 +;PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124] ;PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]] ;PIC-NEXT: [[LABEL1:LPC[0-9_]+]]: ;PIC-NEXT: add [[ORIGINAL_GUARD]], pc @@ -13,28 +12,21 @@ ;PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]] -;PIC: [[GUARD_STACK_OFFSET]]: -;PIC-NEXT: .long 1028 ;PIC: [[ORIGINAL_GUARD_LABEL]]: ;PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr-([[LABEL1]]+4) ;NO-PIC: foo2 -;NO-PIC: ldr [[SAVED_GUARD:r[0-9]+]], [[GUARD_STACK_OFFSET:LCPI[0-9_]+]] -;NO-PIC-NEXT: add [[SAVED_GUARD]], sp -;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]]] +;NO-PIC: add [[SAVED_GUARD:r[0-9]+]], sp, #904 +;NO-PIC-NEXT: ldr [[SAVED_GUARD]], [[[SAVED_GUARD]], #124] ;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD:r[0-9]+]], [[ORIGINAL_GUARD_LABEL:LCPI[0-9_]+]] ;NO-PIC-NOT: LPC ;NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;DYNAMIC-NO-PIC-NEXT: ldr [[ORIGINAL_GUARD]], [[[ORIGINAL_GUARD]]] ;NO-PIC-NEXT: cmp [[ORIGINAL_GUARD]], [[SAVED_GUARD]] -;STATIC: [[GUARD_STACK_OFFSET]]: -;STATIC-NEXT: .long 1028 ;STATIC: [[ORIGINAL_GUARD_LABEL]]: ;STATIC-NEXT: .long ___stack_chk_guard -;DYNAMIC-NO-PIC: [[GUARD_STACK_OFFSET]]: -;DYNAMIC-NO-PIC-NEXT: .long 1028 ;DYNAMIC-NO-PIC: [[ORIGINAL_GUARD_LABEL]]: ;DYNAMIC-NO-PIC-NEXT: .long L___stack_chk_guard$non_lazy_ptr