Index: lib/Target/ARM/ARMBaseRegisterInfo.cpp
===================================================================
--- lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -548,12 +548,13 @@
   // and pick a real one.
   Offset += 128; // 128 bytes of spill slots

-  // If there is a frame pointer, try using it.
+  // If there's a frame pointer and the addressing mode allows it, try using it.
   // The FP is only available if there is no dynamic realignment. We
   // don't know for sure yet whether we'll need that, so we guess based
   // on whether there are any local variables that would trigger it.
   unsigned StackAlign = TFI->getStackAlignment();
-  if (TFI->hasFP(MF) &&
+  if (TFI->hasFP(MF) &&
+      (MI->getDesc().TSFlags & ARMII::AddrModeMask) != ARMII::AddrModeT1_s &&
       !((MFI->getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
     if (isFrameOffsetLegal(MI, FPOffset))
       return false;
@@ -670,7 +671,7 @@
     NumBits = 8;
     break;
   case ARMII::AddrModeT1_s:
-    NumBits = 5;
+    NumBits = 8;
     Scale = 4;
     isSigned = false;
     break;
Index: lib/Target/ARM/ARMISelDAGToDAG.cpp
===================================================================
--- lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -1191,6 +1191,11 @@
                                             SDValue &Base, SDValue &OffImm) {
   if (N.getOpcode() == ISD::FrameIndex) {
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
+    // Only multiples of 4 are allowed for the offset, so the frame object
+    // alignment must be at least 4.
+    MachineFrameInfo *MFI = MF->getFrameInfo();
+    if (MFI->getObjectAlignment(FI) < 4)
+      MFI->setObjectAlignment(FI, 4);
     Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     OffImm = CurDAG->getTargetConstant(0, MVT::i32);
     return true;
@@ -1208,6 +1213,11 @@
     Base = N.getOperand(0);
     if (Base.getOpcode() == ISD::FrameIndex) {
       int FI = cast<FrameIndexSDNode>(Base)->getIndex();
+      // For LHS+RHS to result in an offset that's a multiple of 4 the object
+      // indexed by the LHS must be 4-byte aligned.
+      MachineFrameInfo *MFI = MF->getFrameInfo();
+      if (MFI->getObjectAlignment(FI) < 4)
+        MFI->setObjectAlignment(FI, 4);
       Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     }
     OffImm = CurDAG->getTargetConstant(RHSC, MVT::i32);
@@ -2496,6 +2506,11 @@
     int FI = cast<FrameIndexSDNode>(N)->getIndex();
     SDValue TFI = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy());
     if (Subtarget->isThumb1Only()) {
+      // Set the alignment of the frame object to 4, to avoid having to generate
+      // more than one ADD.
+      MachineFrameInfo *MFI = MF->getFrameInfo();
+      if (MFI->getObjectAlignment(FI) < 4)
+        MFI->setObjectAlignment(FI, 4);
       return CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
                                   CurDAG->getTargetConstant(0, MVT::i32));
     } else {
Index: lib/Target/ARM/ARMInstrThumb.td
===================================================================
--- lib/Target/ARM/ARMInstrThumb.td
+++ lib/Target/ARM/ARMInstrThumb.td
@@ -1394,6 +1394,12 @@
 def : T1Pat<(zextloadi1 t_addrmode_is1:$addr),
             (tLDRBi t_addrmode_is1:$addr)>;

+// extload from the stack -> word load from the stack, as it avoids having to
+// materialize the base in a separate register.
+def : T1Pat<(extloadi1  t_addrmode_sp:$addr),  (tLDRspi t_addrmode_sp:$addr)>;
+def : T1Pat<(extloadi8  t_addrmode_sp:$addr),  (tLDRspi t_addrmode_sp:$addr)>;
+def : T1Pat<(extloadi16 t_addrmode_sp:$addr),  (tLDRspi t_addrmode_sp:$addr)>;
+
 // extload -> zextload
 def : T1Pat<(extloadi1 t_addrmode_rrs1:$addr), (tLDRBr t_addrmode_rrs1:$addr)>;
 def : T1Pat<(extloadi1 t_addrmode_is1:$addr), (tLDRBi t_addrmode_is1:$addr)>;
Index: test/CodeGen/Thumb/stack-access.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Thumb/stack-access.ll
@@ -0,0 +1,74 @@
+; RUN: llc -mtriple=thumb-eabi < %s -o - | FileCheck %s
+
+; Check that stack addresses are generated using a single ADD
+define void @test1(i8** %p) {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+  %z = alloca i8, align 1
+; CHECK: add r1, sp, #8
+; CHECK: str r1, [r0]
+  store i8* %x, i8** %p, align 4
+; CHECK: add r1, sp, #4
+; CHECK: str r1, [r0]
+  store i8* %y, i8** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store i8* %z, i8** %p, align 4
+  ret void
+}
+
+; Stack offsets larger than 1020 still need two ADDs
+define void @test2([1024 x i8]** %p) {
+  %arr1 = alloca [1024 x i8], align 1
+  %arr2 = alloca [1024 x i8], align 1
+; CHECK: add r1, sp, #1020
+; CHECK: adds r1, #4
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr1, [1024 x i8]** %p, align 4
+; CHECK: mov r1, sp
+; CHECK: str r1, [r0]
+  store [1024 x i8]* %arr2, [1024 x i8]** %p, align 4
+  ret void
+}
+
+; If possible, stack-based ldrb/ldrh are widened to use SP-based addressing
+define i32 @test3() #0 {
+  %x = alloca i8, align 1
+  %y = alloca i8, align 1
+; CHECK: ldr r0, [sp]
+  %1 = load i8* %x, align 1
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i8* %y, align 1
+  %3 = add nsw i8 %1, %2
+  %4 = zext i8 %3 to i32
+  ret i32 %4
+}
+
+define i32 @test4() #0 {
+  %x = alloca i16, align 2
+  %y = alloca i16, align 2
+; CHECK: ldr r0, [sp]
+  %1 = load i16* %x, align 2
+; CHECK: ldr r1, [sp, #4]
+  %2 = load i16* %y, align 2
+  %3 = add nsw i16 %1, %2
+  %4 = zext i16 %3 to i32
+  ret i32 %4
+}
+
+; Don't widen if the value needs to be zero-extended
+define zeroext i8 @test5() {
+  %x = alloca i8, align 1
+; CHECK: mov r0, sp
+; CHECK: ldrb r0, [r0]
+  %1 = load i8* %x, align 1
+  ret i8 %1
+}
+
+define zeroext i16 @test6() {
+  %x = alloca i16, align 2
+; CHECK: mov r0, sp
+; CHECK: ldrh r0, [r0]
+  %1 = load i16* %x, align 2
+  ret i16 %1
+}
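
Note (not part of the patch): AddrModeT1_s is the Thumb1 SP-relative addressing mode used by tLDRspi/tSTRspi and tADDrSPi. Its offset field is an unsigned 8-bit immediate scaled by 4, so the only encodable offsets are 0, 4, 8, ..., 1020. That is what the NumBits 5 -> 8 change (with Scale = 4) reflects, and it is why the ARMISelDAGToDAG.cpp hunks bump frame-object alignment to 4: an object with smaller alignment could land at an offset that is not a multiple of 4 and therefore cannot be encoded. A minimal C++ sketch of the resulting legality rule, under that assumption (isLegalT1SpOffset is a hypothetical helper, not an LLVM API):

  #include <cstdint>

  // Legal Thumb1 SP-relative offsets: a non-negative multiple of 4 whose
  // scaled value fits in an unsigned 8-bit field (NumBits = 8, Scale = 4).
  static bool isLegalT1SpOffset(int64_t Offset) {
    const int64_t Scale = 4;
    const int64_t NumBits = 8;
    if (Offset < 0 || Offset % Scale != 0)
      return false;                            // must be a multiple of 4
    return (Offset / Scale) < (1 << NumBits);  // 0..255 once scaled, i.e. <= 1020
  }

The 1020 ceiling is also why test2 above needs a second ADD for its 1024-byte offset: 1020 is the largest immediate a single SP-relative ADD can encode.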