This is an archive of the discontinued LLVM Phabricator instance.

[Thumb1] Improve base + offset materialization in the load/store optimizer
ClosedPublic

Authored by mroth on Aug 21 2014, 6:44 AM.

Download Raw Diff

Details

Reviewers

Commits

rGdfdda0d41cf0: Thumb1 load/store optimizer: Improve code to materialize new base register.
rL216193: Thumb1 load/store optimizer: Improve code to materialize new base register.

Summary

This patch improves the instruction sequence the load/store optimizer emits to materialize a new base register with offset applied.

If we have a chain of loads/stores like this:

ldr r0, [r5, #4]
ldr r1, [r5, #8]
ldr r2, [r5, #12]

The pass currently always uses a MOV followed by an 8-bit immediate add (tADDi8, whose source and destination registers must be the same) to get a new base:
mov r2, r5
adds r2, #4
ldm r2, {r0, r1, r2}

However, if the immediate fits into 3 bits, as in this case, we can actually generate (with tADDi3):
adds r2, r5, #4
ldm r2, {r0, r1, r2}

I’ve also added a test case for this and made two existing load/store optimizer tests run with -verify-machineinstrs to catch any problems.

Cheers
Moritz

Diff Detail

Repository: rL LLVM

Event Timeline

mroth updated this revision to Diff 12766. Aug 21 2014, 6:44 AM

mroth retitled this revision to [Thumb1] Improve base + offset materialization in the load/store optimizer.

mroth updated this object.

mroth edited the test plan for this revision. (Show Details)

mroth added a reviewer: rengolin.

mroth set the repository for this revision to rL LLVM.

mroth added a subscriber: Unknown Object (MLST).

Hi Moritz,

LGTM, thanks!

--renato

This revision is now accepted and ready to land. Aug 21 2014, 7:18 AM

Closed by commit rL216193 (authored by @mroth).

Thanks! Committed as r216193.

Cheers
Moritz

Revision Contents

Path

Size

llvm/

trunk/

lib/

Target/

ARM/

ARMLoadStoreOptimizer.cpp

18 lines

test/

CodeGen/

Thumb/

ldm-stm-base-materialization.ll

29 lines

thumb-ldm.ll

2 lines

thumb-memcpy-ldm-stm.ll

2 lines

Diff 12787

llvm/trunk/lib/Target/ARM/ARMLoadStoreOptimizer.cpp

Show First 20 Lines • Show All 354 Lines • ▼ Show 20 Lines	if (isi32Load(Opcode)) {
// Use the scratch register to use as a new base.		// Use the scratch register to use as a new base.
NewBase = Scratch;		NewBase = Scratch;
if (NewBase == 0)		if (NewBase == 0)
return false;		return false;
}		}

int BaseOpc =		int BaseOpc =
isThumb2 ? ARM::t2ADDri :		isThumb2 ? ARM::t2ADDri :
		(isThumb1 && Offset < 8) ? ARM::tADDi3 :
isThumb1 ? ARM::tADDi8 : ARM::ADDri;		isThumb1 ? ARM::tADDi8 : ARM::ADDri;

if (Offset < 0) {		if (Offset < 0) {
		Offset = - Offset;
BaseOpc =		BaseOpc =
isThumb2 ? ARM::t2SUBri :		isThumb2 ? ARM::t2SUBri :
		(isThumb1 && Offset < 8) ? ARM::tSUBi3 :
isThumb1 ? ARM::tSUBi8 : ARM::SUBri;		isThumb1 ? ARM::tSUBi8 : ARM::SUBri;
Offset = - Offset;
}		}

if (!TL->isLegalAddImmediate(Offset))		if (!TL->isLegalAddImmediate(Offset))
// FIXME: Try add with register operand?		// FIXME: Try add with register operand?
return false; // Probably not worth it then.		return false; // Probably not worth it then.

if (isThumb1) {		if (isThumb1) {
if (Base != NewBase) {		// Thumb1: depending on immediate size, use either
		// ADD NewBase, Base, #imm3
		// or
		// MOV NewBase, Base
		// ADD NewBase, #imm8.
		if (Base != NewBase && Offset >= 8) {
// Need to insert a MOV to the new base first.		// Need to insert a MOV to the new base first.
// FIXME: If the immediate fits in 3 bits, use ADD instead.
BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)		BuildMI(MBB, MBBI, dl, TII->get(ARM::tMOVr), NewBase)
.addReg(Base, getKillRegState(BaseKill))		.addReg(Base, getKillRegState(BaseKill))
.addImm(Pred).addReg(PredReg);		.addImm(Pred).addReg(PredReg);
		// Set up BaseKill and Base correctly to insert the ADDS/SUBS below.
		Base = NewBase;
		BaseKill = false;
}		}
AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))		AddDefaultT1CC(BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase))
.addReg(NewBase, getKillRegState(true)).addImm(Offset)		.addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
.addImm(Pred).addReg(PredReg);		.addImm(Pred).addReg(PredReg);
} else {		} else {
BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)		BuildMI(MBB, MBBI, dl, TII->get(BaseOpc), NewBase)
.addReg(Base, getKillRegState(BaseKill)).addImm(Offset)		.addReg(Base, getKillRegState(BaseKill)).addImm(Offset)
.addImm(Pred).addReg(PredReg).addReg(0);		.addImm(Pred).addReg(PredReg).addReg(0);
}		}

Base = NewBase;		Base = NewBase;
BaseKill = true; // New base is always killed straight away.		BaseKill = true; // New base is always killed straight away.
}		}

bool isDef = (isi32Load(Opcode) \|\| Opcode == ARM::VLDRS \|\|		bool isDef = (isi32Load(Opcode) \|\| Opcode == ARM::VLDRS \|\|
Opcode == ARM::VLDRD);		Opcode == ARM::VLDRD);

// Get LS multiple opcode. Note that for Thumb1 this might be an opcode with		// Get LS multiple opcode. Note that for Thumb1 this might be an opcode with
▲ Show 20 Lines • Show All 1,723 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/Thumb/ldm-stm-base-materialization.ll

				; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - \| FileCheck %s
				target datalayout = "e-m:e-p:32:32-i1:8:32-i8:8:32-i16:16:32-i64:64-v128:64:128-a:0:32-n32-S64"
				target triple = "thumbv6m-none--eabi"

				@a = external global i32*
				@b = external global i32*

				; Function Attrs: nounwind
				define void @foo() #0 {
				entry:
				; CHECK-LABEL: foo:
				; CHECK: ldr r[[SB:[0-9]]], .LCPI
				; CHECK: ldr r[[LB:[0-9]]], .LCPI
				; CHECK: adds r[[NLB:[0-9]]], r[[LB]], #4
				; CHECK-NEXT: ldm r[[NLB]],
				; CHECK: adds r[[NSB:[0-9]]], r[[SB]], #4
				; CHECK-NEXT: stm r[[NSB]]
				%0 = load i32** @a, align 4
				%arrayidx = getelementptr inbounds i32* %0, i32 1
				%1 = bitcast i32* %arrayidx to i8*
				%2 = load i32** @b, align 4
				%arrayidx1 = getelementptr inbounds i32* %2, i32 1
				%3 = bitcast i32* %arrayidx1 to i8*
				tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %1, i8* %3, i32 24, i32 4, i1 false)
				ret void
				}

				; Function Attrs: nounwind
				declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i32, i1) #1

llvm/trunk/test/CodeGen/Thumb/thumb-ldm.ll

	; RUN: llc < %s -mtriple=thumbv6m-eabi -o - \| FileCheck %s			; RUN: llc < %s -mtriple=thumbv6m-eabi -verify-machineinstrs -o - \| FileCheck %s

	@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]			@X = external global [0 x i32] ; <[0 x i32]*> [#uses=5]

	define i32 @t1() {			define i32 @t1() {
	; CHECK-LABEL: t1:			; CHECK-LABEL: t1:
	; CHECK: push {r7, lr}			; CHECK: push {r7, lr}
	; CHECK: ldm			; CHECK: ldm
	; CHECK: pop {r7, pc}			; CHECK: pop {r7, pc}
	Show All 33 Lines

llvm/trunk/test/CodeGen/Thumb/thumb-memcpy-ldm-stm.ll

	; RUN: llc -mtriple=thumbv6m-eabi %s -o - \| FileCheck %s			; RUN: llc -mtriple=thumbv6m-eabi -verify-machineinstrs %s -o - \| FileCheck %s

	@d = external global [64 x i32]			@d = external global [64 x i32]
	@s = external global [64 x i32]			@s = external global [64 x i32]

	; Function Attrs: nounwind			; Function Attrs: nounwind
	define void @t1() #0 {			define void @t1() #0 {
	entry:			entry:
	; CHECK-LABEL: t1			; CHECK-LABEL: t1
	Show All 18 Lines