This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AVR/
-
Target/
-
AVR/
-
AVRISelLowering.cpp
-
test/CodeGen/AVR/
-
CodeGen/
-
AVR/
1/1
rot.ll
-
shift.ll

Differential D86418

[AVR] Improve inline rotate/shift expansions
ClosedPublic

Authored by aykevl on Aug 23 2020, 5:27 AM.

Download Raw Diff

Details

Reviewers

dylanmckay

Commits

rGe03ba2198dbb: [AVR] Improve inline rotate/shift expansions

Summary

These expansions were rather inefficient and were done with more code
than necessary. This change optimizes them to use expansions more
similar to GCC. The code size is the same (when optimizing for code
size) but somehow LLVM reorders blocks in a non-optimal way. Still, this
should be an improvement with a reduction in code size of around 0.12%
(when building compiler-rt).

I made this patch to get more familiar with these inline expansions, in the hope that I can also do the other expansions inline (such as 32-bit shifts).

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

aykevl created this revision. · Aug 23 2020, 5:27 AM

Herald added subscribers: llvm-commits, Jim, hiraditya. · View Herald Transcript · Aug 23 2020, 5:27 AM

aykevl requested review of this revision. · Aug 23 2020, 5:27 AM

Harbormaster completed remote builds in B69251: Diff 287251. · Aug 23 2020, 5:57 AM

I have tested this locally with my compiler-rt test setup. All tests still pass with this patch applied while the binary size is slightly reduced (when optimizing for size).

llvm/test/CodeGen/AVR/rot.ll
16	Note: this doesn't decrease binary size because LLVM duplicates the loop check (`dec` and `br`). When optimizing for size, this would be reduced by one instruction, such as in `shift_i8_i8_size` of shift.ll.

Great patch

I made this patch to get more familiar with these inline expansions, in the hope that I can also do the other expansions inline (such as 32-bit shifts).

That would be nice!

This revision is now accepted and ready to land. · Aug 26 2020, 8:59 AM

This revision was landed with ongoing or failed builds. · Oct 31 2020, 3:16 PM

Closed by commit rGe03ba2198dbb: [AVR] Improve inline rotate/shift expansions (authored by aykevl). · Explain Why

This revision was automatically updated to reflect the committed changes.

aykevl added a commit: rGe03ba2198dbb: [AVR] Improve inline rotate/shift expansions.

In D86418#2239289, @dylanmckay wrote:

Great patch

I made this patch to get more familiar with these inline expansions, in the hope that I can also do the other expansions inline (such as 32-bit shifts).

That would be nice!

I have tried several things and thought a lot about this, but I can't come up with a good way to handle this. The only thing I can think of is to introduce new pseudo 32-bit and 64-bit shift instructions, but that just seems really ugly.
The problem is that I see no way of converting these larger-than-16-bit integers to 16-bit (or 8-bit) open-coded instruction sequences before type legalization. Reading https://llvm.org/docs/CodeGenerator.html, it appears that these larger-than-16-bit integers are all converted to builtin calls before any custom lowering can be done.

I suspect the real solution is to switch to GlobalISel, which apparently is much more flexible and should allow inserting a custom pass before type legalization.

Revision Contents

Path

Size

llvm/

lib/

Target/

AVR/

AVRISelLowering.cpp

55 lines

test/

CodeGen/

AVR/

rot.ll

16 lines

shift.ll

44 lines

Diff 302113

llvm/lib/Target/AVR/AVRISelLowering.cpp

Show First 20 Lines • Show All 1,452 Lines • ▼ Show 20 Lines	MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
const BasicBlock *LLVM_BB = BB->getBasicBlock();		const BasicBlock *LLVM_BB = BB->getBasicBlock();

MachineFunction::iterator I;		MachineFunction::iterator I;
for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I);		for (I = BB->getIterator(); I != F->end() && &(*I) != BB; ++I);
if (I != F->end()) ++I;		if (I != F->end()) ++I;

// Create loop block.		// Create loop block.
MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);		MachineBasicBlock *LoopBB = F->CreateMachineBasicBlock(LLVM_BB);
		MachineBasicBlock *CheckBB = F->CreateMachineBasicBlock(LLVM_BB);
MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);		MachineBasicBlock *RemBB = F->CreateMachineBasicBlock(LLVM_BB);

F->insert(I, LoopBB);		F->insert(I, LoopBB);
		F->insert(I, CheckBB);
F->insert(I, RemBB);		F->insert(I, RemBB);

// Update machine-CFG edges by transferring all successors of the current		// Update machine-CFG edges by transferring all successors of the current
// block to the block containing instructions after shift.		// block to the block containing instructions after shift.
RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),		RemBB->splice(RemBB->begin(), BB, std::next(MachineBasicBlock::iterator(MI)),
BB->end());		BB->end());
RemBB->transferSuccessorsAndUpdatePHIs(BB);		RemBB->transferSuccessorsAndUpdatePHIs(BB);

// Add adges BB => LoopBB => RemBB, BB => RemBB, LoopBB => LoopBB.		// Add edges BB => LoopBB => CheckBB => RemBB, CheckBB => LoopBB.
BB->addSuccessor(LoopBB);		BB->addSuccessor(CheckBB);
BB->addSuccessor(RemBB);		LoopBB->addSuccessor(CheckBB);
LoopBB->addSuccessor(RemBB);		CheckBB->addSuccessor(LoopBB);
LoopBB->addSuccessor(LoopBB);		CheckBB->addSuccessor(RemBB);

Register ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);		Register ShiftAmtReg = RI.createVirtualRegister(&AVR::GPR8RegClass);
Register ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);		Register ShiftAmtReg2 = RI.createVirtualRegister(&AVR::GPR8RegClass);
Register ShiftReg = RI.createVirtualRegister(RC);		Register ShiftReg = RI.createVirtualRegister(RC);
Register ShiftReg2 = RI.createVirtualRegister(RC);		Register ShiftReg2 = RI.createVirtualRegister(RC);
Register ShiftAmtSrcReg = MI.getOperand(2).getReg();		Register ShiftAmtSrcReg = MI.getOperand(2).getReg();
Register SrcReg = MI.getOperand(1).getReg();		Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();		Register DstReg = MI.getOperand(0).getReg();

// BB:		// BB:
// cpi N, 0		// rjmp CheckBB
// breq RemBB		BuildMI(BB, dl, TII.get(AVR::RJMPk)).addMBB(CheckBB);
BuildMI(BB, dl, TII.get(AVR::CPIRdK)).addReg(ShiftAmtSrcReg).addImm(0);
BuildMI(BB, dl, TII.get(AVR::BREQk)).addMBB(RemBB);

// LoopBB:		// LoopBB:
		// ShiftReg2 = shift ShiftReg
		auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
		if (HasRepeatedOperand)
		ShiftMI.addReg(ShiftReg);

		// CheckBB:
// ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]		// ShiftReg = phi [%SrcReg, BB], [%ShiftReg2, LoopBB]
// ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]		// ShiftAmt = phi [%N, BB], [%ShiftAmt2, LoopBB]
// ShiftReg2 = shift ShiftReg		// DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
// ShiftAmt2 = ShiftAmt - 1;		// ShiftAmt2 = ShiftAmt - 1;
BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftReg)		// if (ShiftAmt2 >= 0) goto LoopBB;
		BuildMI(CheckBB, dl, TII.get(AVR::PHI), ShiftReg)
.addReg(SrcReg)		.addReg(SrcReg)
.addMBB(BB)		.addMBB(BB)
.addReg(ShiftReg2)		.addReg(ShiftReg2)
.addMBB(LoopBB);		.addMBB(LoopBB);
BuildMI(LoopBB, dl, TII.get(AVR::PHI), ShiftAmtReg)		BuildMI(CheckBB, dl, TII.get(AVR::PHI), ShiftAmtReg)
.addReg(ShiftAmtSrcReg)		.addReg(ShiftAmtSrcReg)
.addMBB(BB)		.addMBB(BB)
.addReg(ShiftAmtReg2)		.addReg(ShiftAmtReg2)
.addMBB(LoopBB);		.addMBB(LoopBB);
		BuildMI(CheckBB, dl, TII.get(AVR::PHI), DstReg)
auto ShiftMI = BuildMI(LoopBB, dl, TII.get(Opc), ShiftReg2).addReg(ShiftReg);
if (HasRepeatedOperand)
ShiftMI.addReg(ShiftReg);

BuildMI(LoopBB, dl, TII.get(AVR::SUBIRdK), ShiftAmtReg2)
.addReg(ShiftAmtReg)
.addImm(1);
BuildMI(LoopBB, dl, TII.get(AVR::BRNEk)).addMBB(LoopBB);

// RemBB:
// DestReg = phi [%SrcReg, BB], [%ShiftReg, LoopBB]
BuildMI(*RemBB, RemBB->begin(), dl, TII.get(AVR::PHI), DstReg)
.addReg(SrcReg)		.addReg(SrcReg)
.addMBB(BB)		.addMBB(BB)
.addReg(ShiftReg2)		.addReg(ShiftReg2)
.addMBB(LoopBB);		.addMBB(LoopBB);

		BuildMI(CheckBB, dl, TII.get(AVR::DECRd), ShiftAmtReg2)
		.addReg(ShiftAmtReg);
		BuildMI(CheckBB, dl, TII.get(AVR::BRPLk)).addMBB(LoopBB);

MI.eraseFromParent(); // The pseudo instruction is gone now.		MI.eraseFromParent(); // The pseudo instruction is gone now.
return RemBB;		return RemBB;
}		}

static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {		static bool isCopyMulResult(MachineBasicBlock::iterator const &I) {
if (I->getOpcode() == AVR::COPY) {		if (I->getOpcode() == AVR::COPY) {
Register SrcReg = I->getOperand(1).getReg();		Register SrcReg = I->getOperand(1).getReg();
return (SrcReg == AVR::R0 \|\| SrcReg == AVR::R1);		return (SrcReg == AVR::R0 \|\| SrcReg == AVR::R1);
▲ Show 20 Lines • Show All 476 Lines • Show Last 20 Lines

llvm/test/CodeGen/AVR/rot.ll

	; RUN: llc < %s -march=avr \| FileCheck %s			; RUN: llc < %s -march=avr \| FileCheck %s

	; Bit rotation tests.			; Bit rotation tests.

	; CHECK-LABEL: rol8:			; CHECK-LABEL: rol8:
	define i8 @rol8(i8 %val, i8 %amt) {			define i8 @rol8(i8 %val, i8 %amt) {
	; CHECK: andi r22, 7			; CHECK: andi r22, 7

	; CHECK-NEXT: cpi r22, 0			; CHECK-NEXT: dec r22
	; CHECK-NEXT: breq .LBB0_2			; CHECK-NEXT: brmi .LBB0_2

	; CHECK-NEXT: .LBB0_1:			; CHECK-NEXT: .LBB0_1:
	; CHECK-NEXT: lsl r24			; CHECK-NEXT: lsl r24
	; CHECK-NEXT: adc r24, r1			; CHECK-NEXT: adc r24, r1
	; CHECK-NEXT: subi r22, 1			; CHECK-NEXT: dec r22
	; CHECK-NEXT: brne .LBB0_1			; CHECK-NEXT: brpl .LBB0_1
				[Inline comment — aykevl (author), marked Done] Note: this doesn't decrease binary size because LLVM duplicates the loop check (`dec` and `br`). When optimizing for size, this would be reduced by one instruction, such as in `shift_i8_i8_size` of shift.ll.

	; CHECK-NEXT: .LBB0_2:			; CHECK-NEXT: .LBB0_2:
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%mod = urem i8 %amt, 8			%mod = urem i8 %amt, 8

	%inv = sub i8 8, %mod			%inv = sub i8 8, %mod
	%parta = shl i8 %val, %mod			%parta = shl i8 %val, %mod
	%partb = lshr i8 %val, %inv			%partb = lshr i8 %val, %inv

	%rotl = or i8 %parta, %partb			%rotl = or i8 %parta, %partb

	ret i8 %rotl			ret i8 %rotl
	}			}


	; CHECK-LABEL: ror8:			; CHECK-LABEL: ror8:
	define i8 @ror8(i8 %val, i8 %amt) {			define i8 @ror8(i8 %val, i8 %amt) {
	; CHECK: andi r22, 7			; CHECK: andi r22, 7

	; CHECK-NEXT: cpi r22, 0			; CHECK-NEXT: dec r22
	; CHECK-NEXT: breq .LBB1_2			; CHECK-NEXT: brmi .LBB1_2

	; CHECK-NEXT: .LBB1_1:			; CHECK-NEXT: .LBB1_1:
	; CHECK-NEXT: lsr r24			; CHECK-NEXT: lsr r24
	; CHECK-NEXT: ldi r0, 0			; CHECK-NEXT: ldi r0, 0
	; CHECK-NEXT: ror r0			; CHECK-NEXT: ror r0
	; CHECK-NEXT: or r24, r0			; CHECK-NEXT: or r24, r0
	; CHECK-NEXT: subi r22, 1			; CHECK-NEXT: dec r22
	; CHECK-NEXT: brne .LBB1_1			; CHECK-NEXT: brpl .LBB1_1

	; CHECK-NEXT: .LBB1_2:			; CHECK-NEXT: .LBB1_2:
	; CHECK-NEXT: ret			; CHECK-NEXT: ret
	%mod = urem i8 %amt, 8			%mod = urem i8 %amt, 8

	%inv = sub i8 8, %mod			%inv = sub i8 8, %mod
	%parta = lshr i8 %val, %mod			%parta = lshr i8 %val, %mod
	%partb = shl i8 %val, %inv			%partb = shl i8 %val, %inv

	%rotr = or i8 %parta, %partb			%rotr = or i8 %parta, %partb

	ret i8 %rotr			ret i8 %rotr
	}			}

llvm/test/CodeGen/AVR/shift.ll

	; RUN: llc < %s -march=avr \| FileCheck %s			; RUN: llc < %s -march=avr \| FileCheck %s

				; Optimize for speed.
				; CHECK-LABEL: shift_i8_i8_speed
				define i8 @shift_i8_i8_speed(i8 %a, i8 %b) {
				; CHECK: dec r22
				; CHECK-NEXT: brmi .LBB0_2
				; CHECK-NEXT: .LBB0_1:
				; CHECK-NEXT: lsl r24
				; CHECK-NEXT: dec r22
				; CHECK-NEXT: brpl .LBB0_1
				; CHECK-NEXT: .LBB0_2:
				; CHECK-NEXT: ret
				%result = shl i8 %a, %b
				ret i8 %result
				}

				; Optimize for size (producing slightly smaller code).
				; CHECK-LABEL: shift_i8_i8_size
				define i8 @shift_i8_i8_size(i8 %a, i8 %b) optsize {
				; CHECK: .LBB1_1:
				; CHECK-NEXT: dec r22
				; CHECK-NEXT: brmi .LBB1_3
				; CHECK: lsl r24
				; CHECK-NEXT: rjmp .LBB1_1
				; CHECK-NEXT: .LBB1_3:
				; CHECK-NEXT: ret
				%result = shl i8 %a, %b
				ret i8 %result
				}

				; CHECK-LABEL: shift_i16_i16
				define i16 @shift_i16_i16(i16 %a, i16 %b) {
				; CHECK: dec r22
				; CHECK-NEXT: brmi .LBB2_2
				; CHECK-NEXT: .LBB2_1:
				; CHECK-NEXT: lsl r24
				; CHECK-NEXT: rol r25
				; CHECK-NEXT: dec r22
				; CHECK-NEXT: brpl .LBB2_1
				; CHECK-NEXT: .LBB2_2:
				; CHECK-NEXT: ret
				%result = shl i16 %a, %b
				ret i16 %result
				}

	; CHECK-LABEL: shift_i64_i64			; CHECK-LABEL: shift_i64_i64
	define i64 @shift_i64_i64(i64 %a, i64 %b) {			define i64 @shift_i64_i64(i64 %a, i64 %b) {
	; CHECK: call __ashldi3			; CHECK: call __ashldi3
	%result = shl i64 %a, %b			%result = shl i64 %a, %b
	ret i64 %result			ret i64 %result
	}			}