This is an archive of the discontinued LLVM Phabricator instance.

[LSR] Improved code generation for Zero Compare loops
AbandonedPublic

Authored by joanlluch on Jun 23 2019, 10:22 AM.

Details

Summary

Improves loop code generation. All targets are affected but most benefits are obtained for X86. Creates shorter code in a number of cases by allowing the Strength Reduce algorithm to consider both the direct and swapped forms of zero compare instructions, which enhances the opportunities to obtain an overall better LSR solution. Given equal LSR solution cost, the patch also honours the direction of the loop induction variable specified in the user source code, which in practice also tends to result in a better solution.

The patch broke a number of regression tests due to inherent test fragility, not because of intended test failures. I fixed the CodeGen tests for the ARM and X86 architectures.

An example of code improved by this patch:

int func(void);
void func2(void);

void LSRTest(int count)
{
  count += func();
  for ( ; count != 20; ++count ) {
    func2();
  }
}

Before:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTest
_LSRTest:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	pushl	%eax
	.cfi_offset %esi, -12
	calll	_func
	addl	8(%ebp), %eax
	pushl	$20
	popl	%esi
	subl	%eax, %esi
	jmp	LBB0_1
LBB0_2:
	calll	_func2
	decl	%esi
LBB0_1:
	testl	%esi, %esi
	jne	LBB0_2
	addl	$4, %esp
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc

After:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTest
_LSRTest:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%esi
	pushl	%eax
	.cfi_offset %esi, -12
	movl	8(%ebp), %esi
	calll	_func
	leal	-20(%eax,%esi), %esi
	jmp	LBB0_1
LBB0_2:
	calll	_func2
	incl	%esi
LBB0_1:
	testl	%esi, %esi
	jne	LBB0_2
	addl	$4, %esp
	popl	%esi
	popl	%ebp
	retl
	.cfi_endproc

Diff Detail

Event Timeline

joanlluch created this revision.Jun 23 2019, 10:22 AM
jsji added a subscriber: shchenz.Jun 23 2019, 3:56 PM
joanlluch changed the repository for this revision from rL LLVM to rG LLVM Github Monorepo.Jun 25 2019, 12:30 AM
shchenz requested changes to this revision.Jun 25 2019, 7:48 PM

This patch causes a lot of cases fail except platform X86 and ARM.

And there is a compiling time crash:

: 'RUN: at line 6';   /home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc -switch-peel-threshold=101 < /home/czhengsz/llvm_new/llvm-project/llvm/test/CodeGen/SystemZ/loop-03.ll -mtriple=s390x-linux-gnu -mcpu=z13 | /home/czhengsz/llvm_new/llvm-project/llvm/build/bin/FileCheck /home/czhengsz/llvm_new/llvm-project/llvm/test/CodeGen/SystemZ/loop-03.ll
--
Exit Code: 2

Command Output (stderr):
--
llc: /home/czhengsz/llvm_new/llvm-project/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp:3350: void {anonymous}::LSRInstance::InsertSupplementalFormula(const llvm::SCEV*, {anonymous}::LSRUse&, size_t): Assertion `Inserted && "Supplemental formula already exists!"' failed.
Stack dump:
0.      Program arguments: /home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc -switch-peel-threshold=101 -mtriple=s390x-linux-gnu -mcpu=z13 
1.      Running pass 'Function Pass Manager' on module '<stdin>'.
2.      Running pass 'Loop Pass Manager' on function '@fun0'
3.      Running pass 'Loop Strength Reduction' on basic block '%23'
 #0 0x00000000127b8768 llvm::sys::PrintStackTrace(llvm::raw_ostream&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x127b8768)
 #1 0x00000000127b8890 PrintStackTraceSignalHandler(void*) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x127b8890)
 #2 0x00000000127b6324 llvm::sys::RunSignalHandlers() (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x127b6324)
 #3 0x00000000127b6504 SignalHandler(int) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x127b6504)
 #4 0x00003fffb6470478 (linux-vdso64.so.1+0x478)
 #5 0x00003fffb5d0e100 raise (/opt/at12.0/lib64/power8/libc.so.6+0x4e100)
 #6 0x00003fffb5ce4598 abort (/opt/at12.0/lib64/power8/libc.so.6+0x24598)
 #7 0x00003fffb5cfb624 (/opt/at12.0/lib64/power8/libc.so.6+0x3b624)
 #8 0x00003fffb5cfb6c4 __assert_fail (/opt/at12.0/lib64/power8/libc.so.6+0x3b6c4)
 #9 0x000000001220151c (anonymous namespace)::LSRInstance::InsertSupplementalFormula(llvm::SCEV const*, (anonymous namespace)::LSRUse&, unsigned long) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x1220151c)
#10 0x0000000012209050 (anonymous namespace)::LSRInstance::CollectFixupsAndInitialFormulae() (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x12209050)
#11 0x0000000012212908 ReduceLoopStrength(llvm::Loop*, llvm::IVUsers&, llvm::ScalarEvolution&, llvm::DominatorTree&, llvm::LoopInfo&, llvm::TargetTransformInfo const&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x12212908)
#12 0x00000000117e5938 llvm::LPPassManager::runOnFunction(llvm::Function&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x117e5938)
#13 0x00000000120240b0 llvm::FPPassManager::runOnFunction(llvm::Function&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x120240b0)
#14 0x00000000120242f8 llvm::FPPassManager::runOnModule(llvm::Module&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x120242f8)
#15 0x00000000120230e0 llvm::legacy::PassManagerImpl::run(llvm::Module&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x120230e0)
#16 0x000000001202350c llvm::legacy::PassManager::run(llvm::Module&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x1202350c)
#17 0x00000000103b585c compileModule(char**, llvm::LLVMContext&) (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x103b585c)
#18 0x0000000010318678 main (/home/czhengsz/llvm_new/llvm-project/llvm/build/bin/llc+0x10318678)
#19 0x00003fffb5ce4bf8 (/opt/at12.0/lib64/power8/libc.so.6+0x24bf8)
#20 0x00003fffb5ce4e04 __libc_start_main (/opt/at12.0/lib64/power8/libc.so.6+0x24e04)
FileCheck error: '-' is empty.
FileCheck command line:  /home/czhengsz/llvm_new/llvm-project/llvm/build/bin/FileCheck /home/czhengsz/llvm_new/llvm-project/llvm/test/CodeGen/SystemZ/loop-03.ll

I think these failures should be fixed firstly.

One opinion about this patch's example, seems you found the improvement for X86, because incl loop(negative loop count) on X86 can use the instruction movl+ leal, but decl loop(positive loop count) uses addl + pushl + popl + subl.

As I know, PowerPC has hardware loop, it needs the loop count must be a positive value. This patch changes all loop to a incl loop with negative loop count which is surely a deg for PowerPC. So maybe you need to treat this improvement for specific platform by using some target hook function?

This revision now requires changes to proceed.Jun 25 2019, 7:48 PM

Fixes reported assertion crash on SystemZ

Hi shchenz,

Thanks for taking the time for reviewing this. I solved your reported assertion crash on SystemZ, and it is now fixed.

I stated on the description comment that I fixed codegen tests for X86 and ARM, but I'm waiting feedback for other platforms because I'm not that versed on them. It would just be very easy for me to just run "update_llc_test_checks.py" and forget, but I do not think that this is what the community would want to see. On the other hand, I understand that you can help with the PowerPC platform, so maybe you can look at that.

This patch improves code generation not because it choses a particular induction variable direction (increment or decrement) but because it choses the less costly or most natural one. Following there are a couple of examples that show improvements for both positive and negative increments:

void LSRTestA( int a, unsigned ammount )
{
  ammount += 8;
  while ( ammount-- )
    bar(a);
}

void LSRTestB( int a, unsigned ammount )
{
  ammount += 8;
  for ( ; ammount != 0 ; ammount++ )
    bar(a);
}

Before:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTestA
_LSRTestA:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%edi
	pushl	%esi
	subl	$16, %esp
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	8(%ebp), %esi
	pushl	$-8
	popl	%edi
	subl	12(%ebp), %edi
	jmp	LBB0_1
LBB0_2:
	movl	%esi, (%esp)
	calll	_bar
	incl	%edi
LBB0_1:
	testl	%edi, %edi
	jne	LBB0_2
	addl	$16, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	retl
	.cfi_endproc

	.globl	_LSRTestB
_LSRTestB:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%edi
	pushl	%esi
	subl	$16, %esp
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	8(%ebp), %esi
	pushl	$-8
	popl	%edi
	subl	12(%ebp), %edi
	jmp	LBB1_1
LBB1_2:
	movl	%esi, (%esp)
	calll	_bar
	decl	%edi
LBB1_1:
	testl	%edi, %edi
	jne	LBB1_2
	addl	$16, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	retl
	.cfi_endproc

After:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTestA
_LSRTestA:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%edi
	pushl	%esi
	subl	$16, %esp
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	8(%ebp), %esi
	movl	12(%ebp), %edi
	addl	$8, %edi
	jmp	LBB0_1
LBB0_2:
	decl	%edi
	movl	%esi, (%esp)
	calll	_bar
LBB0_1:
	testl	%edi, %edi
	jne	LBB0_2
	addl	$16, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	retl
	.cfi_endproc

	.globl	_LSRTestB
_LSRTestB:
	.cfi_startproc
	pushl	%ebp
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp
	.cfi_def_cfa_register %ebp
	pushl	%edi
	pushl	%esi
	subl	$16, %esp
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	8(%ebp), %esi
	movl	12(%ebp), %edi
	addl	$8, %edi
	jmp	LBB1_1
LBB1_2:
	movl	%esi, (%esp)
	calll	_bar
	incl	%edi
LBB1_1:
	testl	%edi, %edi
	jne	LBB1_2
	addl	$16, %esp
	popl	%esi
	popl	%edi
	popl	%ebp
	retl
	.cfi_endproc

On these examples, a pair of potentially expensive push, pop instructions are replaced by a mov.

About PowerPC, I'm not that versed on it. If it needs the loop count to be always positive to take advantage of hardware, then I suggest you to propose a patch for that. Maybe we can work together if you agree. My current proposal does not particularly favour any count direction, but given equal cost it tends to honour the one specified on the source code. So at the end of the day, I think that even for PowerPC it should still result in overall improvements because most loops are positive counting anyway. Please correct me if I am wrong.

I'm not sure why your examples have push instructions to load an immediate into registers unless you're compiling with -Oz? And if that's the case then changing push/pop to mov is an increase in code size.

joanlluch added a comment.EditedJun 26 2019, 2:07 AM

Hi Craig, thanks for commenting.
Yes, I was actually compiling for -Oz, but the differences when using -Os are even bigger. Let me try to explain every case.

  • For Oz, the compiler indeed generates expensive push/pop instructions as an attempt to reduce code size. However, after the patch is applied these instructions are removed without any code size increase. I have checked that in several scenarios and the result is either the same size or less size. This is because the patch reduces the overall number of instructions. On the examples above, the resulting code size is identical, this is because the sequence:
	pushl	$-8                     ## encoding: [0x6a,0xf8]
	popl	%edi                    ## encoding: [0x5f]
	subl	12(%ebp), %edi          ## encoding: [0x2b,0x7d,0x0c]

is replaced by:

	movl	12(%ebp), %edi          ## encoding: [0x8b,0x7d,0x0c]
	addl	$8, %edi                ## encoding: [0x83,0xc7,0x08]

It's a total of 6 machine code bytes in both cases, but one less instruction after the patch.
This is just an example, there are many cases where both the code size and number of instructions are reduced.

  • Now, if we compile the same with the -Os flag we get even further improvements. this is the resulting code before and after for the same source code:

Before:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTestA               ## -- Begin function LSRTestA
_LSRTestA:                              ## @LSRTestA
	.cfi_startproc
## %bb.0:                               ## %entry
	pushl	%ebp                    ## encoding: [0x55]
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
	.cfi_def_cfa_register %ebp
	pushl	%edi                    ## encoding: [0x57]
	pushl	%esi                    ## encoding: [0x56]
	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	12(%ebp), %eax          ## encoding: [0x8b,0x45,0x0c]
	cmpl	$-8, %eax               ## encoding: [0x83,0xf8,0xf8]
	je	LBB0_3                  ## encoding: [0x74,A]
                                        ##   fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
## %bb.1:                               ## %while.body.preheader
	movl	8(%ebp), %esi           ## encoding: [0x8b,0x75,0x08]
	movl	$-8, %edi               ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
	subl	%eax, %edi              ## encoding: [0x29,0xc7]
LBB0_2:                                 ## %while.body
                                        ## =>This Inner Loop Header: Depth=1
	movl	%esi, (%esp)            ## encoding: [0x89,0x34,0x24]
	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                        ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
	incl	%edi                    ## encoding: [0x47]
	jne	LBB0_2                  ## encoding: [0x75,A]
                                        ##   fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
LBB0_3:                                 ## %while.end
	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
	popl	%esi                    ## encoding: [0x5e]
	popl	%edi                    ## encoding: [0x5f]
	popl	%ebp                    ## encoding: [0x5d]
	retl                            ## encoding: [0xc3]
	.cfi_endproc
                                        ## -- End function
	.globl	_LSRTestB               ## -- Begin function LSRTestB
_LSRTestB:                              ## @LSRTestB
	.cfi_startproc
## %bb.0:                               ## %entry
	pushl	%ebp                    ## encoding: [0x55]
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
	.cfi_def_cfa_register %ebp
	pushl	%edi                    ## encoding: [0x57]
	pushl	%esi                    ## encoding: [0x56]
	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	12(%ebp), %eax          ## encoding: [0x8b,0x45,0x0c]
	cmpl	$-8, %eax               ## encoding: [0x83,0xf8,0xf8]
	je	LBB1_3                  ## encoding: [0x74,A]
                                        ##   fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
## %bb.1:                               ## %for.body.preheader
	movl	8(%ebp), %esi           ## encoding: [0x8b,0x75,0x08]
	movl	$-8, %edi               ## encoding: [0xbf,0xf8,0xff,0xff,0xff]
	subl	%eax, %edi              ## encoding: [0x29,0xc7]
LBB1_2:                                 ## %for.body
                                        ## =>This Inner Loop Header: Depth=1
	movl	%esi, (%esp)            ## encoding: [0x89,0x34,0x24]
	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                        ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
	decl	%edi                    ## encoding: [0x4f]
	jne	LBB1_2                  ## encoding: [0x75,A]
                                        ##   fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
LBB1_3:                                 ## %for.end
	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
	popl	%esi                    ## encoding: [0x5e]
	popl	%edi                    ## encoding: [0x5f]
	popl	%ebp                    ## encoding: [0x5d]
	retl                            ## encoding: [0xc3]
	.cfi_endproc

After:

	.section	__TEXT,__text,regular,pure_instructions
	.macosx_version_min 10, 12
	.globl	_LSRTestA               ## -- Begin function LSRTestA
_LSRTestA:                              ## @LSRTestA
	.cfi_startproc
## %bb.0:                               ## %entry
	pushl	%ebp                    ## encoding: [0x55]
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
	.cfi_def_cfa_register %ebp
	pushl	%edi                    ## encoding: [0x57]
	pushl	%esi                    ## encoding: [0x56]
	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	12(%ebp), %esi          ## encoding: [0x8b,0x75,0x0c]
	addl	$8, %esi                ## encoding: [0x83,0xc6,0x08]
	je	LBB0_3                  ## encoding: [0x74,A]
                                        ##   fixup A - offset: 1, value: LBB0_3-1, kind: FK_PCRel_1
## %bb.1:                               ## %while.body.preheader
	movl	8(%ebp), %edi           ## encoding: [0x8b,0x7d,0x08]
LBB0_2:                                 ## %while.body
                                        ## =>This Inner Loop Header: Depth=1
	movl	%edi, (%esp)            ## encoding: [0x89,0x3c,0x24]
	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                        ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
	decl	%esi                    ## encoding: [0x4e]
	jne	LBB0_2                  ## encoding: [0x75,A]
                                        ##   fixup A - offset: 1, value: LBB0_2-1, kind: FK_PCRel_1
LBB0_3:                                 ## %while.end
	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
	popl	%esi                    ## encoding: [0x5e]
	popl	%edi                    ## encoding: [0x5f]
	popl	%ebp                    ## encoding: [0x5d]
	retl                            ## encoding: [0xc3]
	.cfi_endproc
                                        ## -- End function
	.globl	_LSRTestB               ## -- Begin function LSRTestB
_LSRTestB:                              ## @LSRTestB
	.cfi_startproc
## %bb.0:                               ## %entry
	pushl	%ebp                    ## encoding: [0x55]
	.cfi_def_cfa_offset 8
	.cfi_offset %ebp, -8
	movl	%esp, %ebp              ## encoding: [0x89,0xe5]
	.cfi_def_cfa_register %ebp
	pushl	%edi                    ## encoding: [0x57]
	pushl	%esi                    ## encoding: [0x56]
	subl	$16, %esp               ## encoding: [0x83,0xec,0x10]
	.cfi_offset %esi, -16
	.cfi_offset %edi, -12
	movl	12(%ebp), %esi          ## encoding: [0x8b,0x75,0x0c]
	addl	$8, %esi                ## encoding: [0x83,0xc6,0x08]
	je	LBB1_3                  ## encoding: [0x74,A]
                                        ##   fixup A - offset: 1, value: LBB1_3-1, kind: FK_PCRel_1
## %bb.1:                               ## %for.body.preheader
	movl	8(%ebp), %edi           ## encoding: [0x8b,0x7d,0x08]
LBB1_2:                                 ## %for.body
                                        ## =>This Inner Loop Header: Depth=1
	movl	%edi, (%esp)            ## encoding: [0x89,0x3c,0x24]
	calll	_bar                    ## encoding: [0xe8,A,A,A,A]
                                        ##   fixup A - offset: 1, value: _bar-4, kind: FK_PCRel_4
	incl	%esi                    ## encoding: [0x46]
	jne	LBB1_2                  ## encoding: [0x75,A]
                                        ##   fixup A - offset: 1, value: LBB1_2-1, kind: FK_PCRel_1
LBB1_3:                                 ## %for.end
	addl	$16, %esp               ## encoding: [0x83,0xc4,0x10]
	popl	%esi                    ## encoding: [0x5e]
	popl	%edi                    ## encoding: [0x5f]
	popl	%ebp                    ## encoding: [0x5d]
	retl                            ## encoding: [0xc3]
	.cfi_endproc

So in this case we get an improvement of 2 less instructions per function and 7 less bytes per function.

  • The same sort of improvements are obtained with the -O3 setting and the remaining ones.

Please let me know if you need me to perform some specific test.

Thanks,
[Edited for grammar typos]

@craig.topper , can this be considered ok for the X86?. Thanks.

joanlluch abandoned this revision.Jul 8 2019, 8:24 AM