This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/CodeGen/
-
CodeGen/
1
CodeGenPrepare.cpp
-
test/CodeGen/ARM/
-
CodeGen/
-
ARM/
1
memcpy-no-inline.ll
-
memfunc.ll

Differential D132233

[CGP][ARM] Don't align memcpy args when optimizing for size
Needs ReviewPublic

Authored by dmgreen on Aug 19 2022, 7:38 AM.

Download Raw Diff

Details

Reviewers

samtebbs
john.brawn
SjoerdMeijer

Summary

This was added back in D7908, to align memcpy args. It should be limited when optimizing for size to prevent extra unnecessary padding being added. It currently seems to be used only on ARM.

Diff Detail

Event Timeline

dmgreen created this revision.Aug 19 2022, 7:38 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 19 2022, 7:38 AM

Herald added subscribers: hiraditya, kristof.beyls. · View Herald Transcript

dmgreen requested review of this revision.Aug 19 2022, 7:38 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 19 2022, 7:38 AM

Harbormaster completed remote builds in B182218: Diff 453994.Aug 19 2022, 8:16 AM

SjoerdMeijer added inline comments.Aug 19 2022, 8:21 AM

llvm/test/CodeGen/ARM/memcpy-no-inline.ll
59–85	I understand the logic of this patch, but I am struggling with the tests. In this new test below, it's unclear to me why we shouldn't be generating the libcall. To make the differences clear I am wondering if these tests should test more or if they should be `llc .. | llvm-objdump -d ..` tests so that we can actually see codesize?

I've updated the test with all the output - and just shown the diffs here. With that test, if the array is no longer aligned then a series of load/stores will be needed, because an LDM would have too-high alignment requirements.

Harbormaster completed remote builds in B183153: Diff 455277.Aug 24 2022, 10:32 AM

efriedma added a subscriber: efriedma.Aug 24 2022, 1:43 PM

efriedma added inline comments.

llvm/lib/CodeGen/CodeGenPrepare.cpp
2130	Not sure I understand the placement of this check. Increasing the alignment of an alloca or a MemIntrinsic doesn't directly increase codesize. And increasing the alignment of a global variable only increases codesize if we're forced to insert extra padding.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

CodeGenPrepare.cpp

2 lines

test/

CodeGen/

ARM/

memcpy-no-inline.ll

18 lines

memfunc.ll

39 lines

Diff 455277

llvm/lib/CodeGen/CodeGenPrepare.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 2,121 Lines • ▼ Show 20 Lines	if (CI->isInlineAsm()) {
if (optimizeInlineAsmInst(CI))		if (optimizeInlineAsmInst(CI))
return true;		return true;
}		}

// Align the pointer arguments to this call if the target thinks it's a good		// Align the pointer arguments to this call if the target thinks it's a good
// idea		// idea
unsigned MinSize;		unsigned MinSize;
Align PrefAlign;		Align PrefAlign;
if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {		if (!OptSize && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
		efriedmaUnsubmitted Not Done Reply Inline Actions Not sure I understand the placement of this check. Increasing the alignment of an alloca or a MemIntrinsic doesn't directly increase codesize. And increasing the alignment of a global variable only increases codesize if we're forced to insert extra padding. efriedma: Not sure I understand the placement of this check. Increasing the alignment of an alloca or a…
for (auto &Arg : CI->args()) {		for (auto &Arg : CI->args()) {
// We want to align both objects whose address is used directly and		// We want to align both objects whose address is used directly and
// objects whose address is used in casts and GEPs, though it only makes		// objects whose address is used in casts and GEPs, though it only makes
// sense for GEPs if the offset is a multiple of the desired alignment and		// sense for GEPs if the offset is a multiple of the desired alignment and
// if size - offset meets the size threshold.		// if size - offset meets the size threshold.
if (!Arg->getType()->isPointerTy())		if (!Arg->getType()->isPointerTy())
continue;		continue;
APInt Offset(DL->getIndexSizeInBits(		APInt Offset(DL->getIndexSizeInBits(
▲ Show 20 Lines • Show All 6,286 Lines • Show Last 20 Lines

llvm/test/CodeGen/ARM/memcpy-no-inline.ll

	Show All 11 Lines
	define void @foo() #0 {			define void @foo() #0 {
	; CHECK-LABEL: foo:			; CHECK-LABEL: foo:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r7, lr}			; CHECK-NEXT: .save {r7, lr}
	; CHECK-NEXT: push {r7, lr}			; CHECK-NEXT: push {r7, lr}
	; CHECK-NEXT: .pad #32			; CHECK-NEXT: .pad #32
	; CHECK-NEXT: sub sp, #32			; CHECK-NEXT: sub sp, #32
	; CHECK-NEXT: ldr r1, .LCPI0_0			; CHECK-NEXT: ldr r1, .LCPI0_0
	; CHECK-NEXT: mov r0, sp			; CHECK-NEXT: add.w r0, sp, #1
	; CHECK-NEXT: movs r2, #31			; CHECK-NEXT: movs r2, #31
	; CHECK-NEXT: bl __aeabi_memcpy			; CHECK-NEXT: bl __aeabi_memcpy
	; CHECK-NEXT: add sp, #32			; CHECK-NEXT: add sp, #32
	; CHECK-NEXT: pop {r7, pc}			; CHECK-NEXT: pop {r7, pc}
	; CHECK-NEXT: .p2align 2			; CHECK-NEXT: .p2align 2
	; CHECK-NEXT: @ %bb.1:			; CHECK-NEXT: @ %bb.1:
	; CHECK-NEXT: .LCPI0_0:			; CHECK-NEXT: .LCPI0_0:
	; CHECK-NEXT: .long .L.str			; CHECK-NEXT: .long .L.str
	entry:			entry:
	%mystring = alloca [31 x i8], align 1			%mystring = alloca [31 x i8], align 1
	%0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0			%0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0
	call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %0, i8* align 1 getelementptr inbounds ([31 x i8], [31 x i8]* @.str, i32 0, i32 0), i32 31, i1 false)			call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %0, i8* align 1 getelementptr inbounds ([31 x i8], [31 x i8]* @.str, i32 0, i32 0), i32 31, i1 false)
	ret void			ret void
	}			}

	define void @bar() #0 {			define void @bar() #0 {
	; CHECK-LABEL: bar:			; CHECK-LABEL: bar:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r4, r5, r6, lr}			; CHECK-NEXT: .save {r7, lr}
	; CHECK-NEXT: push {r4, r5, r6, lr}			; CHECK-NEXT: push {r7, lr}
	; CHECK-NEXT: .pad #32			; CHECK-NEXT: .pad #32
	; CHECK-NEXT: sub sp, #32			; CHECK-NEXT: sub sp, #32
	; CHECK-NEXT: ldr r0, .LCPI1_0			; CHECK-NEXT: ldr r1, .LCPI1_0
	; CHECK-NEXT: mov r1, sp			; CHECK-NEXT: add.w r0, sp, #1
	; CHECK-NEXT: ldm r0!, {r2, r3, r4, r5, r6}			; CHECK-NEXT: movs r2, #21
	; CHECK-NEXT: stm r1!, {r2, r3, r4, r5, r6}			; CHECK-NEXT: bl __aeabi_memcpy
	; CHECK-NEXT: ldrb r0, [r0]
	; CHECK-NEXT: strb r0, [r1]
	; CHECK-NEXT: add sp, #32			; CHECK-NEXT: add sp, #32
	; CHECK-NEXT: pop {r4, r5, r6, pc}			; CHECK-NEXT: pop {r7, pc}
	; CHECK-NEXT: .p2align 2			; CHECK-NEXT: .p2align 2
	; CHECK-NEXT: @ %bb.1:			; CHECK-NEXT: @ %bb.1:
	; CHECK-NEXT: .LCPI1_0:			; CHECK-NEXT: .LCPI1_0:
	; CHECK-NEXT: .long .L.str.1			; CHECK-NEXT: .long .L.str.1
	entry:			entry:
	%mystring = alloca [31 x i8], align 1			%mystring = alloca [31 x i8], align 1
	%0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0			%0 = getelementptr inbounds [31 x i8], [31 x i8]* %mystring, i32 0, i32 0
	call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %0, i8* align 1 getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i32 0, i32 0), i32 21, i1 false)			call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 1 %0, i8* align 1 getelementptr inbounds ([21 x i8], [21 x i8]* @.str.1, i32 0, i32 0), i32 21, i1 false)
	ret void			ret void
	}			}

	define void @bar2() #0 {			define void @bar2() #0 {
	; CHECK-LABEL: bar2:			; CHECK-LABEL: bar2:
	; CHECK: @ %bb.0: @ %entry			; CHECK: @ %bb.0: @ %entry
	; CHECK-NEXT: .save {r4, r5, r6, lr}			; CHECK-NEXT: .save {r4, r5, r6, lr}
	; CHECK-NEXT: push {r4, r5, r6, lr}			; CHECK-NEXT: push {r4, r5, r6, lr}
	; CHECK-NEXT: .pad #32			; CHECK-NEXT: .pad #32
	; CHECK-NEXT: sub sp, #32			; CHECK-NEXT: sub sp, #32
	; CHECK-NEXT: ldr r0, .LCPI2_0			; CHECK-NEXT: ldr r0, .LCPI2_0
	; CHECK-NEXT: mov r1, sp			; CHECK-NEXT: mov r1, sp
	; CHECK-NEXT: ldm r0!, {r2, r3, r4, r5, r6}			; CHECK-NEXT: ldm r0!, {r2, r3, r4, r5, r6}
	; CHECK-NEXT: stm r1!, {r2, r3, r4, r5, r6}			; CHECK-NEXT: stm r1!, {r2, r3, r4, r5, r6}
	; CHECK-NEXT: ldrb r0, [r0]			; CHECK-NEXT: ldrb r0, [r0]
	; CHECK-NEXT: strb r0, [r1]			; CHECK-NEXT: strb r0, [r1]
	; CHECK-NEXT: add sp, #32			; CHECK-NEXT: add sp, #32
	; CHECK-NEXT: pop {r4, r5, r6, pc}			; CHECK-NEXT: pop {r4, r5, r6, pc}
	; CHECK-NEXT: .p2align 2			; CHECK-NEXT: .p2align 2
	; CHECK-NEXT: @ %bb.1:			; CHECK-NEXT: @ %bb.1:
	; CHECK-NEXT: .LCPI2_0:			; CHECK-NEXT: .LCPI2_0:
	; CHECK-NEXT: .long .L.str.2			; CHECK-NEXT: .long .L.str.2
	entry:			entry:
	%mystring = alloca [32 x i8], align 4			%mystring = alloca [32 x i8], align 4
	%0 = getelementptr inbounds [32 x i8], [32 x i8]* %mystring, i32 0, i32 0			%0 = getelementptr inbounds [32 x i8], [32 x i8]* %mystring, i32 0, i32 0
	call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 getelementptr inbounds ([21 x i8], [21 x i8]* @.str.2, i32 0, i32 0), i32 21, i1 false)			call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 4 %0, i8* align 4 getelementptr inbounds ([21 x i8], [21 x i8]* @.str.2, i32 0, i32 0), i32 21, i1 false)
	ret void			ret void
	}			}

				SjoerdMeijerUnsubmitted Not Done Reply Inline Actions I understand the logic of this patch, but I am struggling with the tests. In this new test below, it's unclear to me why we shouldn't be generating the libcall. To make the differences clear I am wondering if these tests should test more or if they should be `llc .. \| llvm-objdump -d ..` tests so that we can actually see codesize? SjoerdMeijer: I understand the logic of this patch, but I am struggling with the tests. In this new test…
	declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1			declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture readonly, i32, i1) #1

	attributes #0 = { minsize noinline nounwind optsize }			attributes #0 = { minsize noinline nounwind optsize }

llvm/test/CodeGen/ARM/memfunc.ll

Show First 20 Lines • Show All 371 Lines • ▼ Show 20 Lines	entry:
; CHECK-GNUEABI: bl memset		; CHECK-GNUEABI: bl memset
%arr2 = alloca [13 x i8], align 1		%arr2 = alloca [13 x i8], align 1
%2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 16		%2 = getelementptr inbounds [13 x i8], [13 x i8]* %arr2, i32 0, i32 16
call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i1 false)		call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i1 false)

ret void		ret void
}		}

		; Check that alloca arguments are not aligned when the function is minsize
		define void @fminsize(i8* %dest, i32 %n) "frame-pointer"="all" minsize {
		entry:
		; CHECK-LABEL: fminsize

		; CHECK: {{add(.w)? r., sp, #27\|sub(.w)? r., r(7\|11), #17}}
		; CHECK-IOS: bl _memmove
		; CHECK-DARWIN: bl _memmove
		; CHECK-EABI: bl __aeabi_memmove
		; CHECK-GNUEABI: bl memmove
		%arr0 = alloca [9 x i8], align 1
		%0 = bitcast [9 x i8]* %arr0 to i8*
		call void @llvm.memmove.p0i8.p0i8.i32(i8* %dest, i8* %0, i32 %n, i1 false)

		; CHECK: {{add(.w)? r., sp, #(10\|14)\|sub(.w)? r., r(7\|11), #26}}
		; CHECK-IOS: bl _memcpy
		; CHECK-DARWIN: bl _memcpy
		; CHECK-EABI: bl __aeabi_memcpy
		; CHECK-GNUEABI: bl memcpy
		%arr1 = alloca [9 x i8], align 1
		%1 = bitcast [9 x i8]* %arr1 to i8*
		call void @llvm.memcpy.p0i8.p0i8.i32(i8* %dest, i8* %1, i32 %n, i1 false)

		; CHECK: {{add(.w)? r., sp, #(1\|5)\|sub(.w)? r., r(7\|11), #35}}
		; CHECK-IOS: mov r1, #1
		; CHECK-IOS: bl _memset
		; CHECK-DARWIN: movs r1, #1
		; CHECK-DARWIN: bl _memset
		; CHECK-EABI: mov r2, #1
		; CHECK-EABI: bl __aeabi_memset
		; CHECK-GNUEABI: mov r1, #1
		; CHECK-GNUEABI: bl memset
		%arr2 = alloca [9 x i8], align 1
		%2 = bitcast [9 x i8]* %arr2 to i8*
		call void @llvm.memset.p0i8.i32(i8* %2, i8 1, i32 %n, i1 false)

		ret void
		}

; Check that global variables are aligned if they are large enough, but only if		; Check that global variables are aligned if they are large enough, but only if
; they are defined in this object and don't have an explicit section.		; they are defined in this object and don't have an explicit section.
@arr1 = global [7 x i8] c"\01\02\03\04\05\06\07", align 1		@arr1 = global [7 x i8] c"\01\02\03\04\05\06\07", align 1
@arr2 = global [8 x i8] c"\01\02\03\04\05\06\07\08", align 1		@arr2 = global [8 x i8] c"\01\02\03\04\05\06\07\08", align 1
@arr3 = global [7 x i8] c"\01\02\03\04\05\06\07", section "foo,bar", align 1		@arr3 = global [7 x i8] c"\01\02\03\04\05\06\07", section "foo,bar", align 1
@arr4 = global [8 x i8] c"\01\02\03\04\05\06\07\08", section "foo,bar", align 1		@arr4 = global [8 x i8] c"\01\02\03\04\05\06\07\08", section "foo,bar", align 1
@arr5 = weak global [7 x i8] c"\01\02\03\04\05\06\07", align 1		@arr5 = weak global [7 x i8] c"\01\02\03\04\05\06\07", align 1
@arr6 = weak_odr global [7 x i8] c"\01\02\03\04\05\06\07", align 1		@arr6 = weak_odr global [7 x i8] c"\01\02\03\04\05\06\07", align 1
▲ Show 20 Lines • Show All 54 Lines • Show Last 20 Lines