This is an archive of the discontinued LLVM Phabricator instance.

[ARM] Fix lowering of misaligned memcpy/memset
ClosedPublic

Authored by john.brawn on May 23 2017, 7:40 AM.

Download Raw Diff

Details

Reviewers

arsenm
rengolin
lhames
efriedma

Commits

rG9009d2905deb: [ARM] Fix lowering of misaligned memcpy/memset
rL303990: [ARM] Fix lowering of misaligned memcpy/memset

Summary

Currently getOptimalMemOpType returns i32 for large enough sizes without checking for alignment, leading to poor code generation when misaligned accesses aren't permitted, as we generate a word store and then later split it up into byte stores. This means we inadvertently go over the MaxStoresPerMemcpy limit, and for memset we splat the memset value into a word and then immediately split it up again.

Fix this by leaving it up to FindOptimalMemOpLowering to figure out which type to use, but also fix a bug there where it wasn't correctly checking if misaligned memory accesses are allowed.

Diff Detail

Repository: rL LLVM

Event Timeline

john.brawn created this revision. · May 23 2017, 7:40 AM

Herald added subscribers: eraman, javed.absar, wdng, aemerson. · View Herald Transcript · May 23 2017, 7:40 AM

Please regenerate the tests in a separate commit, to make it clear what this patch is actually changing. Please change the description to make it clear what this is actually fixing; as far as I can tell, we aren't generating "wrong code", just inlining memcpy and memset calls too aggressively.

john.brawn edited the summary of this revision. (Show Details) · May 24 2017, 6:22 AM

In D33442#762325, @efriedma wrote:

Please regenerate the tests in a separate commit, to make it clear what this patch is actually changing.

I've moved the test changes that aren't directly due to this code change to D33495

Please change the description to make it clear what this is actually fixing; as far as I can tell, we aren't generating "wrong code", just inlining memcpy and memset calls too aggressively.

Done

efriedma added inline comments. · May 24 2017, 11:06 AM

lib/CodeGen/SelectionDAG/SelectionDAG.cpp
4782 ↗	(On Diff #100079)	The use of getPointerTy() here seems dubious. On many architectures, you can use getPointerTy as a rough proxy for the largest legal integer type, but that's not universal, and the usage of DstAS doesn't make any sense. Maybe just loop over {i64, i32, i16}.
4796 ↗	(On Diff #100079)	The use of isTypeLegal() here looks suspicious; i16 is not legal on ARM, but we definitely want to use i16 stores if we can. Do you have any testcases where the known alignment is two?

john.brawn added inline comments. · May 25 2017, 6:42 AM

lib/CodeGen/SelectionDAG/SelectionDAG.cpp
4782 ↗	(On Diff #100079)	I'd thought about doing that but instead went with the simpler change. I'll give the loop approach a try.
4796 ↗	(On Diff #100079)	This loop is actually finding the largest legal type, i.e. i32 on ARM, so that if VT is larger than that it's used instead. Adding a test sounds like a good idea though.

Adjust FindOptimalMemOpLowering to use a loop instead of the pointer type, and add a 2-byte aligned test.

LGTM.

This revision is now accepted and ready to land. · May 25 2017, 12:12 PM

Closed by commit rL303990: [ARM] Fix lowering of misaligned memcpy/memset (authored by john.brawn). · Explain Why · May 26 2017, 6:59 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

SelectionDAG.cpp

22 lines

Target/

ARM/

ARMISelLowering.cpp

6 lines

test/

CodeGen/

ARM/

memcpy-inline.ll

5 lines

memset-inline.ll

50 lines

Diff 100401

llvm/trunk/lib/CodeGen/SelectionDAG/SelectionDAG.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 4,773 Lines • ▼ Show 20 Lines	static bool FindOptimalMemOpLowering(std::vector<EVT> &MemOps,
// means it's possible to change the alignment of the destination.		// means it's possible to change the alignment of the destination.
// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does		// 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
// not need to be loaded.		// not need to be loaded.
EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,		EVT VT = TLI.getOptimalMemOpType(Size, DstAlign, SrcAlign,
IsMemset, ZeroMemset, MemcpyStrSrc,		IsMemset, ZeroMemset, MemcpyStrSrc,
DAG.getMachineFunction());		DAG.getMachineFunction());

if (VT == MVT::Other) {		if (VT == MVT::Other) {
if (DstAlign >= DAG.getDataLayout().getPointerPrefAlignment(DstAS) ||		// Use the largest integer type whose alignment constraints are satisfied.
TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign)) {		// We only need to check DstAlign here as SrcAlign is always greater or
VT = TLI.getPointerTy(DAG.getDataLayout(), DstAS);		// equal to DstAlign (or zero).
} else {		VT = MVT::i64;
switch (DstAlign & 7) {		while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
case 0: VT = MVT::i64; break;		!TLI.allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
case 4: VT = MVT::i32; break;		VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
case 2: VT = MVT::i16; break;		assert(VT.isInteger());
default: VT = MVT::i8; break;
}
}

		// Find the largest legal integer type.
MVT LVT = MVT::i64;		MVT LVT = MVT::i64;
while (!TLI.isTypeLegal(LVT))		while (!TLI.isTypeLegal(LVT))
LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);		LVT = (MVT::SimpleValueType)(LVT.SimpleTy - 1);
assert(LVT.isInteger());		assert(LVT.isInteger());

		// If the type we've chosen is larger than the largest legal integer type
		// then use that instead.
if (VT.bitsGT(LVT))		if (VT.bitsGT(LVT))
VT = LVT;		VT = LVT;
}		}

unsigned NumMemOps = 0;		unsigned NumMemOps = 0;
while (Size != 0) {		while (Size != 0) {
unsigned VTSize = VT.getSizeInBits() / 8;		unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {		while (VTSize > Size) {
▲ Show 20 Lines • Show All 3,139 Lines • Show Last 20 Lines

llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 12,141 Lines • ▼ Show 20 Lines	if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
} else if (Size >= 8 &&		} else if (Size >= 8 &&
(memOpAlign(SrcAlign, DstAlign, 8) ||		(memOpAlign(SrcAlign, DstAlign, 8) ||
(allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&		(allowsMisalignedMemoryAccesses(MVT::f64, 0, 1, &Fast) &&
Fast))) {		Fast))) {
return MVT::f64;		return MVT::f64;
}		}
}		}

// Lowering to i32/i16 if the size permits.
if (Size >= 4)
return MVT::i32;
else if (Size >= 2)
return MVT::i16;

// Let the target-independent logic figure it out.		// Let the target-independent logic figure it out.
return MVT::Other;		return MVT::Other;
}		}

bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {		bool ARMTargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
if (Val.getOpcode() != ISD::LOAD)		if (Val.getOpcode() != ISD::LOAD)
return false;		return false;

▲ Show 20 Lines • Show All 1,900 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/ARM/memcpy-inline.ll

	Show First 20 Lines • Show All 89 Lines • ▼ Show 20 Lines
	; CHECK: movs [[REG5:r[0-9]+]], #0			; CHECK: movs [[REG5:r[0-9]+]], #0
	; CHECK: strb [[REG5]], [r0, #6]			; CHECK: strb [[REG5]], [r0, #6]
	; CHECK: movw [[REG6:r[0-9]+]], #21587			; CHECK: movw [[REG6:r[0-9]+]], #21587
	; CHECK: strh [[REG6]], [r0, #4]			; CHECK: strh [[REG6]], [r0, #4]
	; CHECK: movw [[REG7:r[0-9]+]], #18500			; CHECK: movw [[REG7:r[0-9]+]], #18500
	; CHECK: movt [[REG7:r[0-9]+]], #22866			; CHECK: movt [[REG7:r[0-9]+]], #22866
	; CHECK: str [[REG7]]			; CHECK: str [[REG7]]
	; CHECK-T1-LABEL: t5:			; CHECK-T1-LABEL: t5:
	; CHECK-T1: movs [[TREG3:r[0-9]]],			; CHECK-T1: bl _memcpy
	; CHECK-T1: strb [[TREG3]],
	; CHECK-T1: movs [[TREG4:r[0-9]]],
	; CHECK-T1: strb [[TREG4]],
	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)			tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %C, i8* getelementptr inbounds ([7 x i8], [7 x i8]* @.str5, i64 0, i64 0), i64 7, i32 1, i1 false)
	ret void			ret void
	}			}

	define void @t6() nounwind {			define void @t6() nounwind {
	entry:			entry:
	; CHECK-LABEL: t6:			; CHECK-LABEL: t6:
	; CHECK: vldr [[REG9:d[0-9]+]], [r0]			; CHECK: vldr [[REG9:d[0-9]+]], [r0]
	Show All 32 Lines

llvm/trunk/test/CodeGen/ARM/memset-inline.ll

	Show All 32 Lines
	; CHECK-6M: str [[REG]], [sp]			; CHECK-6M: str [[REG]], [sp]
	%buf = alloca [26 x i8], align 1			%buf = alloca [26 x i8], align 1
	%0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0			%0 = getelementptr inbounds [26 x i8], [26 x i8]* %buf, i32 0, i32 0
	call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)			call void @llvm.memset.p0i8.i32(i8* %0, i8 0, i32 26, i32 1, i1 false)
	call void @something(i8* %0) nounwind			call void @something(i8* %0) nounwind
	ret void			ret void
	}			}

				define void @t3(i8* %p) {
				entry:
				; CHECK-7A-LABEL: t3:
				; CHECK-7A: muls [[REG:r[0-9]+]],
				; CHECK-7A: str [[REG]],
				; CHECK-6M-LABEL: t3:
				; CHECK-6M-NOT: muls
				; CHECK-6M: strb [[REG:r[0-9]+]],
				; CHECK-6M: strb [[REG]],
				; CHECK-6M: strb [[REG]],
				; CHECK-6M: strb [[REG]],
				br label %for.body

				for.body:
				%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%0 = trunc i32 %i to i8
				call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 1, i1 false)
				call void @something(i8* %p)
				%inc = add nuw nsw i32 %i, 1
				%exitcond = icmp eq i32 %inc, 255
				br i1 %exitcond, label %for.end, label %for.body

				for.end:
				ret void
				}

				define void @t4(i8* %p) {
				entry:
				; CHECK-7A-LABEL: t4:
				; CHECK-7A: muls [[REG:r[0-9]+]],
				; CHECK-7A: str [[REG]],
				; CHECK-6M-LABEL: t4:
				; CHECK-6M: muls [[REG:r[0-9]+]],
				; CHECK-6M: strh [[REG]],
				; CHECK-6M: strh [[REG]],
				br label %for.body

				for.body:
				%i = phi i32 [ 0, %entry ], [ %inc, %for.body ]
				%0 = trunc i32 %i to i8
				call void @llvm.memset.p0i8.i32(i8* %p, i8 %0, i32 4, i32 2, i1 false)
				call void @something(i8* %p)
				%inc = add nuw nsw i32 %i, 1
				%exitcond = icmp eq i32 %inc, 255
				br i1 %exitcond, label %for.end, label %for.body

				for.end:
				ret void
				}

	declare void @something(i8*) nounwind			declare void @something(i8*) nounwind
	declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind			declare void @llvm.memset.p0i8.i32(i8* nocapture, i8, i32, i32, i1) nounwind
	declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind			declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind