Diff 355223

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Show All 15 Lines
// presence of non-idiom instructions. The initial implementation of the		// presence of non-idiom instructions. The initial implementation of the
// heuristics applies to idioms in multi-block loops.		// heuristics applies to idioms in multi-block loops.
//		//
//===----------------------------------------------------------------------===//		//===----------------------------------------------------------------------===//
//		//
// TODO List:		// TODO List:
//		//
// Future loop memory idioms to recognize:		// Future loop memory idioms to recognize:
// memcmp, memmove, strlen, etc.		// memcmp, strlen, etc.
// Future floating point idioms to recognize in -ffast-math mode:		// Future floating point idioms to recognize in -ffast-math mode:
// fpowi		// fpowi
// Future integer operation idioms to recognize:		// Future integer operation idioms to recognize:
// ctpop		// ctpop
//		//
// Beware that isel's default lowering for ctpop is highly inefficient for		// Beware that isel's default lowering for ctpop is highly inefficient for
// i64 and larger types when i64 is legal and the value has few bits set. It		// i64 and larger types when i64 is legal and the value has few bits set. It
// would be good to enhance isel to emit a loop for ctpop in this case.		// would be good to enhance isel to emit a loop for ctpop in this case.
▲ Show 20 Lines • Show All 69 Lines • ▼ Show 20 Lines
#include <cstdint>		#include <cstdint>
#include <utility>		#include <utility>
#include <vector>		#include <vector>

using namespace llvm;		using namespace llvm;

#define DEBUG_TYPE "loop-idiom"		#define DEBUG_TYPE "loop-idiom"

STATISTIC(NumMemSet, "Number of memset's formed from loop stores");		STATISTIC(NumMemSet, "Number of memset's formed from loop stores");
		xbolva00Unsubmitted Done Reply Inline Actions Also you can add NumMemMove.. xbolva00: Also you can add NumMemMove..
STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");		STATISTIC(NumMemCpy, "Number of memcpy's formed from loop load+stores");
STATISTIC(		STATISTIC(
NumShiftUntilBitTest,		NumShiftUntilBitTest,
"Number of uncountable loops recognized as 'shift until bitttest' idiom");		"Number of uncountable loops recognized as 'shift until bitttest' idiom");
STATISTIC(NumShiftUntilZero,		STATISTIC(NumShiftUntilZero,
"Number of uncountable loops recognized as 'shift until zero' idiom");		"Number of uncountable loops recognized as 'shift until zero' idiom");

bool DisableLIRP::All;		bool DisableLIRP::All;
▲ Show 20 Lines • Show All 1,105 Lines • ▼ Show 20 Lines	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// changed the IR, even if we later clean up these added instructions. There		// changed the IR, even if we later clean up these added instructions. There
// may be structural differences e.g. in the order of use lists not accounted		// may be structural differences e.g. in the order of use lists not accounted
// for in just a textual dump of the IR. This is written as a variable, even		// for in just a textual dump of the IR. This is written as a variable, even
// though statically all the places this dominates could be replaced with		// though statically all the places this dominates could be replaced with
// 'true', with the hope that anyone trying to be clever / "more precise" with		// 'true', with the hope that anyone trying to be clever / "more precise" with
// the return value will read this comment, and leave them alone.		// the return value will read this comment, and leave them alone.
Changed = true;		Changed = true;

SmallPtrSet<Instruction *, 1> Stores;		SmallPtrSet<Instruction *, 2> Stores;
Stores.insert(TheStore);		Stores.insert(TheStore);

bool IsMemCpy = isa<MemCpyInst>(TheStore);		bool IsMemCpy = isa<MemCpyInst>(TheStore);
const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";		const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";

if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,		bool UseMemMove =
StoreSize, *AA, Stores)) {		mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
		StoreSize, *AA, Stores);
		if (UseMemMove) {
		Stores.insert(TheLoad);
		if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
		BECount, StoreSize, *AA, Stores)) {
ORE.emit([&]() {		ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",		return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
TheStore)		TheStore)
<< ore::NV("Inst", InstRemark) << " in "		<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())		<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "		<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access store location");		<< ore::NV("Reason", "The loop may access store location");
});		});
return Changed;		return Changed;
}		}
		Stores.erase(TheLoad);
		}

const SCEV *LdStart = LoadEv->getStart();		const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();		unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

// Handle negative strided loops.		// Handle negative strided loops.
if (NegStride)		if (NegStride)
LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);		LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);

Show All 12 Lines	ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)		return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
<< ore::NV("Inst", InstRemark) << " in "		<< ore::NV("Inst", InstRemark) << " in "
<< ore::NV("Function", TheStore->getFunction())		<< ore::NV("Function", TheStore->getFunction())
<< " function will not be hoisted: "		<< " function will not be hoisted: "
<< ore::NV("Reason", "The loop may access load location");		<< ore::NV("Reason", "The loop may access load location");
});		});
return Changed;		return Changed;
}		}
		if (UseMemMove) {
		// Ensure that LoadBasePtr is after StoreBasePtr for a positive stride, or
		// before StoreBasePtr for a negative stride, and that the load does not
		// overlap the store.
		efriedmaUnsubmitted Done Reply Inline Actions Finally, someone trying to add this transform figured out this check... I think this is the fourth patch trying to add memmove to loopidiom. Do you need to compare the offset to the size of the load/store operations? I think you might get some funny behavior if the operations overlap. efriedma: Finally, someone trying to add this transform figured out this check... I think this is the…
		yurai007AuthorUnsubmitted Done Reply Inline Actions Finally, someone trying to add this transform figured out this check... I think this is the fourth patch trying to add memmove to loopidiom. Well, hopefully there won't be fifth one :) Do you need to compare the offset to the size of the load/store operations? I think you might get some funny behavior if the operations overlap. Yes, it would be nice to prevent overlapping. I'm going to add protection against it and UT reproducer. yurai007: > Finally, someone trying to add this transform figured out this check... I think this is the…
		yurai007AuthorUnsubmitted Done Reply Inline Actions Now load and store can't overlap each other and their sizes must be equal. However I'm not sure how reproducer should look like. Currently isLegalStore reject everything which has abs(Stride) != StoreSize so my understanding is that only one load and one store in loop are allowed. I ended up with following reproducer: define void @do_not_form_memmove_for_overlapped_access(i32* %s, i64 %size) { entry: %end.idx = add i64 %size, -1 %end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx br label %while.body while.body: %phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ] %next = bitcast i32* %phi.ptr to i16* %src.ptr = getelementptr i16, i16* %next, i64 1 %src.ptr2 = bitcast i16* %src.ptr to i32* ; below misaligned load is overlapped with store. %val = load i32, i32* %src.ptr2, align 4 %dst.ptr = getelementptr i32, i32* %phi.ptr, i64 0 store i32 %val, i32* %dst.ptr, align 4 %next.ptr = getelementptr i32, i32* %phi.ptr, i64 1 %cmp = icmp eq i32* %next.ptr, %end.ptr br i1 %cmp, label %exit, label %while.body exit: ret void } The thing is that I'm not sure whether or not above snippet is correct IR since we do misaligned load. yurai007: Now load and store can't overlap each other and their sizes must be equal. However I'm not sure…
		yurai007AuthorUnsubmitted Done Reply Inline Actions Ok, I checked documentation and mailing list for better understanding. Changing load just to "%val = load i32, i32* %src.ptr2, align 2" should be fine and give legal IR with underaligned access. Added improved overlapping access UT reproducer. yurai007: Ok, I checked documentation and mailing list for better understanding. Changing load just to…
		int64_t LoadOff = 0, StoreOff = 0;
		const Value *BP1 = llvm::GetPointerBaseWithConstantOffset(
		LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
		const Value *BP2 = llvm::GetPointerBaseWithConstantOffset(
		StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
		int64_t LoadSize =
		DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
		if (BP1 != BP2 \|\| LoadSize != int64_t(StoreSize))
		return Changed;
		if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) \|\|
		(NegStride && LoadOff + LoadSize > StoreOff))
		return Changed;
		}

if (avoidLIRForMultiBlockLoop())		if (avoidLIRForMultiBlockLoop())
return Changed;		return Changed;

// Okay, everything is safe, we can transform this!		// Okay, everything is safe, we can transform this!

const SCEV *NumBytesS =		const SCEV *NumBytesS =
getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);		getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);

Value *NumBytes =		Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());		Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());

CallInst *NewCall = nullptr;		CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:		// Check whether to generate an unordered atomic memcpy:
// If the load or store are atomic, then they must necessarily be unordered		// If the load or store are atomic, then they must necessarily be unordered
// by previous checks.		// by previous checks.
if (!TheStore->isAtomic() && !TheLoad->isAtomic())		if (!TheStore->isAtomic() && !TheLoad->isAtomic()) {
		if (UseMemMove)
		NewCall = Builder.CreateMemMove(StoreBasePtr, StoreAlign, LoadBasePtr,
		LoadAlign, NumBytes);
		else
NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,		NewCall = Builder.CreateMemCpy(StoreBasePtr, StoreAlign, LoadBasePtr,
LoadAlign, NumBytes);		LoadAlign, NumBytes);
else {		} else {
		// For now don't support unordered atomic memmove.
		if (UseMemMove)
		return Changed;
// We cannot allow unaligned ops for unordered load/store, so reject		// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.		// anything where the alignment isn't at least the element size.
assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&		assert((StoreAlign.hasValue() && LoadAlign.hasValue()) &&
"Expect unordered load/store to have align.");		"Expect unordered load/store to have align.");
if (StoreAlign.getValue() < StoreSize \|\| LoadAlign.getValue() < StoreSize)		if (StoreAlign.getValue() < StoreSize \|\| LoadAlign.getValue() < StoreSize)
return Changed;		return Changed;

// If the element.atomic memcpy is not lowered into explicit		// If the element.atomic memcpy is not lowered into explicit
// loads/stores later, then it will be lowered into an element-size		// loads/stores later, then it will be lowered into an element-size
// specific lib call. If the lib call doesn't exist for our store size, then		// specific lib call. If the lib call doesn't exist for our store size, then
// we shouldn't generate the memcpy.		// we shouldn't generate the memcpy.
if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())		if (StoreSize > TTI->getAtomicMemIntrinsicMaxElementSize())
return Changed;		return Changed;

// Create the call.		// Create the call.
// Note that unordered atomic loads/stores are required by the spec to		// Note that unordered atomic loads/stores are required by the spec to
// have an alignment but non-atomic loads/stores may not.		// have an alignment but non-atomic loads/stores may not.
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(		NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
		efriedmaUnsubmitted Done Reply Inline Actions Do we need to make sure we don't fall into this codepath? efriedma: Do we need to make sure we don't fall into this codepath?
		yurai007AuthorUnsubmitted Done Reply Inline Actions Yes, now this branch is omitted. Added tests to catch such mistakes. yurai007: Yes, now this branch is omitted. Added tests to catch such mistakes.
StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),		StoreBasePtr, StoreAlign.getValue(), LoadBasePtr, LoadAlign.getValue(),
NumBytes, StoreSize);		NumBytes, StoreSize);
}		}
NewCall->setDebugLoc(TheStore->getDebugLoc());		NewCall->setDebugLoc(TheStore->getDebugLoc());

if (MSSAU) {		if (MSSAU) {
MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(		MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);		NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);		MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
}		}

LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"		LLVM_DEBUG(dbgs() << " Formed new call: " << *NewCall << "\n"
<< " from load ptr=" << LoadEv << " at: " << TheLoad		<< " from load ptr=" << LoadEv << " at: " << TheLoad
<< "\n"		<< "\n"
<< " from store ptr=" << StoreEv << " at: " << TheStore		<< " from store ptr=" << StoreEv << " at: " << TheStore
<< "\n");		<< "\n");

ORE.emit([&]() {		ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",		return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
NewCall->getDebugLoc(), Preheader)		NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "		<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())		<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() intrinsic from " << ore::NV("Inst", InstRemark)		<< "() intrinsic from " << ore::NV("Inst", InstRemark)
<< " instruction in " << ore::NV("Function", TheStore->getFunction())		<< " instruction in " << ore::NV("Function", TheStore->getFunction())
<< " function";		<< " function";
});		});

// Okay, the memcpy has been formed. Zap the original store and anything that		// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.		// feeds into it.
if (MSSAU)		if (MSSAU)
MSSAU->removeMemoryAccess(TheStore, true);		MSSAU->removeMemoryAccess(TheStore, true);
deleteDeadInstruction(TheStore);		deleteDeadInstruction(TheStore);
if (MSSAU && VerifyMemorySSA)		if (MSSAU && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();		MSSAU->getMemorySSA()->verifyMemorySSA();
		if (!UseMemMove)
		xbolva00Unsubmitted Done Reply Inline Actions … and increment it here xbolva00: … and increment it here
		yurai007AuthorUnsubmitted Done Reply Inline Actions Ok, I will add this counter. yurai007: Ok, I will add this counter.
++NumMemCpy;		++NumMemCpy;
		yurai007AuthorUnsubmitted Done Reply Inline Actions Don't touch this counter when memmove is emitted. yurai007: Don't touch this counter when memmove is emitted.
ExpCleaner.markResultUsed();		ExpCleaner.markResultUsed();
return true;		return true;
}		}

// When compiling for codesize we avoid idiom recognition for a multi-block loop		// When compiling for codesize we avoid idiom recognition for a multi-block loop
// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.		// unless it is a loop_memset idiom or a memset/memcpy idiom in a nested loop.
//		//
bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,		bool LoopIdiomRecognize::avoidLIRForMultiBlockLoop(bool IsMemset,
▲ Show 20 Lines • Show All 1,388 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll

Show First 20 Lines • Show All 448 Lines • ▼ Show 20 Lines	for.body: ; preds = %entry, %for.body
store atomic i32 1, i32* %arrayidx unordered, align 4		store atomic i32 1, i32* %arrayidx unordered, align 4
%indvar.next = add i64 %indvar, 1		%indvar.next = add i64 %indvar, 1
%exitcond = icmp eq i64 %indvar.next, 10000		%exitcond = icmp eq i64 %indvar.next, 10000
br i1 %exitcond, label %for.end, label %for.body		br i1 %exitcond, label %for.end, label %for.body

for.end: ; preds = %for.body		for.end: ; preds = %for.body
ret void		ret void
}		}

		; Make sure that neither an atomic memcpy nor an atomic memmove is formed by
		; mistake when looping with a positive stride. The loop copies Src[i + 1] to
		; Src[i] using a plain load and an unordered atomic store.
		define void @test_no_memcpy_memmove1(i8* %Src, i64 %Size) {
		; CHECK-LABEL: @test_no_memcpy_memmove1(
		; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
		; CHECK-NOT: call void @llvm.memmove.element.unordered.atomic
		; CHECK: store
		; CHECK: ret void
		bb.nph:
		br label %for.body

		for.body: ; preds = %bb.nph, %for.body
		%indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
		%Step = add nuw nsw i64 %indvar, 1
		; The source is one byte ahead of the destination within the same buffer.
		%SrcI = getelementptr i8, i8* %Src, i64 %Step
		%DestI = getelementptr i8, i8* %Src, i64 %indvar
		%V = load i8, i8* %SrcI, align 1
		store atomic i8 %V, i8* %DestI unordered, align 1
		%indvar.next = add i64 %indvar, 1
		%exitcond = icmp eq i64 %indvar.next, %Size
		br i1 %exitcond, label %for.end, label %for.body

		for.end: ; preds = %for.body
		ret void
		}

		; Make sure that neither an atomic memcpy nor an atomic memmove is formed by
		; mistake when looping with a negative stride. The loop counts down from Size
		; and copies Src[i - 1] to Src[i] using a plain load and an unordered atomic
		; store.
		define void @test_no_memcpy_memmove2(i8* %Src, i64 %Size) {
		; CHECK-LABEL: @test_no_memcpy_memmove2(
		; CHECK-NOT: call void @llvm.memcpy.element.unordered.atomic
		; CHECK-NOT: call void @llvm.memmove.element.unordered.atomic
		; CHECK: store
		; CHECK: ret void
		bb.nph:
		%cmp1 = icmp sgt i64 %Size, 0
		br i1 %cmp1, label %for.body, label %for.end

		for.body: ; preds = %bb.nph, %for.body
		%indvar = phi i64 [ %Step, %for.body ], [ %Size, %bb.nph ]
		%Step = add nsw i64 %indvar, -1
		; The source is one byte behind the destination within the same buffer.
		%SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
		%V = load i8, i8* %SrcI, align 1
		%DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
		store atomic i8 %V, i8* %DestI unordered, align 1
		%exitcond = icmp sgt i64 %indvar, 1
		br i1 %exitcond, label %for.body, label %for.end

		for.end: ; preds = %for.body, %bb.nph
		ret void
		}

llvm/test/Transforms/LoopIdiom/basic.ll

	Show First 20 Lines • Show All 683 Lines • ▼ Show 20 Lines
	define void @PR14241(i32* %s, i64 %size) {			define void @PR14241(i32* %s, i64 %size) {
	; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught			; Ensure that we don't form a memcpy for strided loops. Briefly, when we taught
	; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy			; LoopIdiom about memmove and strided loops, this got miscompiled into a memcpy
	; instead of a memmove. If we get the memmove transform back, this will catch			; instead of a memmove. If we get the memmove transform back, this will catch
	; regressions.			; regressions.
	;			;
	; CHECK-LABEL: @PR14241(			; CHECK-LABEL: @PR14241(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
				; CHECK-NEXT: [[S1:%.]] = bitcast i32 [[S:%.]] to i8
	; CHECK-NEXT: [[END_IDX:%.]] = add i64 [[SIZE:%.]], -1			; CHECK-NEXT: [[END_IDX:%.]] = add i64 [[SIZE:%.]], -1
	; CHECK-NEXT: [[END_PTR:%.]] = getelementptr inbounds i32, i32 [[S:%.*]], i64 [[END_IDX]]			; CHECK-NEXT: [[END_PTR:%.]] = getelementptr inbounds i32, i32 [[S:%.*]], i64 [[END_IDX]]
				; CHECK-NEXT: [[SCEVGEP:%.]] = getelementptr i32, i32 [[S]], i64 1
				; CHECK-NEXT: [[SCEVGEP2:%.]] = bitcast i32 [[SCEVGEP]] to i8*
				; CHECK-NEXT: [[TMP1:%.*]] = shl i64 [[SIZE]], 2
				; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP1]], -8
				; CHECK-NEXT: [[TMP3:%.*]] = lshr i64 [[TMP2]], 2
				; CHECK-NEXT: [[TMP4:%.*]] = shl nuw i64 [[TMP3]], 2
				; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[TMP4]], 4
				; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 [[S1]], i8* align 4 [[SCEVGEP2]], i64 [[TMP5]], i1 false)
	; CHECK-NEXT: br label [[WHILE_BODY:%.*]]			; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
	; CHECK: while.body:			; CHECK: while.body:
	; CHECK-NEXT: [[PHI_PTR:%.]] = phi i32 [ [[S]], [[ENTRY:%.]] ], [ [[NEXT_PTR:%.]], [[WHILE_BODY]] ]			; CHECK-NEXT: [[PHI_PTR:%.]] = phi i32 [ [[S]], [[ENTRY:%.]] ], [ [[NEXT_PTR:%.]], [[WHILE_BODY]] ]
	; CHECK-NEXT: [[SRC_PTR:%.]] = getelementptr inbounds i32, i32 [[PHI_PTR]], i64 1			; CHECK-NEXT: [[SRC_PTR:%.]] = getelementptr inbounds i32, i32 [[PHI_PTR]], i64 1
	; CHECK-NEXT: [[VAL:%.]] = load i32, i32 [[SRC_PTR]], align 4			; CHECK-NEXT: [[VAL:%.]] = load i32, i32 [[SRC_PTR]], align 4
	; CHECK-NEXT: [[DST_PTR:%.]] = getelementptr inbounds i32, i32 [[PHI_PTR]], i64 0			; CHECK-NEXT: [[DST_PTR:%.]] = getelementptr inbounds i32, i32 [[PHI_PTR]], i64 0
	; CHECK-NEXT: store i32 [[VAL]], i32* [[DST_PTR]], align 4
	; CHECK-NEXT: [[NEXT_PTR]] = getelementptr inbounds i32, i32* [[PHI_PTR]], i64 1			; CHECK-NEXT: [[NEXT_PTR]] = getelementptr inbounds i32, i32* [[PHI_PTR]], i64 1
	; CHECK-NEXT: [[CMP:%.]] = icmp eq i32 [[NEXT_PTR]], [[END_PTR]]			; CHECK-NEXT: [[CMP:%.]] = icmp eq i32 [[NEXT_PTR]], [[END_PTR]]
	; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[WHILE_BODY]]			; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[WHILE_BODY]]
	; CHECK: exit:			; CHECK: exit:
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	;			;

	entry:			entry:
	%end.idx = add i64 %size, -1			%end.idx = add i64 %size, -1
	%end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx			%end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
	br label %while.body			br label %while.body
	; FIXME: When we regain the ability to form a memmove here, this test should be
	; reversed and turned into a positive assertion.

	while.body:			while.body:
	%phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]			%phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
	%src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1			%src.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
	%val = load i32, i32* %src.ptr, align 4			%val = load i32, i32* %src.ptr, align 4
	%dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0			%dst.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 0
	store i32 %val, i32* %dst.ptr, align 4			store i32 %val, i32* %dst.ptr, align 4
	%next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1			%next.ptr = getelementptr inbounds i32, i32* %phi.ptr, i64 1
	▲ Show 20 Lines • Show All 336 Lines • ▼ Show 20 Lines

	loop.exit:			loop.exit:
	br label %exit			br label %exit

	exit:			exit:
	ret void			ret void
	}			}

				;; Memmove formation: a positive-stride loop that copies Src[i + 1] to Src[i]
				;; within the same buffer is expected to be turned into a memmove call in the
				;; preheader, with the store removed from the loop (see the CHECK lines).
				define void @PR46179_positive_stride(i8* %Src, i64 %Size) {
				; CHECK-LABEL: @PR46179_positive_stride(
				; CHECK-NEXT: bb.nph:
				; CHECK-NEXT: [[SCEVGEP:%.]] = getelementptr i8, i8 [[SRC:%.*]], i64 1
				; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 [[SRC]], i8* align 1 [[SCEVGEP]], i64 [[SIZE:%.*]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVAR:%.]] = phi i64 [ 0, [[BB_NPH:%.]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], 1
				; CHECK-NEXT: [[SRCI:%.]] = getelementptr i8, i8 [[SRC]], i64 [[STEP]]
				; CHECK-NEXT: [[DESTI:%.]] = getelementptr i8, i8 [[SRC]], i64 [[INDVAR]]
				; CHECK-NEXT: [[V:%.]] = load i8, i8 [[SRCI]], align 1
				; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
				; CHECK-NEXT: [[EXITCOND:%.]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.]]
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
				; CHECK: for.end:
				; CHECK-NEXT: ret void
				;
				bb.nph:
				br label %for.body

				for.body: ; preds = %bb.nph, %for.body
				%indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
				%Step = add nuw nsw i64 %indvar, 1
				; The source is one byte ahead of the destination within the same buffer.
				%SrcI = getelementptr i8, i8* %Src, i64 %Step
				%DestI = getelementptr i8, i8* %Src, i64 %indvar
				%V = load i8, i8* %SrcI, align 1
				store i8 %V, i8* %DestI, align 1
				%indvar.next = add i64 %indvar, 1
				%exitcond = icmp eq i64 %indvar.next, %Size
				br i1 %exitcond, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret void
				}

				;; Memmove formation: a negative-stride loop that counts down from Size and
				;; copies Src[i - 1] to Src[i] within the same buffer is expected to be turned
				;; into a memmove call in the preheader (see the CHECK lines).
				define void @PR46179_negative_stride(i8* %Src, i64 %Size) {
				; CHECK-LABEL: @PR46179_negative_stride(
				; CHECK-NEXT: bb.nph:
				; CHECK-NEXT: [[CMP1:%.]] = icmp sgt i64 [[SIZE:%.]], 0
				; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_END:%.]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: [[SCEVGEP:%.]] = getelementptr i8, i8 [[SRC:%.*]], i64 1
				; CHECK-NEXT: call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 [[SCEVGEP]], i8* align 1 [[SRC]], i64 [[SIZE]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[STEP]], [[FOR_BODY]] ], [ [[SIZE]], [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[STEP:%.*]] = add nsw i64 [[INDVAR]], -1
				; CHECK-NEXT: [[SRCI:%.]] = getelementptr inbounds i8, i8 [[SRC:%.*]], i64 [[STEP]]
				; CHECK-NEXT: [[V:%.]] = load i8, i8 [[SRCI]], align 1
				; CHECK-NEXT: [[DESTI:%.]] = getelementptr inbounds i8, i8 [[SRC]], i64 [[INDVAR]]
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i64 [[INDVAR]], 1
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]]
				; CHECK: for.end:
				; CHECK-NEXT: ret void
				;
				bb.nph:
				%cmp1 = icmp sgt i64 %Size, 0
				br i1 %cmp1, label %for.body, label %for.end

				for.body: ; preds = %bb.nph, %for.body
				%indvar = phi i64 [ %Step, %for.body ], [ %Size, %bb.nph ]
				%Step = add nsw i64 %indvar, -1
				; The source is one byte behind the destination within the same buffer.
				%SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
				%V = load i8, i8* %SrcI, align 1
				%DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
				store i8 %V, i8* %DestI, align 1
				%exitcond = icmp sgt i64 %indvar, 1
				br i1 %exitcond, label %for.body, label %for.end

				for.end: ; preds = %for.body, %bb.nph
				ret void
				}

				;; Do not form memmove when, with a positive stride, each load reads the
				;; location written by the previous iteration's store: the loop copies
				;; Src[i - 1] to Src[i] while counting up, so the loop must stay as it is.
				define void @do_not_form_memmove1(i8* %Src, i64 %Size) {
				; CHECK-LABEL: @do_not_form_memmove1(
				; CHECK-NEXT: bb.nph:
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVAR:%.]] = phi i64 [ 1, [[BB_NPH:%.]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], -1
				; CHECK-NEXT: [[SRCI:%.]] = getelementptr i8, i8 [[SRC:%.*]], i64 [[STEP]]
				; CHECK-NEXT: [[DESTI:%.]] = getelementptr i8, i8 [[SRC]], i64 [[INDVAR]]
				; CHECK-NEXT: [[V:%.]] = load i8, i8 [[SRCI]], align 1
				; CHECK-NEXT: store i8 [[V]], i8* [[DESTI]], align 1
				; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
				; CHECK-NEXT: [[EXITCOND:%.]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.]]
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
				; CHECK: for.end:
				; CHECK-NEXT: ret void
				;
				bb.nph:
				br label %for.body

				for.body: ; preds = %bb.nph, %for.body
				%indvar = phi i64 [ 1, %bb.nph ], [ %indvar.next, %for.body ]
				%Step = add nuw nsw i64 %indvar, -1
				; The source is one byte behind the destination while the loop counts up.
				%SrcI = getelementptr i8, i8* %Src, i64 %Step
				%DestI = getelementptr i8, i8* %Src, i64 %indvar
				%V = load i8, i8* %SrcI, align 1
				store i8 %V, i8* %DestI, align 1
				%indvar.next = add i64 %indvar, 1
				%exitcond = icmp eq i64 %indvar.next, %Size
				br i1 %exitcond, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret void
				}

				;; Do not form memmove when, with a negative stride, each load reads the
				;; location written by the previous iteration's store: the loop copies
				;; Src[i + 1] to Src[i] while counting down, so the loop must stay as it is.
				define void @do_not_form_memmove2(i8* %Src, i64 %Size) {
				; CHECK-LABEL: @do_not_form_memmove2(
				; CHECK-NEXT: bb.nph:
				; CHECK-NEXT: [[CMP1:%.]] = icmp sgt i64 [[SIZE:%.]], 0
				; CHECK-NEXT: br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.]], label [[FOR_END:%.]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT]], [[FOR_BODY]] ], [ [[SIZE]], [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], 1
				; CHECK-NEXT: [[SRCI:%.]] = getelementptr inbounds i8, i8 [[SRC:%.*]], i64 [[STEP]]
				; CHECK-NEXT: [[V:%.]] = load i8, i8 [[SRCI]], align 1
				; CHECK-NEXT: [[DESTI:%.]] = getelementptr inbounds i8, i8 [[SRC]], i64 [[INDVAR]]
				; CHECK-NEXT: store i8 [[V]], i8* [[DESTI]], align 1
				; CHECK-NEXT: [[INDVAR_NEXT:%.*]] = add nsw i64 [[INDVAR]], -1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp sgt i64 [[INDVAR]], 1
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]]
				; CHECK: for.end:
				; CHECK-NEXT: ret void
				;
				bb.nph:
				%cmp1 = icmp sgt i64 %Size, 0
				br i1 %cmp1, label %for.body, label %for.end

				for.body: ; preds = %bb.nph, %for.body
				%indvar = phi i64 [ %indvar.next, %for.body ], [ %Size, %bb.nph ]
				%Step = add nuw nsw i64 %indvar, 1
				; The source is one byte ahead of the destination while the loop counts down.
				%SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
				%V = load i8, i8* %SrcI, align 1
				%DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
				store i8 %V, i8* %DestI, align 1
				%indvar.next = add nsw i64 %indvar, -1
				%exitcond = icmp sgt i64 %indvar, 1
				br i1 %exitcond, label %for.body, label %for.end

				for.end: ; preds = %for.body, %bb.nph
				ret void
				}

				;; Do not form memmove when an underaligned load overlaps the store of the
				;; same iteration: the i32 load starts one i16 element past the address of the
				;; i32 store, so the two 4-byte accesses share bytes.
				define void @do_not_form_memmove3(i32* %s, i64 %size) {
				; CHECK-LABEL: @do_not_form_memmove3(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[END_IDX:%.]] = add i64 [[SIZE:%.]], -1
				; CHECK-NEXT: [[END_PTR:%.]] = getelementptr inbounds i32, i32 [[S:%.*]], i64 [[END_IDX]]
				; CHECK-NEXT: br label [[WHILE_BODY:%.*]]
				; CHECK: while.body:
				; CHECK-NEXT: [[PHI_PTR:%.]] = phi i32 [ [[S]], [[ENTRY:%.]] ], [ [[NEXT_PTR:%.]], [[WHILE_BODY]] ]
				; CHECK-NEXT: [[NEXT:%.]] = bitcast i32 [[PHI_PTR]] to i16*
				; CHECK-NEXT: [[SRC_PTR:%.]] = getelementptr i16, i16 [[NEXT]], i64 1
				; CHECK-NEXT: [[SRC_PTR2:%.]] = bitcast i16 [[SRC_PTR]] to i32*
				; CHECK-NEXT: [[VAL:%.]] = load i32, i32 [[SRC_PTR2]], align 2
				; CHECK-NEXT: [[DST_PTR:%.]] = getelementptr i32, i32 [[PHI_PTR]], i64 0
				; CHECK-NEXT: store i32 [[VAL]], i32* [[DST_PTR]], align 4
				; CHECK-NEXT: [[NEXT_PTR]] = getelementptr i32, i32* [[PHI_PTR]], i64 1
				; CHECK-NEXT: [[CMP:%.]] = icmp eq i32 [[NEXT_PTR]], [[END_PTR]]
				; CHECK-NEXT: br i1 [[CMP]], label [[EXIT:%.*]], label [[WHILE_BODY]]
				; CHECK: exit:
				; CHECK-NEXT: ret void
				;
				entry:
				%end.idx = add i64 %size, -1
				%end.ptr = getelementptr inbounds i32, i32* %s, i64 %end.idx
				br label %while.body

				while.body:
				%phi.ptr = phi i32* [ %s, %entry ], [ %next.ptr, %while.body ]
				; Step one i16 element past %phi.ptr to obtain the load address.
				%next = bitcast i32* %phi.ptr to i16*
				%src.ptr = getelementptr i16, i16* %next, i64 1
				%src.ptr2 = bitcast i16* %src.ptr to i32*
				; The underaligned load below overlaps the store to %dst.ptr.
				%val = load i32, i32* %src.ptr2, align 2
				%dst.ptr = getelementptr i32, i32* %phi.ptr, i64 0
				store i32 %val, i32* %dst.ptr, align 4
				%next.ptr = getelementptr i32, i32* %phi.ptr, i64 1
				%cmp = icmp eq i32* %next.ptr, %end.ptr
				br i1 %cmp, label %exit, label %while.body

				exit:
				ret void
				}

				;; Memcpy formation is still preferred over memmove: Src and Dest are marked
				;; noalias, so the copy from Src[i + 42] to Dest[i] is expected to become a
				;; memcpy rather than a memmove (see the CHECK lines).
				define void @prefer_memcpy_over_memmove(i8* noalias %Src, i8* noalias %Dest, i64 %Size) {
				; CHECK-LABEL: @prefer_memcpy_over_memmove(
				yurai007AuthorUnsubmitted Done Reply Inline Actions Fix test to emit memmove. yurai007: Fix test to emit memmove.
				; CHECK-NEXT: bb.nph:
				; CHECK-NEXT: [[SCEVGEP:%.]] = getelementptr i8, i8 [[SRC:%.*]], i64 42
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DEST:%.]], i8 align 1 [[SCEVGEP]], i64 [[SIZE:%.*]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.body:
				; CHECK-NEXT: [[INDVAR:%.]] = phi i64 [ 0, [[BB_NPH:%.]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], 42
				; CHECK-NEXT: [[SRCI:%.]] = getelementptr i8, i8 [[SRC]], i64 [[STEP]]
				; CHECK-NEXT: [[DESTI:%.]] = getelementptr i8, i8 [[DEST]], i64 [[INDVAR]]
				; CHECK-NEXT: [[V:%.]] = load i8, i8 [[SRCI]], align 1
				; CHECK-NEXT: [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
				; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]]
				; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
				; CHECK: for.end:
				; CHECK-NEXT: ret void
				;
				bb.nph:
				br label %for.body

				for.body: ; preds = %bb.nph, %for.body
				%indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
				%Step = add nuw nsw i64 %indvar, 42
				; The load comes from %Src and the store goes to the distinct noalias %Dest.
				%SrcI = getelementptr i8, i8* %Src, i64 %Step
				%DestI = getelementptr i8, i8* %Dest, i64 %indvar
				%V = load i8, i8* %SrcI, align 1
				store i8 %V, i8* %DestI, align 1
				%indvar.next = add i64 %indvar, 1
				%exitcond = icmp eq i64 %indvar.next, %Size
				br i1 %exitcond, label %for.end, label %for.body

				for.end: ; preds = %for.body
				ret void
				}

	; Validate that "memset_pattern" has the proper attributes.			; Validate that "memset_pattern" has the proper attributes.
	; CHECK: declare void @memset_pattern16(i8* nocapture writeonly, i8* nocapture readonly, i64) [[ATTRS:#[0-9]+]]			; CHECK: declare void @memset_pattern16(i8* nocapture writeonly, i8* nocapture readonly, i64) [[ATTRS:#[0-9]+]]
	; CHECK: [[ATTRS]] = { argmemonly nofree }			; CHECK: [[ATTRS]] = { argmemonly nofree }

This is an archive of the discontinued LLVM Phabricator instance.

[LoopIdiom] Transform memmove-like loop into memmove (PR46179)
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 355223

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

llvm/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll

llvm/test/Transforms/LoopIdiom/basic.ll

This is an archive of the discontinued LLVM Phabricator instance.

[LoopIdiom] Transform memmove-like loop into memmove (PR46179)ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 355223

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

llvm/test/Transforms/LoopIdiom/X86/unordered-atomic-memcpy.ll

llvm/test/Transforms/LoopIdiom/basic.ll

[LoopIdiom] Transform memmove-like loop into memmove (PR46179)
ClosedPublic