This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Unify handling of atomic memtransfer with non-atomic memtransfer
ClosedPublic

Authored by dneilson on May 9 2018, 12:13 PM.

Download Raw Diff

Details

Reviewers

apilipenko
skatkov
mkazantsev
anna
reames

Commits

rG8f30ec65b0a6: [InstCombine] Unify handling of atomic memtransfer with non-atomic memtransfer
rL332093: [InstCombine] Unify handling of atomic memtransfer with non-atomic memtransfer

Summary

This change reworks the handling of atomic memcpy within the instcombine pass.
Previously, a constant length atomic memcpy would be lowered into loads & stores
as long as no more than 16 load/store pairs are created. This is quite different
from the lowering done for a non-atomic memcpy, which only ever lowers into a single
load/store pair of no more than 8 bytes. Larger constant-sized memcpy calls are
expanded to load/stores in later passes, such as SelectionDAG lowering.

In this change the behaviour for atomic memcpy is unified with non-atomic memcpy;
atomic memcpy is now treated in the same way as non-atomic memcpy has always been.
We leave it to later passes to lower longer-length atomic memcpy calls.

Due to the structure of the pass's handling of memtransfer intrinsics, this change
also gives us handling of atomic memmove that we did not previously have.

Diff Detail

Repository: rL LLVM

Event Timeline

dneilson created this revision. May 9 2018, 12:13 PM

Harbormaster completed remote builds in B17895: Diff 145982. May 9 2018, 12:14 PM

dneilson planned changes to this revision. May 9 2018, 12:16 PM

dneilson added inline comments.

lib/Transforms/InstCombine/InstCombineCalls.cpp
130 ↗	(On Diff #145982)	This is wrong. The length of an atomic memtransfer is in bytes, so there is no need to multiply by the element size.

Don't multiply length by element size for atomic memtransfer intrinsics.
Add tests to show proper behaviour in this case.

dneilson added a child revision: D46660: [InstCombine] Handle atomic memset in the same way as regular memset. May 9 2018, 1:06 PM

LGTM

This revision is now accepted and ready to land. May 10 2018, 3:00 PM

Closed by commit rL332093: [InstCombine] Unify handling of atomic memtransfer with non-atomic memtransfer (authored by dneilson). · Explain Why · May 11 2018, 7:33 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

Transforms/

InstCombine/

InstCombineCalls.cpp

141 lines

InstCombineInternal.h

3 lines

test/

Transforms/

InstCombine/

element-atomic-memcpy-to-loads.ll

94 lines

element-atomic-memintrins.ll

282 lines

Diff 146324

llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp

Show First 20 Lines • Show All 67 Lines • ▼ Show 20 Lines

using namespace llvm;		using namespace llvm;
using namespace PatternMatch;		using namespace PatternMatch;

#define DEBUG_TYPE "instcombine"		#define DEBUG_TYPE "instcombine"

STATISTIC(NumSimplified, "Number of library calls simplified");		STATISTIC(NumSimplified, "Number of library calls simplified");

static cl::opt<unsigned> UnfoldElementAtomicMemcpyMaxElements(
"unfold-element-atomic-memcpy-max-elements",
cl::init(16),
cl::desc("Maximum number of elements in atomic memcpy the optimizer is "
"allowed to unfold"));

static cl::opt<unsigned> GuardWideningWindow(		static cl::opt<unsigned> GuardWideningWindow(
"instcombine-guard-widening-window",		"instcombine-guard-widening-window",
cl::init(3),		cl::init(3),
cl::desc("How wide an instruction window to bypass looking for "		cl::desc("How wide an instruction window to bypass looking for "
"another guard"));		"another guard"));


/// Return the specified type promoted as it would be to pass though a va_arg		/// Return the specified type promoted as it would be to pass though a va_arg
/// area.		/// area.
static Type getPromotedType(Type Ty) {		static Type getPromotedType(Type Ty) {
if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {		if (IntegerType* ITy = dyn_cast<IntegerType>(Ty)) {
if (ITy->getBitWidth() < 32)		if (ITy->getBitWidth() < 32)
return Type::getInt32Ty(Ty->getContext());		return Type::getInt32Ty(Ty->getContext());
}		}
return Ty;		return Ty;
Show All 11 Lines	for (unsigned I = 0, E = V->getNumElements(); I != E; ++I) {
bool Sign = V->getElementType()->isIntegerTy()		bool Sign = V->getElementType()->isIntegerTy()
? cast<ConstantInt>(Elt)->isNegative()		? cast<ConstantInt>(Elt)->isNegative()
: cast<ConstantFP>(Elt)->isNegative();		: cast<ConstantFP>(Elt)->isNegative();
BoolVec.push_back(ConstantInt::get(BoolTy, Sign));		BoolVec.push_back(ConstantInt::get(BoolTy, Sign));
}		}
return ConstantVector::get(BoolVec);		return ConstantVector::get(BoolVec);
}		}

Instruction *		Instruction InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst MI) {
InstCombiner::SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst *AMI) {
// Try to unfold this intrinsic into sequence of explicit atomic loads and
// stores.
// First check that number of elements is compile time constant.
auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength());
if (!LengthCI)
return nullptr;

// Check that there are not too many elements.
uint64_t LengthInBytes = LengthCI->getZExtValue();
uint32_t ElementSizeInBytes = AMI->getElementSizeInBytes();
uint64_t NumElements = LengthInBytes / ElementSizeInBytes;
if (NumElements >= UnfoldElementAtomicMemcpyMaxElements)
return nullptr;

// Only expand if there are elements to copy.
if (NumElements > 0) {
// Don't unfold into illegal integers
uint64_t ElementSizeInBits = ElementSizeInBytes * 8;
if (!getDataLayout().isLegalInteger(ElementSizeInBits))
return nullptr;

// Cast source and destination to the correct type. Intrinsic input
// arguments are usually represented as i8*. Often operands will be
// explicitly casted to i8* and we can just strip those casts instead of
// inserting new ones. However it's easier to rely on other InstCombine
// rules which will cover trivial cases anyway.
Value *Src = AMI->getRawSource();
Value *Dst = AMI->getRawDest();
Type *ElementPointerType =
Type::getIntNPtrTy(AMI->getContext(), ElementSizeInBits,
Src->getType()->getPointerAddressSpace());

Value *SrcCasted = Builder.CreatePointerCast(Src, ElementPointerType,
"memcpy_unfold.src_casted");
Value *DstCasted = Builder.CreatePointerCast(Dst, ElementPointerType,
"memcpy_unfold.dst_casted");

for (uint64_t i = 0; i < NumElements; ++i) {
// Get current element addresses
ConstantInt *ElementIdxCI =
ConstantInt::get(AMI->getContext(), APInt(64, i));
Value *SrcElementAddr =
Builder.CreateGEP(SrcCasted, ElementIdxCI, "memcpy_unfold.src_addr");
Value *DstElementAddr =
Builder.CreateGEP(DstCasted, ElementIdxCI, "memcpy_unfold.dst_addr");

// Load from the source. Transfer alignment information and mark load as
// unordered atomic.
LoadInst *Load = Builder.CreateLoad(SrcElementAddr, "memcpy_unfold.val");
Load->setOrdering(AtomicOrdering::Unordered);
// We know alignment of the first element. It is also guaranteed by the
// verifier that element size is less or equal than first element
// alignment and both of this values are powers of two. This means that
// all subsequent accesses are at least element size aligned.
// TODO: We can infer better alignment but there is no evidence that this
// will matter.
Load->setAlignment(i == 0 ? AMI->getParamAlignment(1)
: ElementSizeInBytes);
Load->setDebugLoc(AMI->getDebugLoc());

// Store loaded value via unordered atomic store.
StoreInst *Store = Builder.CreateStore(Load, DstElementAddr);
Store->setOrdering(AtomicOrdering::Unordered);
Store->setAlignment(i == 0 ? AMI->getParamAlignment(0)
: ElementSizeInBytes);
Store->setDebugLoc(AMI->getDebugLoc());
}
}

// Set the number of elements of the copy to 0, it will be deleted on the
// next iteration.
AMI->setLength(Constant::getNullValue(LengthCI->getType()));
return AMI;
}

Instruction InstCombiner::SimplifyMemTransfer(MemIntrinsic MI) {
unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);		unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
unsigned CopyDstAlign = MI->getDestAlignment();		unsigned CopyDstAlign = MI->getDestAlignment();
if (CopyDstAlign < DstAlign){		if (CopyDstAlign < DstAlign){
MI->setDestAlignment(DstAlign);		MI->setDestAlignment(DstAlign);
return MI;		return MI;
}		}

auto* MTI = cast<MemTransferInst>(MI);		unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
unsigned SrcAlign = getKnownAlignment(MTI->getRawSource(), DL, MI, &AC, &DT);		unsigned CopySrcAlign = MI->getSourceAlignment();
unsigned CopySrcAlign = MTI->getSourceAlignment();
if (CopySrcAlign < SrcAlign) {		if (CopySrcAlign < SrcAlign) {
MTI->setSourceAlignment(SrcAlign);		MI->setSourceAlignment(SrcAlign);
return MI;		return MI;
}		}

// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with		// If MemCpyInst length is 1/2/4/8 bytes then replace memcpy with
// load/store.		// load/store.
ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getArgOperand(2));		ConstantInt *MemOpLength = dyn_cast<ConstantInt>(MI->getLength());
if (!MemOpLength) return nullptr;		if (!MemOpLength) return nullptr;

// Source and destination pointer types are always "i8*" for intrinsic. See		// Source and destination pointer types are always "i8*" for intrinsic. See
// if the size is something we can handle with a single primitive load/store.		// if the size is something we can handle with a single primitive load/store.
// A single load+store correctly handles overlapping memory in the memmove		// A single load+store correctly handles overlapping memory in the memmove
// case.		// case.
uint64_t Size = MemOpLength->getLimitedValue();		uint64_t Size = MemOpLength->getLimitedValue();
assert(Size && "0-sized memory transferring should be removed already.");		assert(Size && "0-sized memory transferring should be removed already.");
Show All 25 Lines	if (M->getNumOperands() == 3 && M->getOperand(0) &&
mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==		mdconst::extract<ConstantInt>(M->getOperand(1))->getValue() ==
Size &&		Size &&
M->getOperand(2) && isa<MDNode>(M->getOperand(2)))		M->getOperand(2) && isa<MDNode>(M->getOperand(2)))
CopyMD = cast<MDNode>(M->getOperand(2));		CopyMD = cast<MDNode>(M->getOperand(2));
}		}

Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);		Value *Src = Builder.CreateBitCast(MI->getArgOperand(1), NewSrcPtrTy);
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);		Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
LoadInst *L = Builder.CreateLoad(Src, MI->isVolatile());		LoadInst *L = Builder.CreateLoad(Src);
// Alignment from the mem intrinsic will be better, so use it.		// Alignment from the mem intrinsic will be better, so use it.
L->setAlignment(CopySrcAlign);		L->setAlignment(CopySrcAlign);
if (CopyMD)		if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);		L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
MDNode *LoopMemParallelMD =		MDNode *LoopMemParallelMD =
MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);		MI->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
if (LoopMemParallelMD)		if (LoopMemParallelMD)
L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);		L->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);

StoreInst *S = Builder.CreateStore(L, Dest, MI->isVolatile());		StoreInst *S = Builder.CreateStore(L, Dest);
// Alignment from the mem intrinsic will be better, so use it.		// Alignment from the mem intrinsic will be better, so use it.
S->setAlignment(CopyDstAlign);		S->setAlignment(CopyDstAlign);
if (CopyMD)		if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);		S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
if (LoopMemParallelMD)		if (LoopMemParallelMD)
S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);		S->setMetadata(LLVMContext::MD_mem_parallel_loop_access, LoopMemParallelMD);

		if (auto *MT = dyn_cast<MemTransferInst>(MI)) {
		// non-atomics can be volatile
		L->setVolatile(MT->isVolatile());
		S->setVolatile(MT->isVolatile());
		}
		if (isa<AtomicMemTransferInst>(MI)) {
		// atomics have to be unordered
		L->setOrdering(AtomicOrdering::Unordered);
		S->setOrdering(AtomicOrdering::Unordered);
		}

// Set the size of the copy to 0, it will be deleted on the next iteration.		// Set the size of the copy to 0, it will be deleted on the next iteration.
MI->setArgOperand(2, Constant::getNullValue(MemOpLength->getType()));		MI->setLength(Constant::getNullValue(MemOpLength->getType()));
return MI;		return MI;
}		}

Instruction InstCombiner::SimplifyMemSet(MemSetInst MI) {		Instruction InstCombiner::SimplifyMemSet(MemSetInst MI) {
unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);		unsigned Alignment = getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
if (MI->getDestAlignment() < Alignment) {		if (MI->getDestAlignment() < Alignment) {
MI->setDestAlignment(Alignment);		MI->setDestAlignment(Alignment);
return MI;		return MI;
▲ Show 20 Lines • Show All 1,495 Lines • ▼ Show 20 Lines	if (CI.getFunction()->doesNotThrow() && !CI.doesNotThrow()) {
return &CI;		return &CI;
}		}

IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);		IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
if (!II) return visitCallSite(&CI);		if (!II) return visitCallSite(&CI);

// Intrinsics cannot occur in an invoke, so handle them here instead of in		// Intrinsics cannot occur in an invoke, so handle them here instead of in
// visitCallSite.		// visitCallSite.
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {		if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
bool Changed = false;		bool Changed = false;

// memmove/cpy/set of zero bytes is a noop.		// memmove/cpy/set of zero bytes is a noop.
if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {		if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
if (NumBytes->isNullValue())		if (NumBytes->isNullValue())
return eraseInstFromFunction(CI);		return eraseInstFromFunction(CI);

if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))		if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
if (CI->getZExtValue() == 1) {		if (CI->getZExtValue() == 1) {
// Replace the instruction with just byte operations. We would		// Replace the instruction with just byte operations. We would
// transform other cases to loads/stores, but we don't know if		// transform other cases to loads/stores, but we don't know if
// alignment is sufficient.		// alignment is sufficient.
}		}
}		}

// No other transformations apply to volatile transfers.		// No other transformations apply to volatile transfers.
if (MI->isVolatile())		if (auto *M = dyn_cast<MemIntrinsic>(MI))
		if (M->isVolatile())
return nullptr;		return nullptr;

// If we have a memmove and the source operation is a constant global,		// If we have a memmove and the source operation is a constant global,
// then the source and dest pointers can't alias, so we can change this		// then the source and dest pointers can't alias, so we can change this
// into a call to memcpy.		// into a call to memcpy.
if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {		if (auto *MMI = dyn_cast<AnyMemMoveInst>(MI)) {
if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))		if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
if (GVSrc->isConstant()) {		if (GVSrc->isConstant()) {
Module *M = CI.getModule();		Module *M = CI.getModule();
Intrinsic::ID MemCpyID = Intrinsic::memcpy;		Intrinsic::ID MemCpyID =
		isa<AtomicMemMoveInst>(MMI)
		? Intrinsic::memcpy_element_unordered_atomic
		: Intrinsic::memcpy;
Type *Tys[3] = { CI.getArgOperand(0)->getType(),		Type *Tys[3] = { CI.getArgOperand(0)->getType(),
CI.getArgOperand(1)->getType(),		CI.getArgOperand(1)->getType(),
CI.getArgOperand(2)->getType() };		CI.getArgOperand(2)->getType() };
CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));		CI.setCalledFunction(Intrinsic::getDeclaration(M, MemCpyID, Tys));
Changed = true;		Changed = true;
}		}
}		}

if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {		if (AnyMemTransferInst *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
// memmove(x,x,size) -> noop.		// memmove(x,x,size) -> noop.
if (MTI->getSource() == MTI->getDest())		if (MTI->getSource() == MTI->getDest())
return eraseInstFromFunction(CI);		return eraseInstFromFunction(CI);
}		}

// If we can determine a pointer alignment that is bigger than currently		// If we can determine a pointer alignment that is bigger than currently
// set, update the alignment.		// set, update the alignment.
if (isa<MemTransferInst>(MI)) {		if (auto *MTI = dyn_cast<AnyMemTransferInst>(MI)) {
if (Instruction *I = SimplifyMemTransfer(MI))		if (Instruction *I = SimplifyAnyMemTransfer(MTI))
return I;		return I;
} else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {		} else if (MemSetInst *MSI = dyn_cast<MemSetInst>(MI)) {
if (Instruction *I = SimplifyMemSet(MSI))		if (Instruction *I = SimplifyMemSet(MSI))
return I;		return I;
}		}

if (Changed) return II;		if (Changed) return II;
}		}

if (auto *AMI = dyn_cast<AtomicMemCpyInst>(II)) {
if (Constant *C = dyn_cast<Constant>(AMI->getLength()))
if (C->isNullValue())
return eraseInstFromFunction(*AMI);

if (Instruction *I = SimplifyElementUnorderedAtomicMemCpy(AMI))
return I;
}

if (Instruction I = SimplifyNVVMIntrinsic(II, this))		if (Instruction I = SimplifyNVVMIntrinsic(II, this))
return I;		return I;

auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,		auto SimplifyDemandedVectorEltsLow = [this](Value *Op, unsigned Width,
unsigned DemandedWidth) {		unsigned DemandedWidth) {
APInt UndefElts(Width, 0);		APInt UndefElts(Width, 0);
APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);		APInt DemandedElts = APInt::getLowBitsSet(Width, DemandedWidth);
return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);		return SimplifyDemandedVectorElts(Op, DemandedElts, UndefElts);
▲ Show 20 Lines • Show All 2,520 Lines • Show Last 20 Lines

llvm/trunk/lib/Transforms/InstCombine/InstCombineInternal.h

Show First 20 Lines • Show All 818 Lines • ▼ Show 20 Lines	Instruction OptAndOp(BinaryOperator Op, ConstantInt *OpRHS,
ConstantInt *AndRHS, BinaryOperator &TheAnd);		ConstantInt *AndRHS, BinaryOperator &TheAnd);

Value insertRangeTest(Value V, const APInt &Lo, const APInt &Hi,		Value insertRangeTest(Value V, const APInt &Lo, const APInt &Hi,
bool isSigned, bool Inside);		bool isSigned, bool Inside);
Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);		Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocaInst &AI);
Instruction *MatchBSwap(BinaryOperator &I);		Instruction *MatchBSwap(BinaryOperator &I);
bool SimplifyStoreAtEndOfBlock(StoreInst &SI);		bool SimplifyStoreAtEndOfBlock(StoreInst &SI);

Instruction SimplifyElementUnorderedAtomicMemCpy(AtomicMemCpyInst AMI);		Instruction SimplifyAnyMemTransfer(AnyMemTransferInst MI);
Instruction SimplifyMemTransfer(MemIntrinsic MI);
Instruction SimplifyMemSet(MemSetInst MI);		Instruction SimplifyMemSet(MemSetInst MI);

Value EvaluateInDifferentType(Value V, Type *Ty, bool isSigned);		Value EvaluateInDifferentType(Value V, Type *Ty, bool isSigned);

/// Returns a value X such that Val = X * Scale, or null if none.		/// Returns a value X such that Val = X * Scale, or null if none.
///		///
/// If the multiplication is known not to overflow then NoSignedWrap is set.		/// If the multiplication is known not to overflow then NoSignedWrap is set.
Value Descale(Value Val, APInt Scale, bool &NoSignedWrap);		Value Descale(Value Val, APInt Scale, bool &NoSignedWrap);
};		};

} // end namespace llvm		} // end namespace llvm

#undef DEBUG_TYPE		#undef DEBUG_TYPE

#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H		#endif // LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H

llvm/trunk/test/Transforms/InstCombine/element-atomic-memcpy-to-loads.ll

	; RUN: opt -instcombine -unfold-element-atomic-memcpy-max-elements=8 -S < %s \| FileCheck %s
	; Temporarily an expected failure until inst combine is updated in the next patch
	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"

	; Test basic unfolding -- unordered load & store
	define void @test1a(i8* %Src, i8* %Dst) {
	; CHECK-LABEL: test1a
	; CHECK-NOT: llvm.memcpy.element.unordered.atomic

	; CHECK-DAG: %memcpy_unfold.src_casted = bitcast i8* %Src to i32*
	; CHECK-DAG: %memcpy_unfold.dst_casted = bitcast i8* %Dst to i32*

	; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i32, i32* %memcpy_unfold.src_casted unordered, align 4
	; CHECK-DAG: store atomic i32 [[VAL1]], i32* %memcpy_unfold.dst_casted unordered, align 8

	; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
	; CHECK-DAG: store atomic i32 [[VAL2]], i32* %{{[^\s]+}} unordered, align 4

	; CHECK-DAG: [[VAL3:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
	; CHECK-DAG: store atomic i32 [[VAL3]], i32* %{{[^\s]+}} unordered, align 4

	; CHECK-DAG: [[VAL4:%[^\s]+]] = load atomic i32, i32* %{{[^\s]+}} unordered, align 4
	; CHECK-DAG: store atomic i32 [[VAL4]], i32* %{{[^\s]+}} unordered, align 4
	entry:
	call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 16, i32 4)
	ret void
	}

	; Test that we don't unfold too much
	define void @test2(i8* %Src, i8* %Dst) {
	; CHECK-LABEL: test2

	; CHECK-NOT: load
	; CHECK-NOT: store
	; CHECK: llvm.memcpy.element.unordered.atomic
	entry:
	call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %Dst, i8* align 4 %Src, i32 256, i32 4)
	ret void
	}

	; Test that we will not unfold into non native integers
	define void @test3(i8* %Src, i8* %Dst) {
	; CHECK-LABEL: test3

	; CHECK-NOT: load
	; CHECK-NOT: store
	; CHECK: llvm.memcpy.element.unordered.atomic
	entry:
	call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 64, i32 64)
	ret void
	}

	; Test that we will eliminate redundant bitcasts
	define void @test4(i64* %Src, i64* %Dst) {
	; CHECK-LABEL: test4
	; CHECK-NOT: llvm.memcpy.element.unordered.atomic

	; CHECK-NOT: bitcast

	; CHECK-DAG: [[VAL1:%[^\s]+]] = load atomic i64, i64* %Src unordered, align 16
	; CHECK-DAG: store atomic i64 [[VAL1]], i64* %Dst unordered, align 16

	; CHECK-DAG: [[SRC_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Src, i64 1
	; CHECK-DAG: [[DST_ADDR2:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 1
	; CHECK-DAG: [[VAL2:%[^\s]+]] = load atomic i64, i64* [[SRC_ADDR2]] unordered, align 8
	; CHECK-DAG: store atomic i64 [[VAL2]], i64* [[DST_ADDR2]] unordered, align 8

	; CHECK-DAG: [[SRC_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Src, i64 2
	; CHECK-DAG: [[DST_ADDR3:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 2
	; CHECK-DAG: [[VAL3:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR3]] unordered, align 8
	; CHECK-DAG: store atomic i64 [[VAL3]], i64* [[DST_ADDR3]] unordered, align 8

	; CHECK-DAG: [[SRC_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Src, i64 3
	; CHECK-DAG: [[DST_ADDR4:%[^ ]+]] = getelementptr i64, i64* %Dst, i64 3
	; CHECK-DAG: [[VAL4:%[^ ]+]] = load atomic i64, i64* [[SRC_ADDR4]] unordered, align 8
	; CHECK-DAG: store atomic i64 [[VAL4]], i64* [[DST_ADDR4]] unordered, align 8
	entry:
	%Src.casted = bitcast i64* %Src to i8*
	%Dst.casted = bitcast i64* %Dst to i8*
	call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %Dst.casted, i8* align 16 %Src.casted, i32 32, i32 8)
	ret void
	}

	; Test that 0-length unordered atomic memcpy gets removed.
	define void @test5(i8* %Src, i8* %Dst) {
	; CHECK-LABEL: test5

	; CHECK-NOT: llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
	entry:
	call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 64 %Dst, i8* align 64 %Src, i32 0, i32 8)
	ret void
	}

	declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32) nounwind

llvm/trunk/test/Transforms/InstCombine/element-atomic-memintrins.ll

	;; Placeholder tests that will fail once element atomic @llvm.mem[move\|set] instrinsics have			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	;; been added to the MemIntrinsic class hierarchy. These will act as a reminder to
	;; verify that inst combine handles these intrinsics properly once they have been
	;; added to that class hierarchy.

	; RUN: opt -instcombine -S < %s \| FileCheck %s			; RUN: opt -instcombine -S < %s \| FileCheck %s

	;; ---- memset -----			;; ---- memset -----

	; Ensure 0-length memset isn't removed			; Ensure 0-length memset is removed
	define void @test_memset_zero_length(i8* %dest) {			define void @test_memset_zero_length(i8* %dest) {
	; CHECK-LABEL: test_memset_zero_length			; CHECK-LABEL: @test_memset_zero_length(
	; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
				;
	call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)			call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 0, i32 1)
	ret void			ret void
	}			}

	; Ensure that small-sized memsets don't convert to stores			; Placeholder test. This will chance once support for lowering atomic memsets is added to instcombine.
	define void @test_memset_to_store(i8* %dest) {			define void @test_memset_to_store(i8* %dest) {
	; CHECK-LABEL: test_memset_to_store			; CHECK-LABEL: @test_memset_to_store(
	; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)			; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST:%.*]], i8 1, i32 1, i32 1)
	; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)			; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 2, i32 1)
	; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)			; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 4, i32 1)
	; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)			; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 8, i32 1)
				; CHECK-NEXT: call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 [[DEST]], i8 1, i32 16, i32 1)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
				;
	call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)			call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 1, i32 1)
	call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)			call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 2, i32 1)
	call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)			call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 4, i32 1)
	call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)			call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 8, i32 1)
				call void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* align 1 %dest, i8 1, i32 16, i32 1)
	ret void			ret void
	}			}

	declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly			declare void @llvm.memset.element.unordered.atomic.p0i8.i32(i8* nocapture writeonly, i8, i32, i32) nounwind argmemonly


	;; =========================================			;; =========================================
	;; ----- memmove ------			;; ----- memmove ------

	; memmove from a global constant source does not become memcpy
	@gconst = constant [8 x i8] c"0123456\00"			@gconst = constant [32 x i8] c"0123456789012345678901234567890\00"
				; Check that a memmove from a global constant is converted into a memcpy
	define void @test_memmove_to_memcpy(i8* %dest) {			define void @test_memmove_to_memcpy(i8* %dest) {
	; CHECK-LABEL: test_memmove_to_memcpy			; CHECK-LABEL: @test_memmove_to_memcpy(
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)			; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST:%.]], i8 align 16 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([8 x i8], [8 x i8]* @gconst, i64 0, i64 0), i32 8, i32 1)			;
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 getelementptr inbounds ([32 x i8], [32 x i8]* @gconst, i64 0, i64 0), i32 32, i32 1)
	ret void			ret void
	}			}

	define void @test_memmove_zero_length(i8* %dest, i8* %src) {			define void @test_memmove_zero_length(i8* %dest, i8* %src) {
	; CHECK-LABEL: test_memmove_zero_length			; CHECK-LABEL: @test_memmove_zero_length(
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
				;
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
	ret void			ret void
	}			}

	; memmove with src==dest is removed			; memmove with src==dest is removed
	define void @test_memmove_removed(i8* %srcdest, i32 %sz) {			define void @test_memmove_removed(i8* %srcdest, i32 %sz) {
	; CHECK-LABEL: test_memmove_removed			; CHECK-LABEL: @test_memmove_removed(
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
				;
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
	ret void			ret void
	}			}

	; memmove with a small constant length is converted to a load/store pair			; memmove with a small constant length is converted to a load/store pair
	define void @test_memmove_loadstore(i8* %dest, i8* %src) {			define void @test_memmove_loadstore(i8* %dest, i8* %src) {
	; CHECK-LABEL: test_memmove_loadstore			; CHECK-LABEL: @test_memmove_loadstore(
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)			; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)			; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)			; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
	; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)			; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
				; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
				; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
				; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
				; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
				; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
				; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
				; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
				; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
	; CHECK-NEXT: ret void			; CHECK-NEXT: ret void
				;
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
	call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)			call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
				ret void
				}

				define void @test_memmove_loadstore_2(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memmove_loadstore_2(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
				; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
				; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
				; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
				; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
				; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
				; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
				ret void
				}

				define void @test_memmove_loadstore_4(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memmove_loadstore_4(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
				; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
				; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
				; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
				ret void
				}

				define void @test_memmove_loadstore_8(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memmove_loadstore_8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
				; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
				; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
				ret void
				}

				define void @test_memmove_loadstore_16(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memmove_loadstore_16(
				; CHECK-NEXT: call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
	ret void			ret void
	}			}

	declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly			declare void @llvm.memmove.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly

				;; =========================================
				;; ----- memcpy ------

				define void @test_memcpy_zero_length(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_zero_length(
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 0, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 0, i32 2)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 0, i32 4)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 0, i32 8)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 0, i32 16)
				ret void
				}

				; memcpy with src==dest is removed
				define void @test_memcpy_removed(i8* %srcdest, i32 %sz) {
				; CHECK-LABEL: @test_memcpy_removed(
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %srcdest, i8* align 1 %srcdest, i32 %sz, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %srcdest, i8* align 2 %srcdest, i32 %sz, i32 2)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %srcdest, i8* align 4 %srcdest, i32 %sz, i32 4)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %srcdest, i8* align 8 %srcdest, i32 %sz, i32 8)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %srcdest, i8* align 16 %srcdest, i32 %sz, i32 16)
				ret void
				}

				; memcpy with a small constant length is converted to a load/store pair
				define void @test_memcpy_loadstore(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_loadstore(
				; CHECK-NEXT: [[TMP1:%.*]] = load atomic i8, i8* [[SRC:%.*]] unordered, align 1
				; CHECK-NEXT: store atomic i8 [[TMP1]], i8* [[DEST:%.*]] unordered, align 1
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[SRC]] to i16*
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast i8* [[DEST]] to i16*
				; CHECK-NEXT: [[TMP4:%.*]] = load atomic i16, i16* [[TMP2]] unordered, align 1
				; CHECK-NEXT: store atomic i16 [[TMP4]], i16* [[TMP3]] unordered, align 1
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[SRC]] to i32*
				; CHECK-NEXT: [[TMP6:%.*]] = bitcast i8* [[DEST]] to i32*
				; CHECK-NEXT: [[TMP7:%.*]] = load atomic i32, i32* [[TMP5]] unordered, align 1
				; CHECK-NEXT: store atomic i32 [[TMP7]], i32* [[TMP6]] unordered, align 1
				; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP9:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP10:%.*]] = load atomic i64, i64* [[TMP8]] unordered, align 1
				; CHECK-NEXT: store atomic i64 [[TMP10]], i64* [[TMP9]] unordered, align 1
				; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 [[DEST]], i8* align 1 [[SRC]], i32 16, i32 1)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 1, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 2, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 4, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 8, i32 1)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 1 %dest, i8* align 1 %src, i32 16, i32 1)
				ret void
				}

				define void @test_memcpy_loadstore_2(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_loadstore_2(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i16*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i16*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i16, i16* [[TMP1]] unordered, align 2
				; CHECK-NEXT: store atomic i16 [[TMP3]], i16* [[TMP2]] unordered, align 2
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i32*
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i32*
				; CHECK-NEXT: [[TMP6:%.*]] = load atomic i32, i32* [[TMP4]] unordered, align 2
				; CHECK-NEXT: store atomic i32 [[TMP6]], i32* [[TMP5]] unordered, align 2
				; CHECK-NEXT: [[TMP7:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP8:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP9:%.*]] = load atomic i64, i64* [[TMP7]] unordered, align 2
				; CHECK-NEXT: store atomic i64 [[TMP9]], i64* [[TMP8]] unordered, align 2
				; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 [[DEST]], i8* align 2 [[SRC]], i32 16, i32 2)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 2, i32 2)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 4, i32 2)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 8, i32 2)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 2 %dest, i8* align 2 %src, i32 16, i32 2)
				ret void
				}

				define void @test_memcpy_loadstore_4(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_loadstore_4(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i32*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i32*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i32, i32* [[TMP1]] unordered, align 4
				; CHECK-NEXT: store atomic i32 [[TMP3]], i32* [[TMP2]] unordered, align 4
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast i8* [[SRC]] to i64*
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast i8* [[DEST]] to i64*
				; CHECK-NEXT: [[TMP6:%.*]] = load atomic i64, i64* [[TMP4]] unordered, align 4
				; CHECK-NEXT: store atomic i64 [[TMP6]], i64* [[TMP5]] unordered, align 4
				; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 [[DEST]], i8* align 4 [[SRC]], i32 16, i32 4)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 4, i32 4)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 8, i32 4)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 4 %dest, i8* align 4 %src, i32 16, i32 4)
				ret void
				}

				define void @test_memcpy_loadstore_8(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_loadstore_8(
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[SRC:%.*]] to i64*
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast i8* [[DEST:%.*]] to i64*
				; CHECK-NEXT: [[TMP3:%.*]] = load atomic i64, i64* [[TMP1]] unordered, align 8
				; CHECK-NEXT: store atomic i64 [[TMP3]], i64* [[TMP2]] unordered, align 8
				; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 [[DEST]], i8* align 8 [[SRC]], i32 16, i32 8)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 8, i32 8)
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 8 %dest, i8* align 8 %src, i32 16, i32 8)
				ret void
				}

				define void @test_memcpy_loadstore_16(i8* %dest, i8* %src) {
				; CHECK-LABEL: @test_memcpy_loadstore_16(
				; CHECK-NEXT: call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 [[DEST:%.*]], i8* align 16 [[SRC:%.*]], i32 16, i32 16)
				; CHECK-NEXT: ret void
				;
				call void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* align 16 %dest, i8* align 16 %src, i32 16, i32 16)
				ret void
				}

				declare void @llvm.memcpy.element.unordered.atomic.p0i8.p0i8.i32(i8* nocapture writeonly, i8* nocapture readonly, i32, i32) nounwind argmemonly