This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
include/llvm/IR/
-
llvm/
-
IR/
-
IRBuilder.h
-
IntrinsicInst.h
-
lib/
-
Analysis/
1
Lint.cpp
-
CodeGen/SelectionDAG/
-
SelectionDAG/
-
SelectionDAGBuilder.cpp
-
IR/
-
IRBuilder.cpp
-
Transforms/Scalar/
-
Scalar/
-
InferAddressSpaces.cpp
-
test/Transforms/InferAddressSpaces/AMDGPU/
-
Transforms/
-
InferAddressSpaces/
-
AMDGPU/
-
mem-intrinsics.ll

Differential D86066

IR: Merge MemCpyInlineInst and MemCpyInst
AbandonedPublic

Authored by arsenm on Aug 17 2020, 5:42 AM.

Download Raw Diff

Details

Reviewers

gchatelet
jdoerfert
efriedma

Summary

llvm.memcpy.inline does not deviate from llvm.memcpy in any way that's
meaningful to the IR. This avoids writing more code in a future
commit. Not sure why lint handled these slightly differently.

Diff Detail

Event Timeline

arsenm created this revision.Aug 17 2020, 5:42 AM

Herald added a project: Restricted Project. · View Herald TranscriptAug 17 2020, 5:42 AM

Herald added a subscriber: hiraditya. · View Herald Transcript

arsenm requested review of this revision.Aug 17 2020, 5:42 AM

Herald added a subscriber: wdng. · View Herald TranscriptAug 17 2020, 5:42 AM

arsenm added a child revision: D86067: GlobalISel: Add opcode for llvm.memcpy.inline.Aug 17 2020, 5:47 AM

arsenm updated this revision to Diff 285998.Aug 17 2020, 6:15 AM

Herald added subscribers: kerbowa, nhaehnle, jvesely. · View Herald TranscriptAug 17 2020, 6:15 AM

Still requires updating more places that create new memcpys from existing ones to clean up this mess

In D86066#2221190, @arsenm wrote:

Still requires updating more places that create new memcpys from existing ones to clean up this mess

Originally, llvm.memcpy.inline was created as a separate IR instruction to allow its semantics to diverge if needed.
Now, in light of D79279, it seems that we're heading towards many subtly different IR memory functions; this comes with maintenance cost and complexity.

Maybe we should keep only one that has all the expressive power and lower to it.

I'm summing up the envisioned properties for memcpy here (mostly the ones from D79279)

Source and destination pointers:
- must be of any trivially copyable types
- may come with alignment guarantees
- may be tagged as volatile or provide atomic access semantics
- may be from different pointer spaces? <- not sure about this one (edit: @arsenm I'm watching your talk about address spaces, it seems some of the requirements can be encoded with this notion)
A constant force_inline argument can instruct the compiler to generate the whole content inline; in that case, size must be a ConstantInt. If this argument is false, the compiler may choose to delegate the implementation to an external function that has the same semantics (libc or user provided).

The same exercise would be needed for memset and memcmp.

@jfb what do you think? I know your patch is almost ready so I'd rather have this discussion before it's submitted.
Maybe this conversation should take place on the dev list even?

Having a bunch of boolean modifiers floating around on memcpy sounds miserable for code to deal with; it makes it hard for code to just ignore the "complicated" memcpy operations if it doesn't want to reason about them. (It's already messy just dealing with volatile operations.)

llvm/lib/Analysis/Lint.cpp
343	IIRC the reason it was implemented this way is that the length passed to memcpy_inline is marked immarg, so we can be more aggressive. I guess the paths can be merged, though.

MemCpyInlineInst is now a subclass of MemCpyInst, so somebody else got to this later.

Herald added a project: Restricted Project. · View Herald TranscriptSep 1 2023, 5:48 AM

Revision Contents

Path

Size

llvm/

include/

llvm/

IR/

IRBuilder.h

8 lines

IntrinsicInst.h

18 lines

lib/

Analysis/

Lint.cpp

20 lines

CodeGen/

SelectionDAG/

SelectionDAGBuilder.cpp

2 lines

IR/

IRBuilder.cpp

31 lines

Transforms/

Scalar/

InferAddressSpaces.cpp

3 lines

test/

Transforms/

InferAddressSpaces/

AMDGPU/

mem-intrinsics.ll

17 lines

Diff 285998

llvm/include/llvm/IR/IRBuilder.h

Show First 20 Lines • Show All 572 Lines • ▼ Show 20 Lines	return CreateMemCpy(Dst, DstAlign, Src, SrcAlign, getInt64(Size),
NoAliasTag);		NoAliasTag);
}		}

CallInst CreateMemCpy(Value Dst, MaybeAlign DstAlign, Value *Src,		CallInst CreateMemCpy(Value Dst, MaybeAlign DstAlign, Value *Src,
MaybeAlign SrcAlign, Value *Size,		MaybeAlign SrcAlign, Value *Size,
bool isVolatile = false, MDNode *TBAATag = nullptr,		bool isVolatile = false, MDNode *TBAATag = nullptr,
MDNode *TBAAStructTag = nullptr,		MDNode *TBAAStructTag = nullptr,
MDNode *ScopeTag = nullptr,		MDNode *ScopeTag = nullptr,
MDNode *NoAliasTag = nullptr);		MDNode *NoAliasTag = nullptr,
		bool Inline = false);

CallInst CreateMemCpyInline(Value Dst, MaybeAlign DstAlign, Value *Src,		CallInst CreateMemCpyInline(Value Dst, MaybeAlign DstAlign, Value *Src,
MaybeAlign SrcAlign, Value *Size);		MaybeAlign SrcAlign, Value *Size) {
		return CreateMemCpy(Dst, DstAlign, Src, SrcAlign, Size, false,
		nullptr, nullptr, nullptr, nullptr, true);
		}

/// Create and insert an element unordered-atomic memcpy between the		/// Create and insert an element unordered-atomic memcpy between the
/// specified pointers.		/// specified pointers.
///		///
/// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, respectively.		/// DstAlign/SrcAlign are the alignments of the Dst/Src pointers, respectively.
///		///
/// If the pointers aren't i8*, they will be converted. If a TBAA tag is		/// If the pointers aren't i8*, they will be converted. If a TBAA tag is
/// specified, it will be added to the instruction. Likewise with alias.scope		/// specified, it will be added to the instruction. Likewise with alias.scope
▲ Show 20 Lines • Show All 2,047 Lines • Show Last 20 Lines

llvm/include/llvm/IR/IntrinsicInst.h

Show First 20 Lines • Show All 672 Lines • ▼ Show 20 Lines	public:
}		}
};		};

/// This class wraps the llvm.memcpy intrinsic.		/// This class wraps the llvm.memcpy intrinsic.
class MemCpyInst : public MemTransferInst {		class MemCpyInst : public MemTransferInst {
public:		public:
// Methods for support type inquiry through isa, cast, and dyn_cast:		// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {		static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy;		auto IID = I->getIntrinsicID();
		return IID == Intrinsic::memcpy \|\| IID == Intrinsic::memcpy_inline;
}		}
static bool classof(const Value *V) {		static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));		return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}		}
};		};

/// This class wraps the llvm.memmove intrinsic.		/// This class wraps the llvm.memmove intrinsic.
class MemMoveInst : public MemTransferInst {		class MemMoveInst : public MemTransferInst {
public:		public:
// Methods for support type inquiry through isa, cast, and dyn_cast:		// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {		static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memmove;		return I->getIntrinsicID() == Intrinsic::memmove;
}		}
static bool classof(const Value *V) {		static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));		return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}		}
};		};

/// This class wraps the llvm.memcpy.inline intrinsic.
class MemCpyInlineInst : public MemTransferInst {
public:
ConstantInt *getLength() const {
return cast<ConstantInt>(MemTransferInst::getLength());
}
// Methods for support type inquiry through isa, cast, and dyn_cast:
static bool classof(const IntrinsicInst *I) {
return I->getIntrinsicID() == Intrinsic::memcpy_inline;
}
static bool classof(const Value *V) {
return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
}
};

// The common base class for any memset/memmove/memcpy intrinsics;		// The common base class for any memset/memmove/memcpy intrinsics;
// whether they be atomic or non-atomic.		// whether they be atomic or non-atomic.
// i.e. llvm.element.unordered.atomic.memset/memcpy/memmove		// i.e. llvm.element.unordered.atomic.memset/memcpy/memmove
// and llvm.memset/memcpy/memmove		// and llvm.memset/memcpy/memmove
class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {		class AnyMemIntrinsic : public MemIntrinsicBase<AnyMemIntrinsic> {
public:		public:
bool isVolatile() const {		bool isVolatile() const {
// Only the non-atomic intrinsics can be volatile		// Only the non-atomic intrinsics can be volatile
▲ Show 20 Lines • Show All 220 Lines • Show Last 20 Lines

llvm/lib/Analysis/Lint.cpp

Show First 20 Lines • Show All 310 Lines • ▼ Show 20 Lines	void Lint::visitCallBase(CallBase &I) {


if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))		if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I))
switch (II->getIntrinsicID()) {		switch (II->getIntrinsicID()) {
default: break;		default: break;

// TODO: Check more intrinsics		// TODO: Check more intrinsics

case Intrinsic::memcpy: {		case Intrinsic::memcpy:
		case Intrinsic::memcpy_inline: {
MemCpyInst *MCI = cast<MemCpyInst>(&I);		MemCpyInst *MCI = cast<MemCpyInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MCI->getDest(), MemoryLocation::UnknownSize,		visitMemoryReference(I, MCI->getDest(), MemoryLocation::UnknownSize,
MCI->getDestAlign(), nullptr, MemRef::Write);		MCI->getDestAlign(), nullptr, MemRef::Write);
visitMemoryReference(I, MCI->getSource(), MemoryLocation::UnknownSize,		visitMemoryReference(I, MCI->getSource(), MemoryLocation::UnknownSize,
MCI->getSourceAlign(), nullptr, MemRef::Read);		MCI->getSourceAlign(), nullptr, MemRef::Read);

// Check that the memcpy arguments don't overlap. The AliasAnalysis API		// Check that the memcpy arguments don't overlap. The AliasAnalysis API
// isn't expressive enough for what we really want to do. Known partial		// isn't expressive enough for what we really want to do. Known partial
// overlap is not distinguished from the case where nothing is known.		// overlap is not distinguished from the case where nothing is known.
auto Size = LocationSize::unknown();		auto Size = LocationSize::unknown();
if (const ConstantInt *Len =		if (const ConstantInt *Len =
dyn_cast<ConstantInt>(findValue(MCI->getLength(),		dyn_cast<ConstantInt>(findValue(MCI->getLength(),
/OffsetOk=/false)))		/OffsetOk=/false)))
if (Len->getValue().isIntN(32))		if (Len->getValue().isIntN(32))
Size = LocationSize::precise(Len->getValue().getZExtValue());		Size = LocationSize::precise(Len->getValue().getZExtValue());
Assert(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) !=		Assert(AA->alias(MCI->getSource(), Size, MCI->getDest(), Size) !=
MustAlias,		MustAlias,
"Undefined behavior: memcpy source and destination overlap", &I);		"Undefined behavior: memcpy source and destination overlap", &I);
break;		break;
}		}
case Intrinsic::memcpy_inline: {
MemCpyInlineInst *MCII = cast<MemCpyInlineInst>(&I);
const uint64_t Size = MCII->getLength()->getValue().getLimitedValue();
efriedmaUnsubmitted Not Done Reply Inline Actions IIRC the reason it was implemented this way is that the length passed to memcpy_inline is marked immarg, so we can be more aggressive. I guess the paths can be merged, though. efriedma: IIRC the reason it was implemented this way is that the length passed to memcpy_inline is…
visitMemoryReference(I, MCII->getDest(), Size, MCII->getDestAlign(),
nullptr, MemRef::Write);
visitMemoryReference(I, MCII->getSource(), Size, MCII->getSourceAlign(),
nullptr, MemRef::Read);

// Check that the memcpy arguments don't overlap. The AliasAnalysis API
// isn't expressive enough for what we really want to do. Known partial
// overlap is not distinguished from the case where nothing is known.
const LocationSize LS = LocationSize::precise(Size);
Assert(AA->alias(MCII->getSource(), LS, MCII->getDest(), LS) != MustAlias,
"Undefined behavior: memcpy source and destination overlap", &I);
break;
}
case Intrinsic::memmove: {		case Intrinsic::memmove: {
MemMoveInst *MMI = cast<MemMoveInst>(&I);		MemMoveInst *MMI = cast<MemMoveInst>(&I);
// TODO: If the size is known, use it.		// TODO: If the size is known, use it.
visitMemoryReference(I, MMI->getDest(), MemoryLocation::UnknownSize,		visitMemoryReference(I, MMI->getDest(), MemoryLocation::UnknownSize,
MMI->getDestAlign(), nullptr, MemRef::Write);		MMI->getDestAlign(), nullptr, MemRef::Write);
visitMemoryReference(I, MMI->getSource(), MemoryLocation::UnknownSize,		visitMemoryReference(I, MMI->getSource(), MemoryLocation::UnknownSize,
MMI->getSourceAlign(), nullptr, MemRef::Read);		MMI->getSourceAlign(), nullptr, MemRef::Read);
break;		break;
▲ Show 20 Lines • Show All 394 Lines • Show Last 20 Lines

llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,710 Lines • ▼ Show 20 Lines	case Intrinsic::memcpy: {
SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol,		SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
/* AlwaysInline */ false, isTC,		/* AlwaysInline */ false, isTC,
MachinePointerInfo(I.getArgOperand(0)),		MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));		MachinePointerInfo(I.getArgOperand(1)));
updateDAGForMaybeTailCall(MC);		updateDAGForMaybeTailCall(MC);
return;		return;
}		}
case Intrinsic::memcpy_inline: {		case Intrinsic::memcpy_inline: {
const auto &MCI = cast<MemCpyInlineInst>(I);		const auto &MCI = cast<MemCpyInst>(I);
SDValue Dst = getValue(I.getArgOperand(0));		SDValue Dst = getValue(I.getArgOperand(0));
SDValue Src = getValue(I.getArgOperand(1));		SDValue Src = getValue(I.getArgOperand(1));
SDValue Size = getValue(I.getArgOperand(2));		SDValue Size = getValue(I.getArgOperand(2));
assert(isa<ConstantSDNode>(Size) && "memcpy_inline needs constant size");		assert(isa<ConstantSDNode>(Size) && "memcpy_inline needs constant size");
// @llvm.memcpy.inline defines 0 and 1 to both mean no alignment.		// @llvm.memcpy.inline defines 0 and 1 to both mean no alignment.
Align DstAlign = MCI.getDestAlign().valueOrOne();		Align DstAlign = MCI.getDestAlign().valueOrOne();
Align SrcAlign = MCI.getSourceAlign().valueOrOne();		Align SrcAlign = MCI.getSourceAlign().valueOrOne();
Align Alignment = commonAlignment(DstAlign, SrcAlign);		Align Alignment = commonAlignment(DstAlign, SrcAlign);
▲ Show 20 Lines • Show All 5,006 Lines • Show Last 20 Lines

llvm/lib/IR/IRBuilder.cpp

Show First 20 Lines • Show All 133 Lines • ▼ Show 20 Lines	CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(

return CI;		return CI;
}		}

CallInst IRBuilderBase::CreateMemCpy(Value Dst, MaybeAlign DstAlign,		CallInst IRBuilderBase::CreateMemCpy(Value Dst, MaybeAlign DstAlign,
Value *Src, MaybeAlign SrcAlign,		Value *Src, MaybeAlign SrcAlign,
Value *Size, bool isVolatile,		Value *Size, bool isVolatile,
MDNode TBAATag, MDNode TBAAStructTag,		MDNode TBAATag, MDNode TBAAStructTag,
MDNode ScopeTag, MDNode NoAliasTag) {		MDNode ScopeTag, MDNode NoAliasTag,
		bool Inline) {
Dst = getCastedInt8PtrValue(Dst);		Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);		Src = getCastedInt8PtrValue(Src);

Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};		Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)};
Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };		Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() };
Module *M = BB->getParent()->getParent();		Module *M = BB->getParent()->getParent();
Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy, Tys);
		Intrinsic::ID MemCpy = Inline ? Intrinsic::memcpy_inline : Intrinsic::memcpy;
		Function *TheFn = Intrinsic::getDeclaration(M, MemCpy, Tys);

CallInst *CI = createCallHelper(TheFn, Ops, this);		CallInst *CI = createCallHelper(TheFn, Ops, this);

auto* MCI = cast<MemCpyInst>(CI);		auto* MCI = cast<MemCpyInst>(CI);
if (DstAlign)		if (DstAlign)
MCI->setDestAlignment(*DstAlign);		MCI->setDestAlignment(*DstAlign);
if (SrcAlign)		if (SrcAlign)
MCI->setSourceAlignment(*SrcAlign);		MCI->setSourceAlignment(*SrcAlign);
Show All 10 Lines	if (ScopeTag)
CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);		CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);

if (NoAliasTag)		if (NoAliasTag)
CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);		CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);

return CI;		return CI;
}		}

CallInst IRBuilderBase::CreateMemCpyInline(Value Dst, MaybeAlign DstAlign,
Value *Src, MaybeAlign SrcAlign,
Value *Size) {
Dst = getCastedInt8PtrValue(Dst);
Src = getCastedInt8PtrValue(Src);
Value *IsVolatile = getInt1(false);

Value *Ops[] = {Dst, Src, Size, IsVolatile};
Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
Function *F = BB->getParent();
Module *M = F->getParent();
Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy_inline, Tys);

CallInst *CI = createCallHelper(TheFn, Ops, this);

auto *MCI = cast<MemCpyInlineInst>(CI);
if (DstAlign)
MCI->setDestAlignment(*DstAlign);
if (SrcAlign)
MCI->setSourceAlignment(*SrcAlign);

return CI;
}

CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(		CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
Value Dst, Align DstAlign, Value Src, Align SrcAlign, Value *Size,		Value Dst, Align DstAlign, Value Src, Align SrcAlign, Value *Size,
uint32_t ElementSize, MDNode TBAATag, MDNode TBAAStructTag,		uint32_t ElementSize, MDNode TBAATag, MDNode TBAAStructTag,
MDNode ScopeTag, MDNode NoAliasTag) {		MDNode ScopeTag, MDNode NoAliasTag) {
assert(DstAlign >= ElementSize &&		assert(DstAlign >= ElementSize &&
"Pointer alignment must be at least element size");		"Pointer alignment must be at least element size");
assert(SrcAlign >= ElementSize &&		assert(SrcAlign >= ElementSize &&
"Pointer alignment must be at least element size");		"Pointer alignment must be at least element size");
▲ Show 20 Lines • Show All 972 Lines • Show Last 20 Lines

llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp

Show First 20 Lines • Show All 924 Lines • ▼ Show 20 Lines	if (auto *MSI = dyn_cast<MemSetInst>(MI)) {
if (Dest == OldV)		if (Dest == OldV)
Dest = NewV;		Dest = NewV;

if (isa<MemCpyInst>(MTI)) {		if (isa<MemCpyInst>(MTI)) {
MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);		MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct);
B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),		B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
MTI->getLength(),		MTI->getLength(),
false, // isVolatile		false, // isVolatile
TBAA, TBAAStruct, ScopeMD, NoAliasMD);		TBAA, TBAAStruct, ScopeMD, NoAliasMD,
		MTI->getIntrinsicID() == Intrinsic::memcpy_inline);
} else {		} else {
assert(isa<MemMoveInst>(MTI));		assert(isa<MemMoveInst>(MTI));
B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),		B.CreateMemMove(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(),
MTI->getLength(),		MTI->getLength(),
false, // isVolatile		false, // isVolatile
TBAA, ScopeMD, NoAliasMD);		TBAA, ScopeMD, NoAliasMD);
}		}
} else		} else
▲ Show 20 Lines • Show All 212 Lines • Show Last 20 Lines

llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll

	Show First 20 Lines • Show All 110 Lines • ▼ Show 20 Lines
	; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(			; CHECK-LABEL: @memmove_flat_to_flat_replace_src_with_group(
	; CHECK: call void @llvm.memmove.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4			; CHECK: call void @llvm.memmove.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
	define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {			define amdgpu_kernel void @memmove_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 {
	%cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8*			%cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8*
	call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4			call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
	ret void			ret void
	}			}

				; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_src_with_group(
				; CHECK: call void @llvm.memcpy.inline.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 24, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
				define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr) #0 {
				%cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8*
				call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 24, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
				ret void
				}

				; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_dest_with_group(
				; CHECK: call void @llvm.memcpy.inline.p3i8.p0i8.i64(i8 addrspace(3)* align 4 %dest.group.ptr, i8* align 4 %src.ptr, i64 24, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
				define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size) #0 {
				%cast.dest = addrspacecast i8 addrspace(3)* %dest.group.ptr to i8*
				call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %cast.dest, i8* align 4 %src.ptr, i64 24, i1 false), !tbaa !0, !alias.scope !3, !noalias !4
				ret void
				}

	declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1			declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1
	declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1			declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
	declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i1) #1			declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i1) #1
	declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1			declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1
				declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1

	attributes #0 = { nounwind }			attributes #0 = { nounwind }
	attributes #1 = { argmemonly nounwind }			attributes #1 = { argmemonly nounwind }

	!0 = !{!1, !1, i64 0}			!0 = !{!1, !1, i64 0}
	!1 = !{!"A", !2}			!1 = !{!"A", !2}
	!2 = !{!"tbaa root"}			!2 = !{!"tbaa root"}
	!3 = !{!"B", !2}			!3 = !{!"B", !2}
	!4 = !{!5}			!4 = !{!5}
	!5 = distinct !{!5, !6, !"some scope"}			!5 = distinct !{!5, !6, !"some scope"}
	!6 = distinct !{!6, !"some domain"}			!6 = distinct !{!6, !"some domain"}
	!7 = !{i64 0, i64 8, null}			!7 = !{i64 0, i64 8, null}