This is an archive of the discontinued LLVM Phabricator instance.

In D11220#205430, @jholewinski wrote:

The algorithm looks good. Thanks for working on this!

Though I'm wondering if this shouldn't be moved to the CodeGen library. Other targets may be able to benefit from this, like AMDGPU.

Thanks for the quick review, Justin.

Re AMDGPU, I prefer not to generalize prematurely, because maybe YAGNI :-) AMDGPU folks may or may not need this... They are free to adopt and generalize this if they do need it and I'll be happy to help, of course.

Fair enough.

This revision is now accepted and ready to land. (Jul 15 2015, 8:31 AM)

You may need this for non-clang frontends, but for clang, you can mark memmove as unsupported in TargetLibraryInfo. This is what we do on AMDGPU for memcpy and memset.

Hi,

My name is Okwan Kwon, and I have two comments.

Use getRawDest() and getRawSource() instead. getDest() and getSource() strip off any casts and return the original pointer, so when the original pointer type is not i8*, the code will hit an assertion failure.

Can you find a way to make use of the alignment information from the memmove intrinsic? It might be possible to generate more efficient load/store than byte copying.

Okwan

jingyue added inline comments. (Jul 15 2015, 10:22 AM)

lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
125	Argument names start with upper case.
168	CreateInBoundsGEP(srcAddr, IndexPtr) should work now.
lib/Target/NVPTX/NVPTXTargetMachine.cpp
69	Do you intend to use `PR` instead?
test/CodeGen/NVPTX/lower-aggr-copies.ll
21	How come we don't see inbounds here? You created them using `CreateInBoundsGEP`.

In D11220#205531, @okwank wrote:

Hi,

My name is Okwan Kwon, and I have two comments.

Use getRawDest() and getRawSource() instead. getDest() and getSource() strip off any casts and return the original pointer, so when the original pointer type is not i8*, the code will hit an assertion failure.

Thanks, done. Also added test with casts.

Can you find a way to make use of the alignment information from the memmove intrinsic? It might be possible to generate more efficient load/store than byte copying.

There's a TODO in the code now about this. I'll keep it as a TODO for now.

lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
125	True. For now I'm keeping consistent style with other functions in this file - which isn't great in terms of LLVM style. I will do a later refactoring to bring everything closer to the LLVM style
168	Nice!
lib/Target/NVPTX/NVPTXTargetMachine.cpp
69	Good catch, thanks.
test/CodeGen/NVPTX/lower-aggr-copies.ll
21	No, I didn't: this check is for memcpy, not memmove, and the memcpy lowering does not use inbounds GEPs. I will refactor it to use them in the future, because I think it makes sense here.

Thanks for the comments

Updated with comments

LGTM

Closed by commit rL242413: Correct lowering of memmove in NVPTX (authored by eliben). · Explain Why · Jul 16 2015, 9:27 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

lib/

Target/

NVPTX/

NVPTXLowerAggrCopies.cpp

209 lines

NVPTXTargetMachine.cpp

3 lines

test/

CodeGen/

NVPTX/

lower-aggr-copies.ll

103 lines

Diff 29782

lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp

Context not available.
	// License. See LICENSE.TXT for details.	// License. See LICENSE.TXT for details.
	//	//
	//===----------------------------------------------------------------------===//	//===----------------------------------------------------------------------===//
		//
	// Lower aggregate copies, memset, memcpy, memmove intrinsics into loops when	// Lower aggregate copies, memset, memcpy, memmove intrinsics into loops when
	// the size is large or is not a compile-time constant.	// the size is large or is not a compile-time constant.
	//	//
Context not available.
	#include "llvm/IR/LLVMContext.h"	#include "llvm/IR/LLVMContext.h"
	#include "llvm/IR/Module.h"	#include "llvm/IR/Module.h"
	#include "llvm/Support/Debug.h"	#include "llvm/Support/Debug.h"
		#include "llvm/Transforms/Utils/BasicBlockUtils.h"

	#define DEBUG_TYPE "nvptx"	#define DEBUG_TYPE "nvptx"

Context not available.
	using namespace llvm;	using namespace llvm;

	namespace {	namespace {

	// actual analysis class, which is a functionpass	// actual analysis class, which is a functionpass
	struct NVPTXLowerAggrCopies : public FunctionPass {	struct NVPTXLowerAggrCopies : public FunctionPass {
	static char ID;	static char ID;
Context not available.
	return "Lower aggregate copies/intrinsics into loops";	return "Lower aggregate copies/intrinsics into loops";
	}	}
	};	};
	} // namespace

	char NVPTXLowerAggrCopies::ID = 0;	char NVPTXLowerAggrCopies::ID = 0;

	// Lower MemTransferInst or load-store pair to loop	// Lower memcpy to loop.
	static void convertTransferToLoop(	void convertMemCpyToLoop(Instruction splitAt, Value srcAddr, Value *dstAddr,
	Instruction splitAt, Value srcAddr, Value dstAddr, Value len,	Value *len, bool srcVolatile, bool dstVolatile,
	bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {	LLVMContext &Context, Function &F) {
	Type *indType = len->getType();	Type *indType = len->getType();

	BasicBlock *origBB = splitAt->getParent();	BasicBlock *origBB = splitAt->getParent();
Context not available.
	loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);	loop.CreateCondBr(loop.CreateICmpULT(newind, len), loopBB, newBB);
	}	}

	// Lower MemSetInst to loop	// Lower memmove to IR. memmove is required to correctly copy overlapping memory
	static void convertMemSetToLoop(Instruction splitAt, Value dstAddr,	// regions; therefore, it has to check the relative positions of the source and
	Value len, Value val, LLVMContext &Context,	// destination pointers and choose the copy direction accordingly.
	Function &F) {	//
		// The code below is an IR rendition of this C function:
		//
		// void* memmove(void* dst, const void* src, size_t n) {
		// unsigned char* d = dst;
		// const unsigned char* s = src;
		// if (s < d) {
		// // copy backwards
		// while (n--) {
		// d[n] = s[n];
		// }
		// } else {
		// // copy forward
		// for (size_t i = 0; i < n; ++i) {
		// d[i] = s[i];
		// }
		// }
		// return dst;
		// }
		void convertMemMoveToLoop(Instruction splitAt, Value srcAddr, Value *dstAddr,
		[Inline comment, marked Done] jingyue: Argument names start with upper case.
		[Inline reply] eliben (author): True. For now I'm keeping consistent style with other functions in this file - which isn't great in terms of LLVM style. I will do a later refactoring to bring everything closer to the LLVM style.
		Value *len, bool srcVolatile, bool dstVolatile,
		LLVMContext &Context, Function &F) {
		Type *TypeOfLen = len->getType();
		BasicBlock *OrigBB = splitAt->getParent();

		// Create a comparison of src and dst, based on which we jump to either
		// the forward-copy part of the function (if src >= dst) or the backwards-copy
		// part (if src < dst).
		// SplitBlockAndInsertIfThenElse conveniently creates the basic if-then-else
		// structure. Its block terminators (unconditional branches) are replaced by
		// the appropriate conditional branches when the loop is built.
		ICmpInst *PtrCompare = new ICmpInst(splitAt, ICmpInst::ICMP_ULT, srcAddr,
		dstAddr, "compare_src_dst");
		TerminatorInst ThenTerm, ElseTerm;
		SplitBlockAndInsertIfThenElse(PtrCompare, splitAt, &ThenTerm, &ElseTerm);

		// Each part of the function consists of two blocks:
		// copy_backwards: used to skip the loop when n == 0
		// copy_backwards_loop: the actual backwards loop BB
		// copy_forward: used to skip the loop when n == 0
		// copy_forward_loop: the actual forward loop BB
		BasicBlock *CopyBackwardsBB = ThenTerm->getParent();
		CopyBackwardsBB->setName("copy_backwards");
		BasicBlock *CopyForwardBB = ElseTerm->getParent();
		CopyForwardBB->setName("copy_forward");
		BasicBlock *ExitBB = splitAt->getParent();
		ExitBB->setName("memmove_done");

		// Initial comparison of n == 0 that lets us skip the loops altogether. Shared
		// between both backwards and forward copy clauses.
		ICmpInst *CompareN =
		new ICmpInst(OrigBB->getTerminator(), ICmpInst::ICMP_EQ, len,
		ConstantInt::get(TypeOfLen, 0), "compare_n_to_0");

		// Copying backwards.
		BasicBlock *LoopBB =
		BasicBlock::Create(Context, "copy_backwards_loop", &F, CopyForwardBB);
		IRBuilder<> LoopBuilder(LoopBB);
		PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfLen, 0);
		Value *IndexPtr = LoopBuilder.CreateSub(
		LoopPhi, ConstantInt::get(TypeOfLen, 1), "index_ptr");
		Value *Element = LoopBuilder.CreateLoad(
		LoopBuilder.CreateInBoundsGEP(srcAddr, {IndexPtr}), "element");
		[Inline comment, marked Done] jingyue: CreateInBoundsGEP(srcAddr, IndexPtr) should work now.
		[Inline reply] eliben (author): Nice!
		LoopBuilder.CreateStore(Element,
		LoopBuilder.CreateInBoundsGEP(dstAddr, {IndexPtr}));
		LoopBuilder.CreateCondBr(
		LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfLen, 0)),
		ExitBB, LoopBB);
		LoopPhi->addIncoming(IndexPtr, LoopBB);
		LoopPhi->addIncoming(len, CopyBackwardsBB);
		BranchInst::Create(ExitBB, LoopBB, CompareN, ThenTerm);
		ThenTerm->removeFromParent();

		// Copying forward.
		BasicBlock *FwdLoopBB =
		BasicBlock::Create(Context, "copy_forward_loop", &F, ExitBB);
		IRBuilder<> FwdLoopBuilder(FwdLoopBB);
		PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfLen, 0, "index_ptr");
		Value *FwdElement = FwdLoopBuilder.CreateLoad(
		FwdLoopBuilder.CreateInBoundsGEP(srcAddr, {FwdCopyPhi}), "element");
		FwdLoopBuilder.CreateStore(
		FwdElement, FwdLoopBuilder.CreateInBoundsGEP(dstAddr, {FwdCopyPhi}));
		Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
		FwdCopyPhi, ConstantInt::get(TypeOfLen, 1), "index_increment");
		FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, len),
		ExitBB, FwdLoopBB);
		FwdCopyPhi->addIncoming(FwdIndexPtr, FwdLoopBB);
		FwdCopyPhi->addIncoming(ConstantInt::get(TypeOfLen, 0), CopyForwardBB);

		BranchInst::Create(ExitBB, FwdLoopBB, CompareN, ElseTerm);
		ElseTerm->removeFromParent();
		}

		// Lower memset to loop.
		void convertMemSetToLoop(Instruction splitAt, Value dstAddr, Value *len,
		Value *val, LLVMContext &Context, Function &F) {
	BasicBlock *origBB = splitAt->getParent();	BasicBlock *origBB = splitAt->getParent();
	BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");	BasicBlock *newBB = splitAt->getParent()->splitBasicBlock(splitAt, "split");
	BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);	BasicBlock *loopBB = BasicBlock::Create(Context, "loadstoreloop", &F, newBB);
Context not available.

	bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {	bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
	SmallVector<LoadInst *, 4> aggrLoads;	SmallVector<LoadInst *, 4> aggrLoads;
	SmallVector<MemTransferInst *, 4> aggrMemcpys;	SmallVector<MemIntrinsic *, 4> MemCalls;
	SmallVector<MemSetInst *, 4> aggrMemsets;

	const DataLayout &DL = F.getParent()->getDataLayout();	const DataLayout &DL = F.getParent()->getDataLayout();
	LLVMContext &Context = F.getParent()->getContext();	LLVMContext &Context = F.getParent()->getContext();

	//	// Collect all aggregate loads and mem* calls.
	// Collect all the aggrLoads, aggrMemcpys and addrMemsets.
	//
	for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {	for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
	for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;	for (BasicBlock::iterator II = BI->begin(), IE = BI->end(); II != IE;
	++II) {	++II) {
Context not available.
	continue;	continue;
	aggrLoads.push_back(load);	aggrLoads.push_back(load);
	}	}
	} else if (MemTransferInst *intr = dyn_cast<MemTransferInst>(II)) {	} else if (MemIntrinsic *IntrCall = dyn_cast<MemIntrinsic>(II)) {
	Value *len = intr->getLength();	// Convert intrinsic calls with variable size or with constant size
	// If the number of elements being copied is greater	// larger than the MaxAggrCopySize threshold.
	// than MaxAggrCopySize, lower it to a loop	if (ConstantInt *LenCI = dyn_cast<ConstantInt>(IntrCall->getLength())) {
	if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {	if (LenCI->getZExtValue() >= MaxAggrCopySize) {
	if (len_int->getZExtValue() >= MaxAggrCopySize) {	MemCalls.push_back(IntrCall);
	aggrMemcpys.push_back(intr);
	}	}
	} else {	} else {
	// turn variable length memcpy/memmov into loop	MemCalls.push_back(IntrCall);
	aggrMemcpys.push_back(intr);
	}	}
	} else if (MemSetInst *memsetintr = dyn_cast<MemSetInst>(II)) {
	Value *len = memsetintr->getLength();
	if (ConstantInt *len_int = dyn_cast<ConstantInt>(len)) {
	if (len_int->getZExtValue() >= MaxAggrCopySize) {
	aggrMemsets.push_back(memsetintr);
	}
	} else {
	// turn variable length memset into loop
	aggrMemsets.push_back(memsetintr);
	}
	}	}
	}	}
	}	}
	if ((aggrLoads.size() == 0) && (aggrMemcpys.size() == 0) &&
	(aggrMemsets.size() == 0))	if (aggrLoads.size() == 0 && MemCalls.size() == 0) {
	return false;	return false;
		}

	//	//
	// Do the transformation of an aggr load/copy/set to a loop	// Do the transformation of an aggr load/copy/set to a loop
Context not available.
	unsigned numLoads = DL.getTypeStoreSize(load->getType());	unsigned numLoads = DL.getTypeStoreSize(load->getType());
	Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);	Value *len = ConstantInt::get(Type::getInt32Ty(Context), numLoads);

	convertTransferToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),	convertMemCpyToLoop(store, srcAddr, dstAddr, len, load->isVolatile(),
	store->isVolatile(), Context, F);	store->isVolatile(), Context, F);

	store->eraseFromParent();	store->eraseFromParent();
	load->eraseFromParent();	load->eraseFromParent();
	}	}

	for (MemTransferInst *cpy : aggrMemcpys) {	// Transform mem* intrinsic calls.
	convertTransferToLoop(/* splitAt */ cpy,	for (MemIntrinsic *MemCall : MemCalls) {
	/* srcAddr */ cpy->getSource(),	if (MemCpyInst *Memcpy = dyn_cast<MemCpyInst>(MemCall)) {
	/* dstAddr */ cpy->getDest(),	convertMemCpyToLoop(/* splitAt */ Memcpy,
	/* len */ cpy->getLength(),	/* srcAddr */ Memcpy->getSource(),
	/* srcVolatile */ cpy->isVolatile(),	/* dstAddr */ Memcpy->getDest(),
	/* dstVolatile */ cpy->isVolatile(),	/* len */ Memcpy->getLength(),
		/* srcVolatile */ Memcpy->isVolatile(),
		/* dstVolatile */ Memcpy->isVolatile(),
	/* Context */ Context,	/* Context */ Context,
	/* Function F */ F);	/* Function F */ F);
	cpy->eraseFromParent();	} else if (MemMoveInst *Memmove = dyn_cast<MemMoveInst>(MemCall)) {
	}	convertMemMoveToLoop(/* splitAt */ Memmove,
		/* srcAddr */ Memmove->getSource(),
		/* dstAddr */ Memmove->getDest(),
		/* len */ Memmove->getLength(),
		/* srcVolatile */ Memmove->isVolatile(),
		/* dstVolatile */ Memmove->isVolatile(),
		/* Context */ Context,
		/* Function F */ F);

	for (MemSetInst *memsetinst : aggrMemsets) {	} else if (MemSetInst *Memset = dyn_cast<MemSetInst>(MemCall)) {
	Value *len = memsetinst->getLength();	convertMemSetToLoop(/* splitAt */ Memset,
	Value *val = memsetinst->getValue();	/* dstAddr */ Memset->getDest(),
	convertMemSetToLoop(memsetinst, memsetinst->getDest(), len, val, Context,	/* len */ Memset->getLength(),
	F);	/* val */ Memset->getValue(),
	memsetinst->eraseFromParent();	/* Context */ Context,
		/* F */ F);
		}
		MemCall->eraseFromParent();
	}	}

	return true;	return true;
	}	}

		} // namespace

		namespace llvm {
		void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
		}

		INITIALIZE_PASS(NVPTXLowerAggrCopies, "nvptx-lower-aggr-copies",
		"Lower aggregate copies, and llvm.mem* intrinsics into loops",
		false, false)

	FunctionPass *llvm::createLowerAggrCopies() {	FunctionPass *llvm::createLowerAggrCopies() {
	return new NVPTXLowerAggrCopies();	return new NVPTXLowerAggrCopies();
	}	}
Context not available.

lib/Target/NVPTX/NVPTXTargetMachine.cpp

Context not available.
	void initializeNVPTXAllocaHoistingPass(PassRegistry &);	void initializeNVPTXAllocaHoistingPass(PassRegistry &);
	void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);	void initializeNVPTXAssignValidGlobalNamesPass(PassRegistry&);
	void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);	void initializeNVPTXFavorNonGenericAddrSpacesPass(PassRegistry &);
		void initializeNVPTXLowerAggrCopiesPass(PassRegistry &);
	void initializeNVPTXLowerKernelArgsPass(PassRegistry &);	void initializeNVPTXLowerKernelArgsPass(PassRegistry &);
	void initializeNVPTXLowerAllocaPass(PassRegistry &);	void initializeNVPTXLowerAllocaPass(PassRegistry &);
	}	}
Context not available.

	// FIXME: This pass is really intended to be invoked during IR optimization,	// FIXME: This pass is really intended to be invoked during IR optimization,
	// but it's very NVPTX-specific.	// but it's very NVPTX-specific.
		PassRegistry &PR = *PassRegistry::getPassRegistry();
	initializeNVVMReflectPass(*PassRegistry::getPassRegistry());	initializeNVVMReflectPass(*PassRegistry::getPassRegistry());
		[Inline comment, marked Done] jingyue: Do you intend to use `PR` instead?
		[Inline reply] eliben (author): Good catch, thanks.
	initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());	initializeGenericToNVVMPass(*PassRegistry::getPassRegistry());
	initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry());	initializeNVPTXAllocaHoistingPass(*PassRegistry::getPassRegistry());
Context not available.
	*PassRegistry::getPassRegistry());	*PassRegistry::getPassRegistry());
	initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());	initializeNVPTXLowerKernelArgsPass(*PassRegistry::getPassRegistry());
	initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());	initializeNVPTXLowerAllocaPass(*PassRegistry::getPassRegistry());
		initializeNVPTXLowerAggrCopiesPass(PR);
	}	}

	static std::string computeDataLayout(bool is64Bit) {	static std::string computeDataLayout(bool is64Bit) {
Context not available.

test/CodeGen/NVPTX/lower-aggr-copies.ll

	; RUN: llc < %s -march=nvptx -mcpu=sm_35 \| FileCheck %s	; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 \| FileCheck %s --check-prefix PTX
		; RUN: opt < %s -S -nvptx-lower-aggr-copies \| FileCheck %s --check-prefix IR

	; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to	; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
	; llvm.mem* intrinsics get lowered to loops.	; llvm.mem* intrinsics get lowered to loops.

		target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
		target triple = "nvptx64-unknown-unknown"

	declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1	declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
		declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1) #1
	declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1	declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) #1

	define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {	define i8* @memcpy_caller(i8* %dst, i8* %src, i64 %n) #0 {
Context not available.
	entry:	entry:
	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 false)	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 false)
	ret i8* %dst	ret i8* %dst
	; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_caller
	; CHECK: LBB[[LABEL:[_0-9]+]]:	; IR-LABEL: @memcpy_caller
	; CHECK: ld.u8 %rs[[REG:[0-9]+]]	; IR: loadstoreloop:
	; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]	; IR: [[LOADPTR:%[0-9]+]] = getelementptr i8, i8* %src, i64
		[Inline comment, marked Done] jingyue: How come we don't see inbounds here? You created them using `CreateInBoundsGEP`.
		[Inline reply] eliben (author): So no, I didn't. This is still memcpy, not memmove. It didn't use inbounds. I will refactor it to use it in the future, because it makes sense here I think.
	; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1	; IR-NEXT: [[VAL:%[0-9]+]] = load i8, i8* [[LOADPTR]]
	; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd	; IR-NEXT: [[STOREPTR:%[0-9]+]] = getelementptr i8, i8* %dst, i64
	; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]	; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]

		; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
		; PTX: LBB[[LABEL:[_0-9]+]]:
		; PTX: ld.u8 %rs[[REG:[0-9]+]]
		; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
		; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
		; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
		; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
	}	}

	define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {	define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
Context not available.
	entry:	entry:
	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 true)	tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 true)
	ret i8* %dst	ret i8* %dst
	; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_volatile_caller
	; CHECK: LBB[[LABEL:[_0-9]+]]:	; IR-LABEL: @memcpy_volatile_caller
	; CHECK: ld.volatile.u8 %rs[[REG:[0-9]+]]	; IR: load volatile
	; CHECK: st.volatile.u8 [%r{{[0-9]+}}], %rs[[REG]]	; IR: store volatile
	; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
	; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd	; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
	; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]	; PTX: LBB[[LABEL:[_0-9]+]]:
		; PTX: ld.volatile.u8 %rs[[REG:[0-9]+]]
		; PTX: st.volatile.u8 [%rd{{[0-9]+}}], %rs[[REG]]
		; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
		; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
		; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
	}	}

	define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {	define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
Context not available.
	%0 = trunc i32 %c to i8	%0 = trunc i32 %c to i8
	tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 false)	tail call void @llvm.memset.p0i8.i64(i8* %dst, i8 %0, i64 %n, i32 1, i1 false)
	ret i8* %dst	ret i8* %dst
	; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memset_caller(
	; CHECK: ld.param.u8 %rs[[REG:[0-9]+]]	; IR-LABEL: @memset_caller
	; CHECK: LBB[[LABEL:[_0-9]+]]:	; IR: [[VAL:%[0-9]+]] = trunc i32 %c to i8
	; CHECK: st.u8 [%r{{[0-9]+}}], %rs[[REG]]	; IR: loadstoreloop:
	; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1	; IR: [[STOREPTR:%[0-9]+]] = getelementptr i8, i8* %dst, i64
	; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd	; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
	; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
		; PTX-LABEL: .visible .func (.param .b64 func_retval0) memset_caller(
		; PTX: ld.param.u8 %rs[[REG:[0-9]+]]
		; PTX: LBB[[LABEL:[_0-9]+]]:
		; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[REG]]
		; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
		; PTX-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
		; PTX-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
	}	}

		define i8* @memmove_caller(i8* %dst, i8* %src, i64 %n) #0 {
		entry:
		tail call void @llvm.memmove.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 false)
		ret i8* %dst

		; IR-LABEL: @memmove_caller
		; IR: icmp ult i8* %src, %dst
		; IR: [[PHIVAL:%[0-9a-zA-Z_]+]] = phi i64
		; IR-NEXT: %index_ptr = sub i64 [[PHIVAL]], 1
		; IR: [[FWDPHIVAL:%[0-9a-zA-Z_]+]] = phi i64
		; IR: {{%[0-9a-zA-Z_]+}} = add i64 [[FWDPHIVAL]], 1

		; PTX-LABEL: .visible .func (.param .b64 func_retval0) memmove_caller(
		; PTX: ld.param.u64 %rd[[N:[0-9]+]]
		; PTX: setp.eq.s64 %p[[NEQ0:[0-9]+]], %rd[[N]], 0
		; PTX: setp.ge.u64 %p[[SRC_GT_THAN_DST:[0-9]+]], %rd{{[0-9]+}}, %rd{{[0-9]+}}
		; PTX-NEXT: @%p[[SRC_GT_THAN_DST]] bra LBB[[FORWARD_BB:[0-9_]+]]
		; -- this is the backwards copying BB
		; PTX: @%p[[NEQ0]] bra LBB[[EXIT:[0-9_]+]]
		; PTX: add.s64 %rd[[N]], %rd[[N]], -1
		; PTX: ld.u8 %rs[[ELEMENT:[0-9]+]]
		; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT]]
		; -- this is the forwards copying BB
		; PTX: LBB[[FORWARD_BB]]:
		; PTX: @%p[[NEQ0]] bra LBB[[EXIT]]
		; PTX: ld.u8 %rs[[ELEMENT2:[0-9]+]]
		; PTX: st.u8 [%rd{{[0-9]+}}], %rs[[ELEMENT2]]
		; PTX: add.s64 %rd[[INDEX:[0-9]+]], %rd[[INDEX]], 1
		; -- exit block
		; PTX: LBB[[EXIT]]:
		; PTX-NEXT: st.param.b64 [func_retval0
		; PTX-NEXT: ret
		}
Context not available.

This is an archive of the discontinued LLVM Phabricator instance.

Correct lowering of memmove in NVPTX (Closed, Public)

Details

Diff Detail

Event Timeline

Revision Contents

Diff 29782

lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp

lib/Target/NVPTX/NVPTXTargetMachine.cpp

test/CodeGen/NVPTX/lower-aggr-copies.ll

Correct lowering of memmove in NVPTX
Closed · Public