[NVPTX] Propagate memcpy volatility when lowering aggregate copies to loops.

Summary of the hunks below:
  * convertTransferToLoop: drop the stale, commented-out `numLoads`
    parameter from the signature.
  * MemTransferInst lowering loop: stop hard-coding `false` for both
    srcVolatile and dstVolatile; pass cpy->isVolatile() for both instead,
    and label every argument at the call site. The obsolete "llvm 2.7"
    comment and the temporary `len` local are removed.
  * lower-aggr-copies.ll: add memcpy_volatile_caller, which calls
    llvm.memcpy with the volatile flag set (i1 true) and checks that the
    generated copy loop uses ld.volatile.u8 / st.volatile.u8.

Index: lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
===================================================================
--- lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -57,7 +57,6 @@
 // Lower MemTransferInst or load-store pair to loop
 static void convertTransferToLoop(
     Instruction *splitAt, Value *srcAddr, Value *dstAddr, Value *len,
-    //unsigned numLoads,
     bool srcVolatile, bool dstVolatile, LLVMContext &Context, Function &F) {
   Type *indType = len->getType();
 
@@ -200,13 +199,14 @@
   }
 
   for (MemTransferInst *cpy : aggrMemcpys) {
-    Value *len = cpy->getLength();
-    // llvm 2.7 version of memcpy does not have volatile
-    // operand yet. So always making it non-volatile
-    // optimistically, so that we don't see unnecessary
-    // st.volatile in ptx
-    convertTransferToLoop(cpy, cpy->getSource(), cpy->getDest(), len, false,
-                          false, Context, F);
+    convertTransferToLoop(/* splitAt */ cpy,
+                          /* srcAddr */ cpy->getSource(),
+                          /* dstAddr */ cpy->getDest(),
+                          /* len */ cpy->getLength(),
+                          /* srcVolatile */ cpy->isVolatile(),
+                          /* dstVolatile */ cpy->isVolatile(),
+                          /* Context */ Context,
+                          /* Function F */ F);
     cpy->eraseFromParent();
   }
 
Index: test/CodeGen/NVPTX/lower-aggr-copies.ll
===================================================================
--- test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -19,6 +19,19 @@
 ; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
 }
 
+define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
+entry:
+  tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %n, i32 1, i1 true)
+  ret i8* %dst
+; CHECK-LABEL: .visible .func (.param .b32 func_retval0) memcpy_volatile_caller
+; CHECK: LBB[[LABEL:[_0-9]+]]:
+; CHECK: ld.volatile.u8 %rs[[REG:[0-9]+]]
+; CHECK: st.volatile.u8 [%r{{[0-9]+}}], %rs[[REG]]
+; CHECK: add.s64 %rd[[COUNTER:[0-9]+]], %rd[[COUNTER]], 1
+; CHECK-NEXT: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
+; CHECK-NEXT: @%p[[PRED]] bra LBB[[LABEL]]
+}
+
 define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {
 entry:
   %0 = trunc i32 %c to i8