Index: llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/trunk/include/llvm/Analysis/TargetTransformInfo.h
@@ -862,12 +862,6 @@
                                          unsigned SrcAlign,
                                          unsigned DestAlign) const;
 
-  /// \returns True if we want to test the new memcpy lowering functionality in
-  /// Transform/Utils.
-  /// Temporary. Will be removed once we move to the new functionality and
-  /// remove the old.
-  bool useWideIRMemcpyLoopLowering() const;
-
   /// \returns True if the two functions have compatible attributes for inlining
   /// purposes.
   bool areInlineCompatible(const Function *Caller,
Index: llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
===================================================================
--- llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
+++ llvm/trunk/include/llvm/Transforms/Utils/LowerMemIntrinsics.h
@@ -25,12 +25,6 @@
 class TargetTransformInfo;
 class Value;
 
-/// Emit a loop implementing the semantics of llvm.memcpy with the equivalent
-/// arguments at \p InsertBefore.
-void createMemCpyLoop(Instruction *InsertBefore, Value *SrcAddr, Value *DstAddr,
-                      Value *CopyLen, unsigned SrcAlign, unsigned DestAlign,
-                      bool SrcIsVolatile, bool DstIsVolatile);
-
 /// Emit a loop implementing the semantics of llvm.memcpy where the size is not
 /// a compile-time constant. Loop will be insterted at \p InsertBefore.
 void createMemCpyLoopUnknownSize(Instruction *InsertBefore, Value *SrcAddr,
Index: llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/trunk/lib/Analysis/TargetTransformInfo.cpp
@@ -26,11 +26,6 @@
 
 #define DEBUG_TYPE "tti"
 
-static cl::opt<bool> UseWideMemcpyLoopLowering(
-    "use-wide-memcpy-loop-lowering", cl::init(false),
-    cl::desc("Enables the new wide memcpy loop lowering in Transforms/Utils."),
-    cl::Hidden);
-
 static cl::opt<bool> EnableReduxCost("costmodel-reduxcost", cl::init(false),
                                      cl::Hidden,
                                      cl::desc("Recognize reduction patterns."));
@@ -547,10 +542,6 @@
                                                     SrcAlign, DestAlign);
 }
 
-bool TargetTransformInfo::useWideIRMemcpyLoopLowering() const {
-  return UseWideMemcpyLoopLowering;
-}
-
 bool TargetTransformInfo::areInlineCompatible(const Function *Caller,
                                               const Function *Callee) const {
   return TTIImpl->areInlineCompatible(Caller, Callee);
Index: llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
===================================================================
--- llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ llvm/trunk/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -111,23 +111,13 @@
     ConstantInt *CopyLen =
         ConstantInt::get(Type::getInt32Ty(Context), NumLoads);
 
-    if (!TTI.useWideIRMemcpyLoopLowering()) {
-      createMemCpyLoop(/* ConvertedInst */ SI,
-                       /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                       /* CopyLen */ CopyLen,
-                       /* SrcAlign */ LI->getAlignment(),
-                       /* DestAlign */ SI->getAlignment(),
-                       /* SrcIsVolatile */ LI->isVolatile(),
-                       /* DstIsVolatile */ SI->isVolatile());
-    } else {
-      createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
-                                /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
-                                /* CopyLen */ CopyLen,
-                                /* SrcAlign */ LI->getAlignment(),
-                                /* DestAlign */ SI->getAlignment(),
-                                /* SrcIsVolatile */ LI->isVolatile(),
-                                /* DstIsVolatile */ SI->isVolatile(), TTI);
-    }
+    createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
+                              /* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
+                              /* CopyLen */ CopyLen,
+                              /* SrcAlign */ LI->getAlignment(),
+                              /* DestAlign */ SI->getAlignment(),
+                              /* SrcIsVolatile */ LI->isVolatile(),
+                              /* DstIsVolatile */ SI->isVolatile(), TTI);
 
     SI->eraseFromParent();
     LI->eraseFromParent();
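
With the command-line flag gone, the aggregate load/store path in NVPTXLowerAggrCopies above always goes through the TTI-driven createMemCpyLoopKnownSize. For orientation, a minimal hand-written sketch of the kind of IR that path rewrites is shown below; it is not taken from the patch or its tests, and the function name, struct type, and the assumption that the copy is large enough to trigger the pass are all hypothetical.

%struct.big = type { [1024 x i8] }

define void @copy_big(%struct.big* %dst, %struct.big* %src) {
entry:
  ; A large aggregate load whose only use is a store is what the pass rewrites
  ; into an explicit copy loop; CopyLen above (built from NumLoads) is the
  ; number of bytes to copy.
  %val = load %struct.big, %struct.big* %src
  store %struct.big %val, %struct.big* %dst
  ret void
}
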
Index: llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
===================================================================
--- llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ llvm/trunk/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -263,61 +263,6 @@
   }
 }
 
-void llvm::createMemCpyLoop(Instruction *InsertBefore,
-                            Value *SrcAddr, Value *DstAddr, Value *CopyLen,
-                            unsigned SrcAlign, unsigned DestAlign,
-                            bool SrcIsVolatile, bool DstIsVolatile) {
-  Type *TypeOfCopyLen = CopyLen->getType();
-
-  BasicBlock *OrigBB = InsertBefore->getParent();
-  Function *F = OrigBB->getParent();
-  BasicBlock *NewBB =
-      InsertBefore->getParent()->splitBasicBlock(InsertBefore, "split");
-  BasicBlock *LoopBB = BasicBlock::Create(F->getContext(), "loadstoreloop",
-                                          F, NewBB);
-
-  IRBuilder<> Builder(OrigBB->getTerminator());
-
-  // SrcAddr and DstAddr are expected to be pointer types,
-  // so no check is made here.
-  unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
-  unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
-  // Cast pointers to (char *)
-  SrcAddr = Builder.CreateBitCast(SrcAddr, Builder.getInt8PtrTy(SrcAS));
-  DstAddr = Builder.CreateBitCast(DstAddr, Builder.getInt8PtrTy(DstAS));
-
-  Builder.CreateCondBr(
-      Builder.CreateICmpEQ(ConstantInt::get(TypeOfCopyLen, 0), CopyLen), NewBB,
-      LoopBB);
-  OrigBB->getTerminator()->eraseFromParent();
-
-  IRBuilder<> LoopBuilder(LoopBB);
-  PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
-  LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
-
-  // load from SrcAddr+LoopIndex
-  // TODO: we can leverage the align parameter of llvm.memcpy for more efficient
-  // word-sized loads and stores.
-  Value *Element =
-      LoopBuilder.CreateLoad(LoopBuilder.CreateInBoundsGEP(
-                                 LoopBuilder.getInt8Ty(), SrcAddr, LoopIndex),
-                             SrcIsVolatile);
-  // store at DstAddr+LoopIndex
-  LoopBuilder.CreateStore(Element,
-                          LoopBuilder.CreateInBoundsGEP(LoopBuilder.getInt8Ty(),
-                                                        DstAddr, LoopIndex),
-                          DstIsVolatile);
-
-  // The value for LoopIndex coming from backedge is (LoopIndex + 1)
-  Value *NewIndex =
-      LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
-  LoopIndex->addIncoming(NewIndex, LoopBB);
-
-  LoopBuilder.CreateCondBr(LoopBuilder.CreateICmpULT(NewIndex, CopyLen), LoopBB,
-                           NewBB);
-}
-
 // Lower memmove to IR. memmove is required to correctly copy overlapping memory
 // regions; therefore, it has to check the relative positions of the source and
 // destination pointers and choose the copy direction accordingly.
@@ -459,38 +404,26 @@
 
 void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
                               const TargetTransformInfo &TTI) {
-  // Original implementation
-  if (!TTI.useWideIRMemcpyLoopLowering()) {
-    createMemCpyLoop(/* InsertBefore */ Memcpy,
-                     /* SrcAddr */ Memcpy->getRawSource(),
-                     /* DstAddr */ Memcpy->getRawDest(),
-                     /* CopyLen */ Memcpy->getLength(),
-                     /* SrcAlign */ Memcpy->getAlignment(),
-                     /* DestAlign */ Memcpy->getAlignment(),
-                     /* SrcIsVolatile */ Memcpy->isVolatile(),
-                     /* DstIsVolatile */ Memcpy->isVolatile());
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
+    createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+                              /* SrcAddr */ Memcpy->getRawSource(),
+                              /* DstAddr */ Memcpy->getRawDest(),
+                              /* CopyLen */ CI,
+                              /* SrcAlign */ Memcpy->getAlignment(),
+                              /* DestAlign */ Memcpy->getAlignment(),
+                              /* SrcIsVolatile */ Memcpy->isVolatile(),
+                              /* DstIsVolatile */ Memcpy->isVolatile(),
+                              /* TargetTransformInfo */ TTI);
   } else {
-    if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
-      createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
+    createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
                                 /* SrcAddr */ Memcpy->getRawSource(),
                                 /* DstAddr */ Memcpy->getRawDest(),
-                                /* CopyLen */ CI,
+                                /* CopyLen */ Memcpy->getLength(),
                                 /* SrcAlign */ Memcpy->getAlignment(),
                                 /* DestAlign */ Memcpy->getAlignment(),
                                 /* SrcIsVolatile */ Memcpy->isVolatile(),
                                 /* DstIsVolatile */ Memcpy->isVolatile(),
-                                /* TargetTransformInfo */ TTI);
-    } else {
-      createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
-                                  /* SrcAddr */ Memcpy->getRawSource(),
-                                  /* DstAddr */ Memcpy->getRawDest(),
-                                  /* CopyLen */ Memcpy->getLength(),
-                                  /* SrcAlign */ Memcpy->getAlignment(),
-                                  /* DestAlign */ Memcpy->getAlignment(),
-                                  /* SrcIsVolatile */ Memcpy->isVolatile(),
-                                  /* DstIsVolatile */ Memcpy->isVolatile(),
-                                  /* TargetTransfomrInfo */ TTI);
-    }
+                                /* TargetTransfomrInfo */ TTI);
   }
 }
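
For readers tracing the path that expandMemCpyAsLoop now always takes when the length is not a ConstantInt, here is a hand-written sketch of the control flow createMemCpyLoopUnknownSize emits for a byte-wide copy. It is not generated output from this patch; the function name and value numbering are illustrative, and a wider lowering type chosen by TTI would add a residual copy sequence. The block structure matches the loop-memcpy-expansion checks in the NVPTX test updated further down.

define void @memcpy_expanded_sketch(i8* %dst, i8* %src, i64 %n) {
entry:
  ; Skip the loop entirely when the dynamic length is zero.
  %0 = icmp ne i64 %n, 0
  br i1 %0, label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

loop-memcpy-expansion:
  ; Copy one element per iteration, indexed by the loop-carried counter.
  %loop-index = phi i64 [ 0, %entry ], [ %4, %loop-memcpy-expansion ]
  %1 = getelementptr inbounds i8, i8* %src, i64 %loop-index
  %2 = load i8, i8* %1
  %3 = getelementptr inbounds i8, i8* %dst, i64 %loop-index
  store i8 %2, i8* %3
  %4 = add i64 %loop-index, 1
  %5 = icmp ult i64 %4, %n
  br i1 %5, label %loop-memcpy-expansion, label %post-loop-memcpy-expansion

post-loop-memcpy-expansion:
  ret void
}
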
Index: llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/lower-mem-intrinsics.ll
@@ -1,5 +1,4 @@
 ; RUN: opt -S -amdgpu-lower-intrinsics %s | FileCheck -check-prefix=OPT %s
-; RUN: opt -S -amdgpu-lower-intrinsics -use-wide-memcpy-loop-lowering=true %s | FileCheck -check-prefix=WOPT %s
 
 declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture readonly, i64, i32, i1) #1
 declare void @llvm.memcpy.p1i8.p3i8.i32(i8 addrspace(1)* nocapture, i8 addrspace(3)* nocapture readonly, i32, i32, i1) #1
@@ -18,21 +17,14 @@
 ; Smallest static size which will be expanded
 ; OPT-LABEL: @min_size_large_static_memcpy_caller0(
 ; OPT-NOT: call
-; OPT: getelementptr
-; OPT-NEXT: load i8
-; OPT: getelementptr
-; OPT-NEXT: store i8
-
-; WOPT-LABEL: @min_size_large_static_memcpy_caller0(
-; WOPT-NOT: call
-; WOPT: br label %load-store-loop
-; WOPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
-; WOPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
-; WOPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
-; WOPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
-; WOPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
-; WOPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
-; WOPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
+; OPT: br label %load-store-loop
+; OPT: [[T1:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
+; OPT-NEXT: [[T2:%[0-9]+]] = load i8, i8 addrspace(1)* [[T1]]
+; OPT-NEXT: [[T3:%[0-9]+]] = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
+; OPT-NEXT: store i8 [[T2]], i8 addrspace(1)* [[T3]]
+; OPT-NEXT: [[T4:%[0-9]+]] = add i64 %loop-index, 1
+; OPT-NEXT: [[T5:%[0-9]+]] = icmp ult i64 [[T4]], 1025
+; OPT-NEXT: br i1 [[T5]], label %load-store-loop, label %memcpy-split
 define amdgpu_kernel void @min_size_large_static_memcpy_caller0(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) #0 {
   call void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* %dst, i8 addrspace(1)* %src, i64 1025, i32 1, i1 false)
   ret void
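
The OPT checks above pin down the shape of the known-size expansion; written out as plain IR (again a hand-written sketch rather than generated output, with an illustrative function name and value numbering), the 1025-byte copy becomes a counted load-store-loop followed by the memcpy-split continuation block. With a wider lowering type from TTI the loop would copy larger elements and a short residual sequence would handle the trailing bytes; the byte-wide form is what the checks above expect.

define amdgpu_kernel void @static_memcpy_sketch(i8 addrspace(1)* %dst, i8 addrspace(1)* %src) {
entry:
  br label %load-store-loop

load-store-loop:
  ; The trip count (1025) is a compile-time constant, so no zero-length guard
  ; is needed before entering the loop.
  %loop-index = phi i64 [ 0, %entry ], [ %3, %load-store-loop ]
  %0 = getelementptr inbounds i8, i8 addrspace(1)* %src, i64 %loop-index
  %1 = load i8, i8 addrspace(1)* %0
  %2 = getelementptr inbounds i8, i8 addrspace(1)* %dst, i64 %loop-index
  store i8 %1, i8 addrspace(1)* %2
  %3 = add i64 %loop-index, 1
  %4 = icmp ult i64 %3, 1025
  br i1 %4, label %load-store-loop, label %memcpy-split

memcpy-split:
  ret void
}
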
Index: llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
===================================================================
--- llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
+++ llvm/trunk/test/CodeGen/NVPTX/lower-aggr-copies.ll
@@ -1,6 +1,5 @@
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_35 -O0 | FileCheck %s --check-prefix PTX
 ; RUN: opt < %s -S -nvptx-lower-aggr-copies | FileCheck %s --check-prefix IR
-; RUN: opt < %s -S -nvptx-lower-aggr-copies -use-wide-memcpy-loop-lowering=true | FileCheck %s --check-prefix WIR
 
 ; Verify that the NVPTXLowerAggrCopies pass works as expected - calls to
 ; llvm.mem* intrinsics get lowered to loops.
@@ -18,13 +17,22 @@
   ret i8* %dst
 
 ; IR-LABEL: @memcpy_caller
-; IR: [[CMPREG:%[0-9]+]] = icmp eq i64 0, %n
-; IR: br i1 [[CMPREG]], label %split, label %loadstoreloop
-; IR: loadstoreloop:
-; IR: [[LOADPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64
-; IR-NEXT: [[VAL:%[0-9]+]] = load i8, i8* [[LOADPTR]]
-; IR-NEXT: [[STOREPTR:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64
-; IR-NEXT: store i8 [[VAL]], i8* [[STOREPTR]]
+; IR: entry:
+; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR: loop-memcpy-expansion:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL: post-loop-memcpy-expansion:
+; IR: ret i8* %dst
 
 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_caller
 ; PTX: LBB[[LABEL:[_0-9]+]]:
@@ -34,23 +42,6 @@
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
 
-; WIR-LABEL: @memcpy_caller
-; WIR: entry:
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR: loop-memcpy-expansion:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL: post-loop-memcpy-expansion:
-; WIR: ret i8* %dst
 }
 
 define i8* @memcpy_volatile_caller(i8* %dst, i8* %src, i64 %n) #0 {
@@ -59,8 +50,23 @@
   ret i8* %dst
 
 ; IR-LABEL: @memcpy_volatile_caller
-; IR: load volatile
-; IR: store volatile
+; IR: entry:
+; IR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
+; IR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR: loop-memcpy-expansion:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store volatile i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
+; IR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
+
+; IR-LABEL: post-loop-memcpy-expansion:
+; IR: ret i8* %dst
+
 
 ; PTX-LABEL: .visible .func (.param .b64 func_retval0) memcpy_volatile_caller
 ; PTX: LBB[[LABEL:[_0-9]+]]:
@@ -69,24 +75,6 @@
 ; PTX: add.s64 %rd[[COUNTER:[0-9]+]], %rd{{[0-9]+}}, 1
 ; PTX: setp.lt.u64 %p[[PRED:[0-9]+]], %rd[[COUNTER]], %rd
 ; PTX: @%p[[PRED]] bra LBB[[LABEL]]
-
-; WIR-LABEL: @memcpy_volatile_caller
-; WIR: entry:
-; WIR: [[Cond:%[0-9]+]] = icmp ne i64 %n, 0
-; WIR: br i1 [[Cond]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR: loop-memcpy-expansion:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %loop-memcpy-expansion ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load volatile i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store volatile i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond2:%[0-9]+]] = icmp ult i64 [[IndexInc]], %n
-; WIR: br i1 [[Cond2]], label %loop-memcpy-expansion, label %post-loop-memcpy-expansion
-
-; WIR-LABEL: post-loop-memcpy-expansion:
-; WIR: ret i8* %dst
 }
 
 define i8* @memcpy_casting_caller(i32* %dst, i32* %src, i64 %n) #0 {
@@ -102,12 +90,6 @@
 ; IR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
 ; IR: getelementptr inbounds i8, i8* [[SRCCAST]]
 ; IR: getelementptr inbounds i8, i8* [[DSTCAST]]
-
-; WIR-LABEL: @memcpy_casting_caller
-; WIR: [[DSTCAST:%[0-9]+]] = bitcast i32* %dst to i8*
-; WIR: [[SRCCAST:%[0-9]+]] = bitcast i32* %src to i8*
-; WIR: getelementptr inbounds i8, i8* [[SRCCAST]]
-; WIR: getelementptr inbounds i8, i8* [[DSTCAST]]
 }
 
 define i8* @memcpy_known_size(i8* %dst, i8* %src) {
@@ -116,18 +98,18 @@
   ret i8* %dst
 
 ; Check that calls with compile-time constant size are handled correctly
-; WIR-LABEL: @memcpy_known_size
-; WIR: entry:
-; WIR: br label %load-store-loop
-; WIR: load-store-loop:
-; WIR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
-; WIR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
-; WIR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
-; WIR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
-; WIR: store i8 [[Load]], i8* [[DstGep]]
-; WIR: [[IndexInc]] = add i64 %loop-index, 1
-; WIR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
-; WIR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
+; IR-LABEL: @memcpy_known_size
+; IR: entry:
+; IR: br label %load-store-loop
+; IR: load-store-loop:
+; IR: %loop-index = phi i64 [ 0, %entry ], [ [[IndexInc:%[0-9]+]], %load-store-loop ]
+; IR: [[SrcGep:%[0-9]+]] = getelementptr inbounds i8, i8* %src, i64 %loop-index
+; IR: [[Load:%[0-9]+]] = load i8, i8* [[SrcGep]]
+; IR: [[DstGep:%[0-9]+]] = getelementptr inbounds i8, i8* %dst, i64 %loop-index
+; IR: store i8 [[Load]], i8* [[DstGep]]
+; IR: [[IndexInc]] = add i64 %loop-index, 1
+; IR: [[Cond:%[0-9]+]] = icmp ult i64 %3, 144
+; IR: br i1 [[Cond]], label %load-store-loop, label %memcpy-split
 }
 
 define i8* @memset_caller(i8* %dst, i32 %c, i64 %n) #0 {