diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h --- a/llvm/include/llvm/IR/IRBuilder.h +++ b/llvm/include/llvm/IR/IRBuilder.h @@ -647,8 +647,11 @@ TBAAStructTag, ScopeTag, NoAliasTag); } - CallInst *CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src, - MaybeAlign SrcAlign, Value *Size); + CallInst * + CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, Value *Src, + MaybeAlign SrcAlign, Value *Size, bool isVolatile = false, + MDNode *TBAATag = nullptr, MDNode *TBAAStructTag = nullptr, + MDNode *ScopeTag = nullptr, MDNode *NoAliasTag = nullptr); /// Create and insert an element unordered-atomic memcpy between the /// specified pointers. diff --git a/llvm/include/llvm/IR/InstVisitor.h b/llvm/include/llvm/IR/InstVisitor.h --- a/llvm/include/llvm/IR/InstVisitor.h +++ b/llvm/include/llvm/IR/InstVisitor.h @@ -209,6 +209,9 @@ RetTy visitDbgInfoIntrinsic(DbgInfoIntrinsic &I){ DELEGATE(IntrinsicInst); } RetTy visitMemSetInst(MemSetInst &I) { DELEGATE(MemIntrinsic); } RetTy visitMemCpyInst(MemCpyInst &I) { DELEGATE(MemTransferInst); } + RetTy visitMemCpyInlineInst(MemCpyInlineInst &I) { + DELEGATE(MemTransferInst); + } RetTy visitMemMoveInst(MemMoveInst &I) { DELEGATE(MemTransferInst); } RetTy visitMemTransferInst(MemTransferInst &I) { DELEGATE(MemIntrinsic); } RetTy visitMemIntrinsic(MemIntrinsic &I) { DELEGATE(IntrinsicInst); } diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h --- a/llvm/include/llvm/IR/IntrinsicInst.h +++ b/llvm/include/llvm/IR/IntrinsicInst.h @@ -902,7 +902,8 @@ public: // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { - return I->getIntrinsicID() == Intrinsic::memcpy; + return I->getIntrinsicID() == Intrinsic::memcpy || + I->getIntrinsicID() == Intrinsic::memcpy_inline; } static bool classof(const Value *V) { return isa(V) && classof(cast(V)); @@ -922,10 +923,10 @@ }; /// This class wraps the llvm.memcpy.inline intrinsic. -class MemCpyInlineInst : public MemTransferInst { +class MemCpyInlineInst : public MemCpyInst { public: ConstantInt *getLength() const { - return cast(MemTransferInst::getLength()); + return cast(MemCpyInst::getLength()); } // Methods for support type inquiry through isa, cast, and dyn_cast: static bool classof(const IntrinsicInst *I) { diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -203,14 +203,14 @@ return CI; } -CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign, - Value *Src, MaybeAlign SrcAlign, - Value *Size) { +CallInst *IRBuilderBase::CreateMemCpyInline( + Value *Dst, MaybeAlign DstAlign, Value *Src, MaybeAlign SrcAlign, + Value *Size, bool isVolatile, MDNode *TBAATag, MDNode *TBAAStructTag, + MDNode *ScopeTag, MDNode *NoAliasTag) { Dst = getCastedInt8PtrValue(Dst); Src = getCastedInt8PtrValue(Src); - Value *IsVolatile = getInt1(false); - Value *Ops[] = {Dst, Src, Size, IsVolatile}; + Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()}; Function *F = BB->getParent(); Module *M = F->getParent(); @@ -224,6 +224,20 @@ if (SrcAlign) MCI->setSourceAlignment(*SrcAlign); + // Set the TBAA info if present. + if (TBAATag) + MCI->setMetadata(LLVMContext::MD_tbaa, TBAATag); + + // Set the TBAA Struct info if present. + if (TBAAStructTag) + MCI->setMetadata(LLVMContext::MD_tbaa_struct, TBAAStructTag); + + if (ScopeTag) + MCI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag); + + if (NoAliasTag) + MCI->setMetadata(LLVMContext::MD_noalias, NoAliasTag); + return CI; } diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp --- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp +++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp @@ -953,7 +953,13 @@ if (Dest == OldV) Dest = NewV; - if (isa(MTI)) { + if (isa(MTI)) { + MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); + B.CreateMemCpyInline(Dest, MTI->getDestAlign(), Src, + MTI->getSourceAlign(), MTI->getLength(), + false, // isVolatile + TBAA, TBAAStruct, ScopeMD, NoAliasMD); + } else if (isa(MTI)) { MDNode *TBAAStruct = MTI->getMetadata(LLVMContext::MD_tbaa_struct); B.CreateMemCpy(Dest, MTI->getDestAlign(), Src, MTI->getSourceAlign(), MTI->getLength(), diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp --- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp +++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp @@ -831,7 +831,7 @@ return false; // If we're not allowed to hack on memcpy, we fail. - if (!HasMemcpy || DisableLIRP::Memcpy) + if ((!HasMemcpy && !isa(MCI)) || DisableLIRP::Memcpy) return false; Value *Dest = MCI->getDest(); @@ -1190,6 +1190,13 @@ MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad, const SCEVAddRecExpr *StoreEv, const SCEVAddRecExpr *LoadEv, const SCEV *BECount) { + + // FIXME: until llvm.memcpy.inline supports dynamic sizes, we need to + // conservatively bail here, since otherwise we may have to transform + // llvm.memcpy.inline into llvm.memcpy which is illegal. + if (isa(TheStore)) + return false; + // The trip count of the loop and the base pointer of the addrec SCEV is // guaranteed to be loop invariant, which means that it should dominate the // header. This allows us to insert code for it in the preheader. diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp --- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -691,7 +691,7 @@ // We found an instruction that may write to the loaded memory. // We can try to promote at this position instead of the store - // position if nothing alias the store memory after this and the store + // position if nothing aliases the store memory after this and the store // destination is not in the range. if (P && P != SI) { if (!moveUp(SI, P, LI)) @@ -1109,7 +1109,14 @@ NewM = Builder.CreateMemMove(M->getRawDest(), M->getDestAlign(), MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); - else + else if (isa(M)) { + // llvm.memcpy may be promoted to llvm.memcpy.inline, but the converse is + // never allowed since that would allow the latter to be lowered as a call + // to an external function. + NewM = Builder.CreateMemCpyInline( + M->getRawDest(), M->getDestAlign(), MDep->getRawSource(), + MDep->getSourceAlign(), M->getLength(), M->isVolatile()); + } else NewM = Builder.CreateMemCpy(M->getRawDest(), M->getDestAlign(), MDep->getRawSource(), MDep->getSourceAlign(), M->getLength(), M->isVolatile()); diff --git a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll --- a/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll +++ b/llvm/test/Transforms/InferAddressSpaces/AMDGPU/mem-intrinsics.ll @@ -8,6 +8,7 @@ ret void } + ; CHECK-LABEL: @memset_global_to_flat( ; CHECK: call void @llvm.memset.p1i8.i64(i8 addrspace(1)* align 4 %global.ptr, i8 4, i64 32, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memset_global_to_flat(i8 addrspace(1)* %global.ptr, i32 %y) #0 { @@ -31,7 +32,6 @@ call void @llvm.memset.p0i8.i64(i8* align 4 %cast, i8 4, i64 %size, i1 false) ret void } - ; CHECK-LABEL: @memcpy_flat_to_flat_replace_src_with_group( ; CHECK: call void @llvm.memcpy.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memcpy_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr, i64 %size) #0 { @@ -40,6 +40,14 @@ ret void } +; CHECK-LABEL: @memcpy_inline_flat_to_flat_replace_src_with_group( +; CHECK: call void @llvm.memcpy.inline.p0i8.p3i8.i64(i8* align 4 %dest, i8 addrspace(3)* align 4 %src.group.ptr, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 +define amdgpu_kernel void @memcpy_inline_flat_to_flat_replace_src_with_group(i8* %dest, i8 addrspace(3)* %src.group.ptr) #0 { + %cast.src = addrspacecast i8 addrspace(3)* %src.group.ptr to i8* + call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* align 4 %dest, i8* align 4 %cast.src, i64 42, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 + ret void +} + ; CHECK-LABEL: @memcpy_flat_to_flat_replace_dest_with_group( ; CHECK: call void @llvm.memcpy.p3i8.p0i8.i64(i8 addrspace(3)* align 4 %dest.group.ptr, i8* align 4 %src.ptr, i64 %size, i1 false), !tbaa !0, !alias.scope !3, !noalias !4 define amdgpu_kernel void @memcpy_flat_to_flat_replace_dest_with_group(i8 addrspace(3)* %dest.group.ptr, i8* %src.ptr, i64 %size) #0 { @@ -118,6 +126,7 @@ declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1) #1 declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 +declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 declare void @llvm.memcpy.p0i8.p3i8.i32(i8* nocapture writeonly, i8 addrspace(3)* nocapture readonly, i32, i1) #1 declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1) #1 diff --git a/llvm/test/Transforms/LoopIdiom/memcpy-inline-intrinsic.ll b/llvm/test/Transforms/LoopIdiom/memcpy-inline-intrinsic.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/LoopIdiom/memcpy-inline-intrinsic.ll @@ -0,0 +1,64 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py +; RUN: opt -loop-idiom < %s -S | FileCheck %s + +%struct.S = type { i32, i32, i8 } + +; unsigned copy_noalias(S* __restrict a, S *b, int n) { +; for (int i = 0; i < n; i++) { +; a[i] = b[i]; +; } +; return sizeof(a[0]); +; } + +; Function Attrs: nofree nounwind uwtable mustprogress +define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 { +; CHECK-LABEL: @copy_noalias( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0 +; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK: for.body.preheader: +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.cond.cleanup.loopexit: +; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK: for.cond.cleanup: +; CHECK-NEXT: ret i32 12 +; CHECK: for.body: +; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ] +; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]] +; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8* +; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8* +; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false) +; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; +entry: + %cmp7 = icmp sgt i32 %n, 0 + br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: ; preds = %entry + br label %for.body + +for.cond.cleanup.loopexit: ; preds = %for.body + br label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry + ret i32 12 + +for.body: ; preds = %for.body.preheader, %for.body + %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ] + %idxprom = zext i32 %i.08 to i64 + %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom + %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom + %0 = bitcast %struct.S* %arrayidx2 to i8* + %1 = bitcast %struct.S* %arrayidx to i8* + call void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false) + %inc = add nuw nsw i32 %i.08, 1 + %cmp = icmp slt i32 %inc, %n + br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit +} + +; Function Attrs: argmemonly nofree nosync nounwind willreturn +declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1 diff --git a/llvm/test/Transforms/MemCpyOpt/memcpy.ll b/llvm/test/Transforms/MemCpyOpt/memcpy.ll --- a/llvm/test/Transforms/MemCpyOpt/memcpy.ll +++ b/llvm/test/Transforms/MemCpyOpt/memcpy.ll @@ -8,6 +8,14 @@ %0 = type { x86_fp80, x86_fp80 } %1 = type { i32, i32 } +declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + define void @test1(%0* sret(%0) %agg.result, x86_fp80 %z.0, x86_fp80 %z.1) nounwind { ; CHECK-LABEL: @test1( ; CHECK-NEXT: entry: @@ -71,7 +79,49 @@ } +; Same as @test2_memcpy, but the remaining memcpy should remain non-inline even +; if the one eliminated was inline. +define void @test3_memcpy(i8* noalias %P, i8* noalias %Q) nounwind { +; CHECK-LABEL: @test3_memcpy( +; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false) +; CHECK-NEXT: ret void +; + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void + +} + +; Same as @test2_memcpy, but the remaining memcpy should remain inline even +; if the one eliminated was not inline. +define void @test4_memcpy(i8* noalias %P, i8* noalias %Q) nounwind { +; CHECK-LABEL: @test4_memcpy( +; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false) +; CHECK-NEXT: ret void +; + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void + +} + +; Same as @test2_memcpy, and the inline-ness should be preserved. +define void @test5_memcpy(i8* noalias %P, i8* noalias %Q) nounwind { +; CHECK-LABEL: @test5_memcpy( +; CHECK-NEXT: call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 [[Q:%.*]], i8* align 16 [[P:%.*]], i32 32, i1 false) +; CHECK-NEXT: ret void +; + %memtmp = alloca %0, align 16 + %R = bitcast %0* %memtmp to i8* + call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %R, i8* align 16 %P, i32 32, i1 false) + call void @llvm.memcpy.inline.p0i8.p0i8.i32(i8* align 16 %Q, i8* align 16 %R, i32 32, i1 false) + ret void +} @x = external global %0 @@ -202,9 +252,6 @@ } declare void @test4a(i8* align 1 byval(i8)) -declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind -declare void @llvm.memcpy.p0i8.p1i8.i64(i8* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind -declare void @llvm.memcpy.p1i8.p1i8.i64(i8 addrspace(1)* nocapture, i8 addrspace(1)* nocapture, i64, i1) nounwind %struct.S = type { i128, [4 x i8]} @@ -266,7 +313,6 @@ declare i32 @g(%struct.p* align 8 byval(%struct.p)) -declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i1) nounwind ; PR11142 - When looking for a memcpy-memcpy dependency, don't get stuck on ; instructions between the memcpy's that only affect the destination pointer. @@ -375,14 +421,5 @@ ret void } -declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind -declare void @llvm.memcpy.p1i8.p0i8.i64(i8 addrspace(1)* nocapture, i8* nocapture, i64, i1) nounwind - declare void @f1(%struct.big* nocapture sret(%struct.big)) declare void @f2(%struct.big*) - -; CHECK: attributes #1 = { argmemonly nofree nounwind willreturn } -; CHECK: attributes #2 = { nounwind ssp } -; CHECK: attributes #3 = { willreturn } -; CHECK: attributes #4 = { nounwind ssp uwtable } -; CHECK: attributes #5 = { argmemonly nofree nounwind willreturn writeonly }