diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -1186,6 +1186,51 @@
                                              StoreEv, LoadEv, BECount);
 }
 
+class MemmoveVerifier {
+public:
+  explicit MemmoveVerifier(Value *LoadBasePtr, Value *StoreBasePtr,
+                           const DataLayout *DL)
+      : DL(DL), LoadOff(0), StoreOff(0),
+        BP1(llvm::GetPointerBaseWithConstantOffset(
+            LoadBasePtr->stripPointerCasts(), LoadOff, *DL)),
+        BP2(llvm::GetPointerBaseWithConstantOffset(
+            StoreBasePtr->stripPointerCasts(), StoreOff, *DL)),
+        IsSameObject(BP1 == BP2) {}
+
+  bool loadAndStoreMayFormMemmove(unsigned StoreSize, bool NegStride,
+                                  Instruction *TheLoad, bool IsMemCpy) const {
+    if (IsMemCpy) {
+      // Ensure that LoadBasePtr is after StoreBasePtr, or before it for
+      // negative stride. FIXME: Since llvm.memcpy's dest and src may be
+      // equal, an extra check is needed here.
+      if ((!NegStride && LoadOff <= StoreOff) ||
+          (NegStride && LoadOff >= StoreOff))
+        return false;
+    } else {
+      // Ensure that LoadBasePtr is after StoreBasePtr, or before it for
+      // negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
+      int64_t LoadSize =
+          DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
+      if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
+        return false;
+      if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
+          (NegStride && LoadOff + LoadSize > StoreOff))
+        return false;
+    }
+    return true;
+  }
+
+private:
+  const DataLayout *DL;
+  int64_t LoadOff;
+  int64_t StoreOff;
+  const Value *BP1;
+  const Value *BP2;
+
+public:
+  const bool IsSameObject;
+};
+
 bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
     Value *DestPtr, Value *SourcePtr, unsigned StoreSize, MaybeAlign StoreAlign,
     MaybeAlign LoadAlign, Instruction *TheStore, Instruction *TheLoad,
@@ -1243,10 +1288,10 @@
   bool IsMemCpy = isa<MemCpyInst>(TheStore);
   const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
 
-  bool UseMemMove =
+  bool LoopAccessStore =
       mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
                             StoreSize, *AA, Stores);
-  if (UseMemMove) {
+  if (LoopAccessStore) {
     Stores.insert(TheLoad);
     if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop,
                               BECount, StoreSize, *AA, Stores)) {
@@ -1279,34 +1324,33 @@
   // the load memory locations. So remove it from the ignored stores.
   if (IsMemCpy)
     Stores.erase(TheStore);
+  MemmoveVerifier Verifier(LoadBasePtr, StoreBasePtr, DL);
   if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
                             StoreSize, *AA, Stores)) {
-    ORE.emit([&]() {
-      return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
-             << ore::NV("Inst", InstRemark) << " in "
-             << ore::NV("Function", TheStore->getFunction())
-             << " function will not be hoisted: "
-             << ore::NV("Reason", "The loop may access load location");
-    });
-    return Changed;
-  }
-  if (UseMemMove) {
-    // Ensure that LoadBasePtr is after StoreBasePtr or before StoreBasePtr for
-    // negative stride. LoadBasePtr shouldn't overlap with StoreBasePtr.
-    int64_t LoadOff = 0, StoreOff = 0;
-    const Value *BP1 = llvm::GetPointerBaseWithConstantOffset(
-        LoadBasePtr->stripPointerCasts(), LoadOff, *DL);
-    const Value *BP2 = llvm::GetPointerBaseWithConstantOffset(
-        StoreBasePtr->stripPointerCasts(), StoreOff, *DL);
-    int64_t LoadSize =
-        DL->getTypeSizeInBits(TheLoad->getType()).getFixedSize() / 8;
-    if (BP1 != BP2 || LoadSize != int64_t(StoreSize))
-      return Changed;
-    if ((!NegStride && LoadOff < StoreOff + int64_t(StoreSize)) ||
-        (NegStride && LoadOff + LoadSize > StoreOff))
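+    // e.g. a loop doing "p[i] = p[i + 1]" lowered as a per-iteration memcpy
+    // reads and writes the same underlying object, so the mod check above
+    // fires even though the loop is still a valid memmove; the IsSameObject
+    // test below keeps exactly that case alive (see
+    // @loop_with_memcpy_PR46179_positive_stride in the tests).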
+    if (!IsMemCpy) {
+      ORE.emit([&]() {
+        return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad",
+                                        TheLoad)
+               << ore::NV("Inst", InstRemark) << " in "
+               << ore::NV("Function", TheStore->getFunction())
+               << " function will not be hoisted: "
+               << ore::NV("Reason", "The loop may access load location");
+      });
       return Changed;
+    } else {
+      // At this point the loop may access the load location only through the
+      // memcpy in the same underlying object. If that's not the case, bail
+      // out.
+      if (!Verifier.IsSameObject)
+        return Changed;
+    }
   }
 
+  bool UseMemMove = IsMemCpy ? Verifier.IsSameObject : LoopAccessStore;
+  if (UseMemMove)
+    if (!Verifier.loadAndStoreMayFormMemmove(StoreSize, NegStride, TheLoad,
+                                             IsMemCpy))
+      return Changed;
+
   if (avoidLIRForMultiBlockLoop())
     return Changed;
 
diff --git a/llvm/test/Transforms/LoopIdiom/basic.ll b/llvm/test/Transforms/LoopIdiom/basic.ll
--- a/llvm/test/Transforms/LoopIdiom/basic.ll
+++ b/llvm/test/Transforms/LoopIdiom/basic.ll
@@ -1106,6 +1106,43 @@
   ret void
 }
 
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg)
+
+;; Memmove formation. We expect exactly the same memmove as in the @PR46179_positive_stride output.
+define void @loop_with_memcpy_PR46179_positive_stride(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @loop_with_memcpy_PR46179_positive_stride(
+; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 [[SRC]], i8* align 1 [[SCEVGEP]], i64 [[SIZE:%.*]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr i8, i8* [[SRC]], i64 [[STEP]]
+; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr i8, i8* [[SRC]], i64 [[INDVAR]]
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+bb.nph:
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 0, %bb.nph ], [ %indvar.next, %for.body ]
+  %Step = add nuw nsw i64 %indvar, 1
+  %SrcI = getelementptr i8, i8* %Src, i64 %Step
+  %DestI = getelementptr i8, i8* %Src, i64 %indvar
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 1, i1 false)
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
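+;; For example, with Size = 3 the loop above performs Src[0] = Src[1],
+;; Src[1] = Src[2], Src[2] = Src[3] in order, so every byte is read before it
+;; is overwritten; that is precisely the forward overlap that
+;; memmove(Src, Src + 1, 3) is guaranteed to handle.
+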
 ;; Memmove formation.
 define void @PR46179_negative_stride(i8* %Src, i64 %Size) {
 ; CHECK-LABEL: @PR46179_negative_stride(
@@ -1145,7 +1182,82 @@
   ret void
 }
 
-;; Do not form memmove from previous store when stride is positive.
+;; Memmove formation. We expect exactly the same memmove as in the @PR46179_negative_stride output.
+define void @loop_with_memcpy_PR46179_negative_stride(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @loop_with_memcpy_PR46179_negative_stride(
+; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i64 [[SIZE:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 1
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 [[SCEVGEP]], i8* align 1 [[SRC]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[STEP:%.*]], [[FOR_BODY]] ], [ [[SIZE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[STEP]] = add nsw i64 [[INDVAR]], -1
+; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[STEP]]
+; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[INDVAR]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp sgt i64 [[INDVAR]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+bb.nph:
+  %cmp1 = icmp sgt i64 %Size, 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ %Step, %for.body ], [ %Size, %bb.nph ]
+  %Step = add nsw i64 %indvar, -1
+  %SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
+  %DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 1, i1 false)
+  %exitcond = icmp sgt i64 %indvar, 1
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %bb.nph
+  ret void
+}
+
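+;; Same picture for the negative stride, with Size = 3: the loop performs
+;; Src[3] = Src[2], Src[2] = Src[1], Src[1] = Src[0], again reading each byte
+;; before clobbering it, which matches memmove(Src + 1, Src, 3) copying
+;; backwards through the overlap.
+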
+;; Memmove formation with a 16 byte stride.
+define void @loop_with_memcpy_stride16(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @loop_with_memcpy_stride16(
+; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    [[SCEVGEP:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 16
+; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[SIZE:%.*]], i64 16)
+; CHECK-NEXT:    [[TMP0:%.*]] = add nsw i64 [[SMAX]], -1
+; CHECK-NEXT:    [[TMP1:%.*]] = lshr i64 [[TMP0]], 4
+; CHECK-NEXT:    [[TMP2:%.*]] = shl nuw nsw i64 [[TMP1]], 4
+; CHECK-NEXT:    [[TMP3:%.*]] = add nuw i64 [[TMP2]], 16
+; CHECK-NEXT:    call void @llvm.memmove.p0i8.p0i8.i64(i8* align 1 [[SRC]], i8* align 1 [[SCEVGEP]], i64 [[TMP3]], i1 false)
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[STEP:%.*]], [[FOR_BODY]] ], [ 0, [[BB_NPH:%.*]] ]
+; CHECK-NEXT:    [[STEP]] = add nuw nsw i64 [[INDVAR]], 16
+; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[STEP]]
+; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[INDVAR]]
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp slt i64 [[STEP]], [[SIZE]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END:%.*]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+bb.nph:
+  br label %for.body
+
+for.body:                                         ; preds = %for.body, %bb.nph
+  %indvar = phi i64 [ %Step, %for.body ], [ 0, %bb.nph ]
+  %Step = add nuw nsw i64 %indvar, 16
+  %SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
+  %DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 16, i1 false)
+  %exitcond = icmp slt i64 %Step, %Size
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
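+;; The memmove length above is the byte count SCEV derives from the trip
+;; count: ((smax(Size, 16) - 1) / 16) * 16 + 16. For example, Size = 40 gives
+;; (39 / 16) * 16 + 16 = 48, i.e. the three 16-byte iterations at indvar = 0,
+;; 16 and 32.
+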
+;; Do not form memmove from previous load when stride is positive.
 define void @do_not_form_memmove1(i8* %Src, i64 %Size) {
 ; CHECK-LABEL: @do_not_form_memmove1(
 ; CHECK-NEXT:  bb.nph:
@@ -1181,10 +1293,44 @@
   ret void
 }
 
-;; Do not form memmove from next store when stride is negative.
+;; Do not form memmove from previous load in memcpy when stride is positive.
 define void @do_not_form_memmove2(i8* %Src, i64 %Size) {
 ; CHECK-LABEL: @do_not_form_memmove2(
 ; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 1, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
+; CHECK-NEXT:    [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], -1
+; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr i8, i8* [[SRC:%.*]], i64 [[STEP]]
+; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr i8, i8* [[SRC]], i64 [[INDVAR]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DESTI]], i8* align 1 [[SRCI]], i64 1, i1 false)
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.*]]
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+bb.nph:
+  br label %for.body
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ 1, %bb.nph ], [ %indvar.next, %for.body ]
+  %Step = add nuw nsw i64 %indvar, -1
+  %SrcI = getelementptr i8, i8* %Src, i64 %Step
+  %DestI = getelementptr i8, i8* %Src, i64 %indvar
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 1, i1 false)
+  %indvar.next = add i64 %indvar, 1
+  %exitcond = icmp eq i64 %indvar.next, %Size
+  br i1 %exitcond, label %for.end, label %for.body
+
+for.end:                                          ; preds = %for.body
+  ret void
+}
+
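+;; Sketch of the rejection: iteration i performs Src[i] = Src[i - 1], so
+;; iteration 2 reads the byte iteration 1 just wrote and the loop fills
+;; Src[1..Size-1] with Src[0], whereas memmove(Src + 1, Src, Size - 1) would
+;; copy the original bytes. This is the positive-stride LoadOff <= StoreOff
+;; case in MemmoveVerifier.
+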
+;; Do not form memmove from next load when stride is negative.
+define void @do_not_form_memmove3(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @do_not_form_memmove3(
+; CHECK-NEXT:  bb.nph:
 ; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i64 [[SIZE:%.*]], 0
 ; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
 ; CHECK:       for.body.preheader:
@@ -1221,9 +1367,47 @@
   ret void
 }
 
+;; Do not form memmove from next load in memcpy when stride is negative.
+define void @do_not_form_memmove4(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @do_not_form_memmove4(
+; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp sgt i64 [[SIZE:%.*]], 0
+; CHECK-NEXT:    br i1 [[CMP1]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; CHECK:       for.body.preheader:
+; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
+; CHECK:       for.body:
+; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ], [ [[SIZE]], [[FOR_BODY_PREHEADER]] ]
+; CHECK-NEXT:    [[STEP:%.*]] = add nuw nsw i64 [[INDVAR]], 1
+; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr inbounds i8, i8* [[SRC:%.*]], i64 [[STEP]]
+; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr inbounds i8, i8* [[SRC]], i64 [[INDVAR]]
+; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DESTI]], i8* align 1 [[SRCI]], i64 1, i1 false)
+; CHECK-NEXT:    [[INDVAR_NEXT]] = add nsw i64 [[INDVAR]], -1
+; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp sgt i64 [[INDVAR]], 1
+; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_BODY]], label [[FOR_END]]
+; CHECK:       for.end:
+; CHECK-NEXT:    ret void
+;
+bb.nph:
+  %cmp1 = icmp sgt i64 %Size, 0
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %bb.nph, %for.body
+  %indvar = phi i64 [ %indvar.next, %for.body ], [ %Size, %bb.nph ]
+  %Step = add nuw nsw i64 %indvar, 1
+  %SrcI = getelementptr inbounds i8, i8* %Src, i64 %Step
+  %DestI = getelementptr inbounds i8, i8* %Src, i64 %indvar
+  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 1, i1 false)
+  %indvar.next = add nsw i64 %indvar, -1
+  %exitcond = icmp sgt i64 %indvar, 1
+  br i1 %exitcond, label %for.body, label %for.end
+
+for.end:                                          ; preds = %for.body, %bb.nph
+  ret void
+}
+
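+;; Sketch of the rejection: walking downwards, iteration i performs
+;; Src[i] = Src[i + 1] after iteration i + 1 has already overwritten that
+;; byte, so the value at Src[Size + 1] is smeared down the array; no single
+;; memmove reproduces that. This is the negative-stride LoadOff >= StoreOff
+;; case in MemmoveVerifier.
+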
 ;; Do not form memmove when underaligned load is overlapped with store.
-define void @do_not_form_memmove3(i32* %s, i64 %size) {
-; CHECK-LABEL: @do_not_form_memmove3(
+define void @do_not_form_memmove5(i32* %s, i64 %size) {
+; CHECK-LABEL: @do_not_form_memmove5(
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[END_IDX:%.*]] = add i64 [[SIZE:%.*]], -1
 ; CHECK-NEXT:    [[END_PTR:%.*]] = getelementptr inbounds i32, i32* [[S:%.*]], i64 [[END_IDX]]
@@ -1264,12 +1448,11 @@
   ret void
 }
 
-declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg)
-
-;; FIXME: Do not form memmove from loop body containing memcpy.
-define void @do_not_form_memmove4(i8* %Src, i64 %Size) {
-; CHECK-LABEL: @do_not_form_memmove4(
+;; Do not form memmove for memcpy with an aliasing store.
+define void @do_not_form_memmove6(i8* %Src, i64 %Size) {
+; CHECK-LABEL: @do_not_form_memmove6(
 ; CHECK-NEXT:  bb.nph:
+; CHECK-NEXT:    [[BASEALIAS:%.*]] = call i8* @external(i8* [[SRC:%.*]])
 ; CHECK-NEXT:    br label [[FOR_BODY:%.*]]
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVAR:%.*]] = phi i64 [ 0, [[BB_NPH:%.*]] ], [ [[INDVAR_NEXT:%.*]], [[FOR_BODY]] ]
@@ -1277,6 +1460,7 @@
 ; CHECK-NEXT:    [[SRCI:%.*]] = getelementptr i8, i8* [[SRC]], i64 [[STEP]]
 ; CHECK-NEXT:    [[DESTI:%.*]] = getelementptr i8, i8* [[SRC]], i64 [[INDVAR]]
 ; CHECK-NEXT:    call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[DESTI]], i8* align 1 [[SRCI]], i64 1, i1 false)
+; CHECK-NEXT:    store i8 4, i8* [[BASEALIAS]], align 1
 ; CHECK-NEXT:    [[INDVAR_NEXT]] = add i64 [[INDVAR]], 1
 ; CHECK-NEXT:    [[EXITCOND:%.*]] = icmp eq i64 [[INDVAR_NEXT]], [[SIZE:%.*]]
 ; CHECK-NEXT:    br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]]
@@ -1284,6 +1468,7 @@
 ; CHECK-NEXT:    ret void
 ;
 bb.nph:
+  %BaseAlias = call i8* @external(i8* %Src)
   br label %for.body
 
 for.body:                                         ; preds = %bb.nph, %for.body
@@ -1292,6 +1477,7 @@
   %SrcI = getelementptr i8, i8* %Src, i64 %Step
   %DestI = getelementptr i8, i8* %Src, i64 %indvar
   call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 %DestI, i8* align 1 %SrcI, i64 1, i1 false)
+  store i8 4, i8* %BaseAlias
   %indvar.next = add i64 %indvar, 1
   %exitcond = icmp eq i64 %indvar.next, %Size
   br i1 %exitcond, label %for.end, label %for.body