This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Transforms/Scalar/
-
Transforms/
-
Scalar/
11/13
LoopIdiomRecognize.cpp
-
test/Transforms/LoopIdiom/
-
Transforms/
-
LoopIdiom/
-
memcpy-debugify-remarks.ll
-
memcpy-intrinsic-different-types.ll
-
memcpy-intrinsic.ll
-
memset-debugify-remarks.ll

Differential D97667

[loop-idiom] Hoist loop memcpys to loop preheader
ClosedPublic

Authored by zhuhan0 on Mar 1 2021, 1:21 AM.

Download Raw Diff

Details

Reviewers

• zino
wenlei
reames
chandlerc
lattner
lebedev.ri
hoy

Commits

rGda1cdffbb1b7: [loop-idiom] Hoist loop memcpys to loop preheader
rG75d6b8bb4056: [loop-idiom] Hoist loop memcpys to loop preheader
rG92ddd3c1b6cd: [loop-idiom] Hoist loop memcpys to loop preheader

Summary

For a simple loop like:

struct S {
  int x;
  int y;
  char b;
};

unsigned foo(S* __restrict__ a, S* b, int n) {
  for (int i = 0; i < n; i++)
    a[i] = b[i];

  return sizeof(a[0]);
}

We could eliminate the loop and convert it to a single large memcpy of 12*n bytes. Currently this is not handled. Output of `opt -loop-idiom -S < memcpy_before.ll`:

%struct.S = type { i32, i32, i8 }

define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret i32 12

for.body:                                         ; preds = %for.body, %for.body.preheader
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %idxprom = zext i32 %i.08 to i64
  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
  %0 = bitcast %struct.S* %arrayidx2 to i8*
  %1 = bitcast %struct.S* %arrayidx to i8*
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
  %inc = add nuw nsw i32 %i.08, 1
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0

attributes #0 = { argmemonly nofree nosync nounwind willreturn }

The loop idiom pass currently only handles load and store instructions. Since struct S is too big to fit in a register, the loop body contains a memcpy intrinsic.

With this change, re-run opt -loop-idiom -S < memcpy_before.ll. The loop memcpy is promoted to loop preheader. For this trivial case, the loop is dead and will be removed by another pass.

%struct.S = type { i32, i32, i8 }

define dso_local i32 @_Z3fooP1SS0_i(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr {
entry:
  %a1 = bitcast %struct.S* %a to i8*
  %b2 = bitcast %struct.S* %b to i8*
  %cmp7 = icmp sgt i32 %n, 0
  br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

for.body.preheader:                               ; preds = %entry
  %0 = zext i32 %n to i64
  %1 = mul nuw nsw i64 %0, 12
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %a1, i8* align 4 %b2, i64 %1, i1 false)
  br label %for.body

for.cond.cleanup.loopexit:                        ; preds = %for.body
  br label %for.cond.cleanup

for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
  ret i32 12

for.body:                                         ; preds = %for.body, %for.body.preheader
  %i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
  %idxprom = zext i32 %i.08 to i64
  %arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
  %arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
  %2 = bitcast %struct.S* %arrayidx2 to i8*
  %3 = bitcast %struct.S* %arrayidx to i8*
  %inc = add nuw nsw i32 %i.08, 1
  %cmp = icmp slt i32 %inc, %n
  br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #0

attributes #0 = { argmemonly nofree nosync nounwind willreturn }

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

zhuhan0 created this revision.Mar 1 2021, 1:21 AM

Herald added subscribers: hoy, jfb, hiraditya. · View Herald TranscriptMar 1 2021, 1:21 AM

zhuhan0 requested review of this revision.Mar 1 2021, 1:21 AM

Herald added a project: Restricted Project. · View Herald TranscriptMar 1 2021, 1:21 AM

Harbormaster completed remote builds in B91294: Diff 327046.Mar 1 2021, 2:20 AM

Fix linter.

Harbormaster completed remote builds in B91383: Diff 327182.Mar 1 2021, 1:56 PM

Add function name to optimization remarks.

Harbormaster completed remote builds in B92607: Diff 328938.Mar 7 2021, 11:18 PM

Friendly ping. I could ask my colleagues to review this, but would appreciate some community feedback. I didn't find a clear code owner for this pass, so I simply put the top contributors to LoopIdiomRecognize.cpp as reviewers. Please let me know if I should put somebody else.

This looks like a great optimization to handle, but I'm not a competent reviewer for this area any longer.

LGTM!

This revision is now accepted and ready to land.Mar 22 2021, 10:09 AM

Typo in description: "perheader".

In D97667#2645105, @foad wrote:

Typo in description: "perheader".

Good catch. Corrected title.

This revision was landed with ongoing or failed builds.Mar 29 2021, 11:42 PM

Closed by commit rG92ddd3c1b6cd: [loop-idiom] Hoist loop memcpys to loop preheader (authored by zhuhan0). · Explain Why

This revision was automatically updated to reflect the committed changes.

zhuhan0 added a commit: rG92ddd3c1b6cd: [loop-idiom] Hoist loop memcpys to loop preheader.

Is @zino someone's replacement account?
I'm asking because the accept of this revision is the only contribution of that account.

krasimir added a reverting change: rG8e7df996e305: Revert "[loop-idiom] Hoist loop memcpys to loop preheader".Mar 30 2021, 2:47 AM

Was just reverted in rG8e7df996e3054cc174b91bc103057747c8349c06.

This revision is now accepted and ready to land.Mar 30 2021, 2:49 AM

lebedev.ri requested changes to this revision.Mar 30 2021, 2:49 AM

This revision now requires changes to proceed.Mar 30 2021, 2:49 AM

In D97667#2657994, @lebedev.ri wrote:

Is @zino someone's replacement account?
I'm asking because the accept of this revision is the only contribution of that account.

Zino is a real LLVM developer. @zino looks like your email is not verified.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
818	No need of this `continue`?
1233	Nit: Use `StringRef` to avoid an extra memory allocation?

hoy edited reviewers, added: hoy; removed: hoyFB.Mar 30 2021, 9:20 AM

In D97667#2657994, @lebedev.ri wrote:

Is @zino someone's replacement account?
I'm asking because the accept of this revision is the only contribution of that account.

Hi Roman, Yes, this is a replacement account for Zino (on PTO for the week), and I believe the original account was https://reviews.llvm.org/p/zinob/

Zino, Hongtao and myself did internal code review for this patch before it's sent up here - we should have pointed that out.

Sorry for the breakage though. Instructions or a reproducer would be helpful.

-Wenlei

In D97667#2658951, @hoy wrote:

In D97667#2657994, @lebedev.ri wrote:

Is @zino someone's replacement account?
I'm asking because the accept of this revision is the only contribution of that account.

Zino is a real LLVM developer. @zino looks like your email is not verified.

Not as per https://reviews.llvm.org/people/commits/21223/ / https://reviews.llvm.org/people/revisions/21223/, which is why i asked.
Docs aren't quite clear on this, but the subtext is that reviews only count
if the reviewer is actually familiar with the code (and that can be seen from git log),
and is an [upstream!] llvm dev.

Can you please

precommit the tests
split this up into whatever nfc preparatory patch and the actual memcpy patch?

@lebedev.ri @zino is the replacement account for @zinob. He had some issue with the old account and couldn't retrieve it.

precommit the tests

Could you elaborate on this? What's the best way to run the failed buildbots build without committing this?

split this up into whatever nfc preparatory patch and the actual memcpy patch?

Will do. Thanks.

zhuhan0 mentioned this in D100979: [loop-idiom][NFC] Extract processLoopStoreOfLoopLoad into a helper function.Apr 21 2021, 10:43 AM

Fix build break. The breakage was a situation where the memcpy source and destination were of different types/sizes. Abort the transformation if that's the case. Also added a test case memcpy-intrinsic-different-types.ll.

Harbormaster completed remote builds in B100049: Diff 339321.Apr 21 2021, 11:02 AM

Also split the preparatory change away into an NFC patch https://reviews.llvm.org/D100979.

Address @hoy's comments.

Harbormaster completed remote builds in B100052: Diff 339324.Apr 21 2021, 11:09 AM

zhuhan0 marked 2 inline comments as done.Apr 21 2021, 11:09 AM

zhuhan0 added a parent revision: D100979: [loop-idiom][NFC] Extract processLoopStoreOfLoopLoad into a helper function.

Rebase.

This was already accepted and you fixed build break, I think you can try to reland it.

In D97667#2710349, @xbolva00 wrote:

This was already accepted and you fixed build break, I think you can try to reland it.

Ah I didn't know that. Thanks! Still not very familiar with upstream etiquette.

I'm actually going on a trip soon and will land this stack when I return next week.

Harbormaster completed remote builds in B100393: Diff 339798.Apr 22 2021, 5:19 PM

foad removed a subscriber: foad.Apr 23 2021, 12:21 AM

typo

This revision was not accepted when it landed; it landed in state Needs Review.Apr 27 2021, 5:40 PM

This revision was landed with ongoing or failed builds.

Closed by commit rG75d6b8bb4056: [loop-idiom] Hoist loop memcpys to loop preheader (authored by zhuhan0). · Explain Why

This revision was automatically updated to reflect the committed changes.

zhuhan0 added a commit: rG75d6b8bb4056: [loop-idiom] Hoist loop memcpys to loop preheader.

Harbormaster completed remote builds in B101301: Diff 341030.Apr 27 2021, 8:20 PM

To clarify, i was essentially asking to get an independent reviewer to re-review this,
looks like that didn't happen.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
856	This is the first time i'm seeing `Str` used as short for `Store`. Please use `Store` directly.
868–870	This doesn't make sense. Strides of load and store must match exactly. Doesn't this miscompile the case where load goes forward and store backward or vice versa?

lebedev.ri added inline comments.Apr 28 2021, 2:46 AM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
868–870	Also, i would like to see better test coverage: negative stride, and load/store stride sign mismatch.

lebedev.ri's concerns seem to have been valid, so I'll be rolling this back. A test case in XLA that reverses data across certain dimensions in a multidimensional change fails with this patch. A sequence of loads and stores is converted into a single memcpy even though the ordering should be different across loads and stores.

Un-optimized input:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@0 = external dso_local unnamed_addr constant [96 x i8], align 16

; Function Attrs: uwtable
define void @Reverse4DFloatArrayOnDim01.3(i8* %retval, i8* noalias %run_options, i8** noalias %params, i8** noalias %buffer_table, i64* noalias %prof_counters) #0 {
entry:
  %reverse.2.invar_address.dim.3 = alloca i64, align 8
  %reverse.2.invar_address.dim.2 = alloca i64, align 8
  %reverse.2.invar_address.dim.1 = alloca i64, align 8
  %reverse.2.invar_address.dim.0 = alloca i64, align 8
  %0 = getelementptr inbounds i8*, i8** %buffer_table, i64 0
  %1 = load i8*, i8** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %reverse.2 = bitcast i8* %1 to [4 x [3 x [2 x [1 x float]]]]*
  store i64 0, i64* %reverse.2.invar_address.dim.0, align 8
  br label %reverse.2.loop_header.dim.0

reverse.2.loop_header.dim.0:                      ; preds = %reverse.2.loop_exit.dim.1, %entry
  %reverse.2.indvar.dim.0 = load i64, i64* %reverse.2.invar_address.dim.0, align 8
  %2 = icmp uge i64 %reverse.2.indvar.dim.0, 4
  br i1 %2, label %reverse.2.loop_exit.dim.0, label %reverse.2.loop_body.dim.0

reverse.2.loop_body.dim.0:                        ; preds = %reverse.2.loop_header.dim.0
  store i64 0, i64* %reverse.2.invar_address.dim.1, align 8
  br label %reverse.2.loop_header.dim.1

reverse.2.loop_header.dim.1:                      ; preds = %reverse.2.loop_exit.dim.2, %reverse.2.loop_body.dim.0
  %reverse.2.indvar.dim.1 = load i64, i64* %reverse.2.invar_address.dim.1, align 8
  %3 = icmp uge i64 %reverse.2.indvar.dim.1, 3
  br i1 %3, label %reverse.2.loop_exit.dim.1, label %reverse.2.loop_body.dim.1

reverse.2.loop_body.dim.1:                        ; preds = %reverse.2.loop_header.dim.1
  store i64 0, i64* %reverse.2.invar_address.dim.2, align 8
  br label %reverse.2.loop_header.dim.2

reverse.2.loop_header.dim.2:                      ; preds = %reverse.2.loop_exit.dim.3, %reverse.2.loop_body.dim.1
  %reverse.2.indvar.dim.2 = load i64, i64* %reverse.2.invar_address.dim.2, align 8
  %4 = icmp uge i64 %reverse.2.indvar.dim.2, 2
  br i1 %4, label %reverse.2.loop_exit.dim.2, label %reverse.2.loop_body.dim.2

reverse.2.loop_body.dim.2:                        ; preds = %reverse.2.loop_header.dim.2
  store i64 0, i64* %reverse.2.invar_address.dim.3, align 8
  br label %reverse.2.loop_header.dim.3

reverse.2.loop_header.dim.3:                      ; preds = %reverse.2.loop_body.dim.3, %reverse.2.loop_body.dim.2
  %reverse.2.indvar.dim.3 = load i64, i64* %reverse.2.invar_address.dim.3, align 8
  %5 = icmp uge i64 %reverse.2.indvar.dim.3, 1
  br i1 %5, label %reverse.2.loop_exit.dim.3, label %reverse.2.loop_body.dim.3

reverse.2.loop_body.dim.3:                        ; preds = %reverse.2.loop_header.dim.3
  %6 = sub i64 3, %reverse.2.indvar.dim.0
  %7 = sub i64 2, %reverse.2.indvar.dim.1
  %8 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* bitcast ([96 x i8]* @0 to [4 x [3 x [2 x [1 x float]]]]*), i64 0, i64 %6, i64 %7, i64 %reverse.2.indvar.dim.2, i64 0
  %9 = load float, float* %8, align 4, !alias.scope !3, !noalias !6
  %10 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %reverse.2, i64 0, i64 %reverse.2.indvar.dim.0, i64 %reverse.2.indvar.dim.1, i64 %reverse.2.indvar.dim.2, i64 0
  store float %9, float* %10, align 4, !alias.scope !6, !noalias !3
  %invar.inc3 = add nuw nsw i64 %reverse.2.indvar.dim.3, 1
  store i64 %invar.inc3, i64* %reverse.2.invar_address.dim.3, align 8
  br label %reverse.2.loop_header.dim.3

reverse.2.loop_exit.dim.3:                        ; preds = %reverse.2.loop_header.dim.3
  %invar.inc2 = add nuw nsw i64 %reverse.2.indvar.dim.2, 1
  store i64 %invar.inc2, i64* %reverse.2.invar_address.dim.2, align 8
  br label %reverse.2.loop_header.dim.2

reverse.2.loop_exit.dim.2:                        ; preds = %reverse.2.loop_header.dim.2
  %invar.inc1 = add nuw nsw i64 %reverse.2.indvar.dim.1, 1
  store i64 %invar.inc1, i64* %reverse.2.invar_address.dim.1, align 8
  br label %reverse.2.loop_header.dim.1

reverse.2.loop_exit.dim.1:                        ; preds = %reverse.2.loop_header.dim.1
  %invar.inc = add nuw nsw i64 %reverse.2.indvar.dim.0, 1
  store i64 %invar.inc, i64* %reverse.2.invar_address.dim.0, align 8
  br label %reverse.2.loop_header.dim.0

reverse.2.loop_exit.dim.0:                        ; preds = %reverse.2.loop_header.dim.0
  ret void
}

attributes #0 = { uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}
!3 = !{!4}
!4 = !{!"buffer: {index:1, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:96}", !5}

Before this patch:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

; Function Attrs: nofree norecurse nosync nounwind uwtable
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  %0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
  %1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %2 = bitcast [4 x [3 x [2 x [1 x float]]]]* %1 to <4 x i64>*
  store <4 x i64> <i64 4737786809096339456, i64 4733283209467920384, i64 4728779609839501312, i64 4724276010211082240>, <4 x i64>* %2, align 16
  %scevgep.1.1 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 0, i64 0
  %3 = bitcast float* %scevgep.1.1 to <4 x i64>*
  store <4 x i64> <i64 4719772410582138880, i64 4710765211325300736, i64 4701758012068462592, i64 4692750812811624448>, <4 x i64>* %3, align 16
  %scevgep.2.2 = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 2, i64 0, i64 0
  %4 = bitcast float* %scevgep.2.2 to <4 x i64>*
  store <4 x i64> <i64 4683743613553737728, i64 4665729215040061440, i64 4647714816524288000, i64 4611686019492741120>, <4 x i64>* %4, align 16
  ret void
}

attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}

After this patch:

; ModuleID = '__compute_module'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@0 = private unnamed_addr constant [96 x i8] c"\00\00\80?\00\00\00@\00\00@@\00\00\80@\00\00\A0@\00\00\C0@\00\00\E0@\00\00\00A\00\00\10A\00\00 A\00\000A\00\00@A\00\00PA\00\00`A\00\00pA\00\00\80A\00\00\88A\00\00\90A\00\00\98A\00\00\A0A\00\00\A8A\00\00\B0A\00\00\B8A\00\00\C0A", align 16

; Function Attrs: nofree norecurse nosync nounwind uwtable
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  %0 = load i8*, i8** %buffer_table, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 16 dereferenceable(96) %0, i8* noundef nonnull align 8 dereferenceable(96) getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 88), i64 96, i1 false)
  ret void
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }

!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}

Thank you.

@zhuhan0 please do get someone else familiar with the code to re-review this before relanding.
Thank you!

This revision now requires changes to proceed.Apr 28 2021, 4:15 AM

Tres Popp <tpopp@google.com> added a reverting change: rGefce19c3b092: Revert "[loop-idiom] Hoist loop memcpys to loop preheader".Apr 28 2021, 4:16 AM

@tpopp I cannot reproduce your test failure with opt -O2 and -O3. My patch only affects memcpy intrinsics in the loop body. Therefore running your test case shouldn't hit my code. Output of opt -O3:

; ModuleID = 'reverse_4d_float_array.ll'
source_filename = "__compute_module"
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
target triple = "x86_64-grtev4-linux-gnu"

@0 = external dso_local unnamed_addr constant [96 x i8], align 16

; Function Attrs: nofree norecurse nosync nounwind uwtable
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_unnamed_addr #0 {
entry:
  %0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
  %1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !align !2
  %2 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 88) to float*), align 8, !alias.scope !3, !noalias !6
  %3 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 0, i64 0, i64 0
  store float %2, float* %3, align 16, !alias.scope !6, !noalias !3
  %4 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 92) to float*), align 4, !alias.scope !3, !noalias !6
  %5 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 0, i64 1, i64 0
  store float %4, float* %5, align 4, !alias.scope !6, !noalias !3
  %6 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 80) to float*), align 16, !alias.scope !3, !noalias !6
  %7 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 1, i64 0, i64 0
  store float %6, float* %7, align 8, !alias.scope !6, !noalias !3
  %8 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 84) to float*), align 4, !alias.scope !3, !noalias !6
  %9 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 1, i64 1, i64 0
  store float %8, float* %9, align 4, !alias.scope !6, !noalias !3
  %10 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 72) to float*), align 8, !alias.scope !3, !noalias !6
  %11 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 2, i64 0, i64 0
  store float %10, float* %11, align 16, !alias.scope !6, !noalias !3
  %12 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 76) to float*), align 4, !alias.scope !3, !noalias !6
  %13 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 0, i64 2, i64 1, i64 0
  store float %12, float* %13, align 4, !alias.scope !6, !noalias !3
  %14 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 64) to float*), align 16, !alias.scope !3, !noalias !6
  %15 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 0, i64 0, i64 0
  store float %14, float* %15, align 8, !alias.scope !6, !noalias !3
  %16 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 68) to float*), align 4, !alias.scope !3, !noalias !6
  %17 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 0, i64 1, i64 0
  store float %16, float* %17, align 4, !alias.scope !6, !noalias !3
  %18 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 56) to float*), align 8, !alias.scope !3, !noalias !6
  %19 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 0, i64 0
  store float %18, float* %19, align 16, !alias.scope !6, !noalias !3
  %20 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 60) to float*), align 4, !alias.scope !3, !noalias !6
  %21 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 1, i64 1, i64 0
  store float %20, float* %21, align 4, !alias.scope !6, !noalias !3
  %22 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 48) to float*), align 16, !alias.scope !3, !noalias !6
  %23 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 2, i64 0, i64 0
  store float %22, float* %23, align 8, !alias.scope !6, !noalias !3
  %24 = load float, float* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 52) to float*), align 4, !alias.scope !3, !noalias !6
  %25 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 1, i64 2, i64 1, i64 0
  store float %24, float* %25, align 4, !alias.scope !6, !noalias !3
  %26 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 0, i64 0, i64 0
  %27 = load <4 x float>, <4 x float>* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 32) to <4 x float>*), align 16, !alias.scope !3, !noalias !6
  %shuffle = shufflevector <4 x float> %27, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %28 = bitcast float* %26 to <4 x float>*
  store <4 x float> %shuffle, <4 x float>* %28, align 16, !alias.scope !6, !noalias !3
  %29 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 2, i64 2, i64 0, i64 0
  %30 = load <4 x float>, <4 x float>* bitcast (i8* getelementptr inbounds ([96 x i8], [96 x i8]* @0, i64 0, i64 16) to <4 x float>*), align 16, !alias.scope !3, !noalias !6
  %shuffle7 = shufflevector <4 x float> %30, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %31 = bitcast float* %29 to <4 x float>*
  store <4 x float> %shuffle7, <4 x float>* %31, align 16, !alias.scope !6, !noalias !3
  %32 = getelementptr inbounds [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 3, i64 1, i64 0, i64 0
  %33 = load <4 x float>, <4 x float>* bitcast ([96 x i8]* @0 to <4 x float>*), align 16, !alias.scope !3, !noalias !6
  %shuffle8 = shufflevector <4 x float> %33, <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 0, i32 1>
  %34 = bitcast float* %32 to <4 x float>*
  store <4 x float> %shuffle8, <4 x float>* %34, align 16, !alias.scope !6, !noalias !3
  ret void
}

attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }

!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}
!3 = !{!4}
!4 = !{!"buffer: {index:1, offset:0, size:96}", !5}
!5 = !{!"XLA global AA domain"}
!6 = !{!7}
!7 = !{!"buffer: {index:0, offset:0, size:96}", !5}

Do you have different compiler args to hit this test failure? Or is this not even an llvm test case?

zhuhan0 added inline comments.Apr 28 2021, 12:16 PM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
868–870	> This doesn't make sense. Strides of load and store must match exactly. Doesn't this miscompile the case where load goes forward and store backward or vice versa?
The strides of load and store can be different if the memcpy source and destination are of different types. See my test `memcpy-intrinsic-different-types.ll`, which was derived from the build failure we saw earlier. I printed out `LoadIntStride` and `StoreIntStride`: LoadIntStride: 32, StoreIntStride: 12. Lines 869-870 check for this case and fix the build failure.
868–870	> Also, i would like to see better test coverage: negative stride, and load/store stride sign mismatch.
This is reasonable. I'll add those.

lebedev.ri added inline comments.Apr 28 2021, 12:46 PM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
868–870	I believe, you have answered a question different from the one i asked.

Rename variables, fix stride check, add two tests and one more remark.

zhuhan0 marked 4 inline comments as done.Apr 28 2021, 3:47 PM

zhuhan0 added inline comments.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
868–870	You're right. That was a silly error. Fixed.

Harbormaster completed remote builds in B101513: Diff 341336.Apr 28 2021, 5:17 PM

; ModuleID = '__compute_module' 
source_filename = "__compute_module"                                                                                               
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"                                       
target triple = "x86_64-grtev4-linux-gnu"                                                                                          
                                                                                                                                   
@0 = private unnamed_addr constant [96 x i8] c"\00\00\80?\00\00\00@\00\00@@\00\00\80@\00\00\A0@\00\00\C0@\00\00\E0@\00\00\00A\00\00
\10A\00\00 A\00\000A\00\00@A\00\00PA\00\00`A\00\00pA\00\00\80A\00\00\88A\00\00\90A\00\00\98A\00\00\A0A\00\00\A8A\00\00\B0A\00\00\B8
A\00\00\C0A", align 16                                                                                                             
                                                                                                                                   
; Function Attrs: nofree norecurse nosync nounwind uwtable                                                                         
define void @Reverse4DFloatArrayOnDim01.3(i8* nocapture readnone %retval, i8* noalias nocapture readnone %run_options, i8** noalias
 nocapture readnone %params, i8** noalias nocapture readonly %buffer_table, i64* noalias nocapture readnone %prof_counters) local_u
nnamed_addr #0 {                                                                                                                   
entry:                                                                                                                             
  %0 = bitcast i8** %buffer_table to [4 x [3 x [2 x [1 x float]]]]**
  %1 = load [4 x [3 x [2 x [1 x float]]]]*, [4 x [3 x [2 x [1 x float]]]]** %0, align 8, !invariant.load !0, !dereferenceable !1, !
align !2
  br label %reverse.2.loop_header.dim.1.preheader

reverse.2.loop_header.dim.1.preheader:            ; preds = %entry, %reverse.2.loop_exit.dim.1
  %reverse.2.invar_address.dim.0.06 = phi i64 [ 0, %entry ], [ %invar.inc, %reverse.2.loop_exit.dim.1 ]
  %2 = mul nsw i64 %reverse.2.invar_address.dim.0.06, -24
  %3 = add i64 %2, 88
  br label %reverse.2.loop_header.dim.2.preheader

reverse.2.loop_header.dim.2.preheader:            ; preds = %reverse.2.loop_header.dim.1.preheader, %reverse.2.loop_exit.dim
  %reverse.2.invar_address.dim.1.05 = phi i64 [ 0, %reverse.2.loop_header.dim.1.preheader ], [ %invar.inc1, %reverse.2.loop_exit.di
m.2 ]
  %scevgep = getelementptr [4 x [3 x [2 x [1 x float]]]], [4 x [3 x [2 x [1 x float]]]]* %1, i64 0, i64 %reverse.2.invar_address.di
m.0.06, i64 %reverse.2.invar_address.dim.1.05, i64 0, i64 0
  %scevgep7 = bitcast float* %scevgep to i8*
  %4 = mul nsw i64 %reverse.2.invar_address.dim.1.05, -8
  %5 = add i64 %3, %4
  %scevgep8 = getelementptr [96 x i8], [96 x i8]* @0, i64 0, i64 %5
  call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %scevgep7, i8* align 4 %scevgep8, i64 8, i1 false)
  br label %reverse.2.loop_exit.dim.2

reverse.2.loop_exit.dim.2:                        ; preds = %reverse.2.loop_header.dim.2.preheader
  %invar.inc1 = add nuw nsw i64 %reverse.2.invar_address.dim.1.05, 1
  %6 = icmp ugt i64 %reverse.2.invar_address.dim.1.05, 1
  br i1 %6, label %reverse.2.loop_exit.dim.1, label %reverse.2.loop_header.dim.2.preheader

reverse.2.loop_exit.dim.1:                        ; preds = %reverse.2.loop_exit.dim.2
  %invar.inc = add nuw nsw i64 %reverse.2.invar_address.dim.0.06, 1
  %7 = icmp ugt i64 %reverse.2.invar_address.dim.0.06, 2
  br i1 %7, label %reverse.2.loop_exit.dim.0, label %reverse.2.loop_header.dim.1.preheader

reverse.2.loop_exit.dim.0:                        ; preds = %reverse.2.loop_exit.dim.1
  ret void
}

; Function Attrs: argmemonly nofree nosync nounwind willreturn
declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly %0, i8* noalias nocapture readonly %1, i64 %2, i1 immarg %3
) #1

attributes #0 = { nofree norecurse nosync nounwind uwtable "denormal-fp-math"="preserve-sign" "no-frame-pointer-elim"="false" }
attributes #1 = { argmemonly nofree nosync nounwind willreturn }

!0 = !{}
!1 = !{i64 96}
!2 = !{i64 16}

opt -loop-idiom <%s -S

This shows the first time that this code is run and different IR is generated before and after. It then diverges further on a subsequent execution (where before and after have different inputs now). I am trying to find how to share a full opt command rather than sharing different snippets for the before/after inputs. I hope this first IR helps though.

Describing what the code is intended to do (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tests/reverse_test.cc#L146).

A 4D array is taken in and its elements are reversed across the 0th and 1st dimensions, so for every value previously indexed at [A,B,C,D] in an array of size [W,X,Y,Z], the new index of the value is [W-A-1, X-B-1, C, D].

The original code indexes into proper locations for the first 2 dimensions, and then copies the subdata, while this change results in a single copy after indexing only in dimension 0, which cannot be done as the data in dimension 1 cannot be copied due to the reversal.

With this patch we got the following assertion:

bool llvm::APInt::operator==(const llvm::APInt &) const: Assertion `BitWidth == RHS.BitWidth && "Comparison requires equal bit widths"' failed.

in LoopIdiomRecognize::processLoopMemCpy() at the following comparison:

// Check if the load stride matches the store stride.
if (StrIntStride != LoadIntStride && StrIntStride != -LoadIntStride)
  return false;

for a memcpy done between two address spaces with different pointer sizes.

I don't have an upstream reproducer ready for this, but I'll see if I can create one.

In D97667#2724992, @tpopp wrote:

Describing what the code is intended to do (https://github.com/tensorflow/tensorflow/blob/master/tensorflow/compiler/xla/tests/reverse_test.cc#L146).

A 4d array is taking in reversing elements across the 0th and 1st dimensions, so for every value previously indexed at [A,B,C,D] in an array of size [W,X,Y,Z], the new index of the value is [W-A-1, X-B-1, C, D].

The original code indexes into proper locations for the first 2 dimensions, and then copies the subdata, while this change results in a single copy after indexing only in dimension 0, which cannot be done as the data in dimension 1 cannot be copied due to the reversal.

Makes sense. Thanks for the IR and explanation.

@lebedev.ri pointed out the issue earlier and I fixed it in the newest version. I ran opt -S -loop-idiom on the IR you provided and it does not hoist the memcpy anymore.

lebedev.ri added inline comments.Apr 30 2021, 3:32 PM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
865	I don't think this should handle the case of different directions.

Compare LoadStride and StoreStride after sign extension.

In D97667#2725626, @dstenb wrote:
With this patch we got the following assertion:
bool llvm::APInt::operator==(const llvm::APInt &) const: Assertion `BitWidth == RHS.BitWidth && "Comparison requires equal bit widths"' failed.
in LoopIdiomRecognize::processLoopMemCpy() at the following comparison:
// Check if the load stride matches the store stride.
if (StrIntStride != LoadIntStride && StrIntStride != -LoadIntStride)
  return false;
for a memcpy done between two address spaces with different pointer sizes.

I don't have a upstream reproducer ready for this, but I'll see if I can create one.

The new version should fix the assertion I think. Let me know if you still see it.

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
865	The different directions case is handled below in: `// Check if the load stride matches the store stride.` `if (StoreStrideInt != LoadStrideInt) return false;` Or do you refer to something else?

Harbormaster completed remote builds in B102052: Diff 342083.Apr 30 2021, 6:21 PM

In D97667#2730798, @zhuhan0 wrote:
In D97667#2725626, @dstenb wrote:
With this patch we got the following assertion:
bool llvm::APInt::operator==(const llvm::APInt &) const: Assertion `BitWidth == RHS.BitWidth && "Comparison requires equal bit widths"' failed.
in LoopIdiomRecognize::processLoopMemCpy() at the following comparison:
// Check if the load stride matches the store stride.
if (StrIntStride != LoadIntStride && StrIntStride != -LoadIntStride)
  return false;
for a memcpy done between two address spaces with different pointer sizes.

I don't have a upstream reproducer ready for this, but I'll see if I can create one.
The new version should fix the assertion I think. Let me know if you still see it.

Yes, thanks! That fixed it. I did not manage to create an upstream reproducer from the downstream one unfortunately.

lebedev.ri added inline comments.May 3 2021, 7:57 AM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
869	I suspect we will still see crashes here. Since you use `getSExtValue()` later, just move that to before this, and compare ints?

zhuhan0 added inline comments.May 3 2021, 11:16 PM

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

869

I tried that earlier and there was a warning when compiling this code:

[1284/2576] Building CXX object lib/Transforms/Scalar/CMakeFiles/LLVMScalarOpts.dir/LoopIdiomRecognize.cpp.o
/data/users/zhuhan/server-llvm/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp:871:19: warning: comparison of integers of different signs: 'uint64_t' (aka 'unsigned long') and 'int64_t' (aka 'long') [-Wsign-compare]
  if (SizeInBytes != StoreStrideInt && SizeInBytes != -StoreStrideInt) {
      ~~~~~~~~~~~ ^  ~~~~~~~~~~~~~~
/data/users/zhuhan/server-llvm/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp:871:52: warning: comparison of integers of different signs: 'uint64_t' (aka 'unsigned long') and 'int64_t' (aka 'long') [-Wsign-compare]
  if (SizeInBytes != StoreStrideInt && SizeInBytes != -StoreStrideInt) {
                                       ~~~~~~~~~~~ ^  ~~~~~~~~~~~~~~~
2 warnings generated.

And I think this code should be correct because the != operator of APInt is properly overloaded:

inline bool operator!=(uint64_t V1, const APInt &V2) { return V2 != V1; }

and

bool operator!=(uint64_t Val) const { return !((*this) == Val); }

and finally

bool operator==(uint64_t Val) const {
  return (isSingleWord() || getActiveBits() <= 64) && getZExtValue() == Val;
}

Alright.
Please ensure that you have added all the necessary tests.
It looks about reasonable to me now, i think you can try re-landing it.

This revision is now accepted and ready to land.May 4 2021, 3:17 AM

This revision was landed with ongoing or failed builds.May 4 2021, 5:07 PM

Closed by commit rGda1cdffbb1b7: [loop-idiom] Hoist loop memcpys to loop preheader (authored by zhuhan0). · Explain Why

This revision was automatically updated to reflect the committed changes.

zhuhan0 added a commit: rGda1cdffbb1b7: [loop-idiom] Hoist loop memcpys to loop preheader.

In D97667#2735870, @lebedev.ri wrote:

Alright.
Please ensure that you have added all the necessary tests.
It looks about reasonable to me now, i think you can try re-landing it.

Thanks! Let me know if there's any issue.

Revision Contents

Path

Size

llvm/

lib/

Transforms/

Scalar/

LoopIdiomRecognize.cpp

162 lines

test/

Transforms/

LoopIdiom/

memcpy-debugify-remarks.ll

2 lines

memcpy-intrinsic-different-types.ll

89 lines

memcpy-intrinsic.ll

434 lines

memset-debugify-remarks.ll

2 lines

Diff 342913

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

Show First 20 Lines • Show All 199 Lines • ▼ Show 20 Lines	private:
bool runOnLoopBlock(BasicBlock BB, const SCEV BECount,		bool runOnLoopBlock(BasicBlock BB, const SCEV BECount,
SmallVectorImpl<BasicBlock *> &ExitBlocks);		SmallVectorImpl<BasicBlock *> &ExitBlocks);

void collectStores(BasicBlock *BB);		void collectStores(BasicBlock *BB);
LegalStoreKind isLegalStore(StoreInst *SI);		LegalStoreKind isLegalStore(StoreInst *SI);
enum class ForMemset { No, Yes };		enum class ForMemset { No, Yes };
bool processLoopStores(SmallVectorImpl<StoreInst > &SL, const SCEV BECount,		bool processLoopStores(SmallVectorImpl<StoreInst > &SL, const SCEV BECount,
ForMemset For);		ForMemset For);

		template <typename MemInst>
		bool processLoopMemIntrinsic(
		BasicBlock *BB,
		bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
		const SCEV *BECount);
		bool processLoopMemCpy(MemCpyInst MCI, const SCEV BECount);
bool processLoopMemSet(MemSetInst MSI, const SCEV BECount);		bool processLoopMemSet(MemSetInst MSI, const SCEV BECount);

bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,		bool processLoopStridedStore(Value *DestPtr, unsigned StoreSize,
MaybeAlign StoreAlignment, Value *StoredVal,		MaybeAlign StoreAlignment, Value *StoredVal,
Instruction *TheStore,		Instruction *TheStore,
SmallPtrSetImpl<Instruction *> &Stores,		SmallPtrSetImpl<Instruction *> &Stores,
const SCEVAddRecExpr Ev, const SCEV BECount,		const SCEVAddRecExpr Ev, const SCEV BECount,
bool NegStride, bool IsLoopMemset = false);		bool NegStride, bool IsLoopMemset = false);
▲ Show 20 Lines • Show All 414 Lines • ▼ Show 20 Lines	bool LoopIdiomRecognize::runOnLoopBlock(

for (auto &SL : StoreRefsForMemsetPattern)		for (auto &SL : StoreRefsForMemsetPattern)
MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::No);		MadeChange \|= processLoopStores(SL.second, BECount, ForMemset::No);

// Optimize the store into a memcpy, if it feeds an similarly strided load.		// Optimize the store into a memcpy, if it feeds an similarly strided load.
for (auto &SI : StoreRefsForMemcpy)		for (auto &SI : StoreRefsForMemcpy)
MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);		MadeChange \|= processLoopStoreOfLoopLoad(SI, BECount);

for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {		MadeChange \|= processLoopMemIntrinsic<MemCpyInst>(
Instruction Inst = &I++;		BB, &LoopIdiomRecognize::processLoopMemCpy, BECount);
// Look for memset instructions, which may be optimized to a larger memset.		MadeChange \|= processLoopMemIntrinsic<MemSetInst>(
if (MemSetInst *MSI = dyn_cast<MemSetInst>(Inst)) {		BB, &LoopIdiomRecognize::processLoopMemSet, BECount);
WeakTrackingVH InstPtr(&*I);
if (!processLoopMemSet(MSI, BECount))
continue;
MadeChange = true;

// If processing the memset invalidated our iterator, start over from the
// top of the block.
if (!InstPtr)
I = BB->begin();
continue;
}
}

return MadeChange;		return MadeChange;
}		}

/// See if this store(s) can be promoted to a memset.		/// See if this store(s) can be promoted to a memset.
bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,		bool LoopIdiomRecognize::processLoopStores(SmallVectorImpl<StoreInst *> &SL,
const SCEV *BECount, ForMemset For) {		const SCEV *BECount, ForMemset For) {
// Try to find consecutive stores that can be transformed into memsets.		// Try to find consecutive stores that can be transformed into memsets.
▲ Show 20 Lines • Show All 132 Lines • ▼ Show 20 Lines	if (processLoopStridedStore(StorePtr, StoreSize,
TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());		TransformedStores.insert(AdjacentStores.begin(), AdjacentStores.end());
Changed = true;		Changed = true;
}		}
}		}

return Changed;		return Changed;
}		}

		/// processLoopMemIntrinsic - Template function for calling different processor
		/// functions based on mem instrinsic type.
		template <typename MemInst>
		bool LoopIdiomRecognize::processLoopMemIntrinsic(
		BasicBlock *BB,
		bool (LoopIdiomRecognize::Processor)(MemInst , const SCEV *),
		const SCEV *BECount) {
		bool MadeChange = false;
		for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
		Instruction Inst = &I++;
		// Look for memory instructions, which may be optimized to a larger one.
		if (MemInst *MI = dyn_cast<MemInst>(Inst)) {
		WeakTrackingVH InstPtr(&*I);
		if (!(this->*Processor)(MI, BECount))
		continue;
		MadeChange = true;

		// If processing the instruction invalidated our iterator, start over from
		// the top of the block.
		if (!InstPtr)
		I = BB->begin();
		}
		hoyUnsubmitted Done Reply Inline Actions No need of this `continue`? hoy: No need of this `continue`?
		}
		return MadeChange;
		}

		/// processLoopMemCpy - See if this memcpy can be promoted to a large memcpy
		bool LoopIdiomRecognize::processLoopMemCpy(MemCpyInst *MCI,
		const SCEV *BECount) {
		// We can only handle non-volatile memcpys with a constant size.
		if (MCI->isVolatile() \|\| !isa<ConstantInt>(MCI->getLength()))
		return false;

		// If we're not allowed to hack on memcpy, we fail.
		if (!HasMemcpy \|\| DisableLIRP::Memcpy)
		return false;

		Value *Dest = MCI->getDest();
		Value *Source = MCI->getSource();
		if (!Dest \|\| !Source)
		return false;

		// See if the load and store pointer expressions are AddRec like {base,+,1} on
		// the current loop, which indicates a strided load and store. If we have
		// something else, it's a random load or store we can't handle.
		const SCEVAddRecExpr *StoreEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Dest));
		if (!StoreEv \|\| StoreEv->getLoop() != CurLoop \|\| !StoreEv->isAffine())
		return false;
		const SCEVAddRecExpr *LoadEv = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Source));
		if (!LoadEv \|\| LoadEv->getLoop() != CurLoop \|\| !LoadEv->isAffine())
		return false;

		// Reject memcpys that are so large that they overflow an unsigned.
		uint64_t SizeInBytes = cast<ConstantInt>(MCI->getLength())->getZExtValue();
		if ((SizeInBytes >> 32) != 0)
		return false;

		// Check if the stride matches the size of the memcpy. If so, then we know
		// that every byte is touched in the loop.
		const SCEVConstant *StoreStride =
		lebedev.riUnsubmitted Done Reply Inline Actions This is the first time i'm seeing `Str` short for `Store`. Please use it direcly. lebedev.ri: This is the first time i'm seeing `Str` short for `Store`. Please use it direcly.
		dyn_cast<SCEVConstant>(StoreEv->getOperand(1));
		const SCEVConstant *LoadStride =
		dyn_cast<SCEVConstant>(LoadEv->getOperand(1));
		if (!StoreStride \|\| !LoadStride)
		return false;

		APInt StoreStrideValue = StoreStride->getAPInt();
		APInt LoadStrideValue = LoadStride->getAPInt();
		// Huge stride value - give up
		lebedev.riUnsubmitted Not Done Reply Inline Actions I don't think this should handle the case of different directions. lebedev.ri: I don't think this should handle the case of different directions.
		zhuhan0AuthorUnsubmitted Done Reply Inline Actions The different directions case is handled below in // Check if the load stride matches the store stride. if (StoreStrideInt != LoadStrideInt) return false; Or do you refer to something else? zhuhan0: The different directions case is handled below in ``` // Check if the load stride matches…
		if (StoreStrideValue.getBitWidth() > 64 \|\| LoadStrideValue.getBitWidth() > 64)
		return false;

		if (SizeInBytes != StoreStrideValue && SizeInBytes != -StoreStrideValue) {
		lebedev.riUnsubmitted Not Done Reply Inline Actions I suspect we will still see crashes here. Since you use `getSExtValue()` later, just move that to before this, and compare ints? lebedev.ri: I suspect we will still see crashes here. Since you use `getSExtValue()` later, just move that…
		zhuhan0AuthorUnsubmitted Done Reply Inline Actions I tried that earlier and there was a warning when compiling this code: [1284/2576] Building CXX object lib/Transforms/Scalar/CMakeFiles/LLVMScalarOpts.dir/LoopIdiomRecognize.cpp.o /data/users/zhuhan/server-llvm/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp:871:19: warning: comparison of integers of different signs: 'uint64_t' (aka 'unsigned long') and 'int64_t' (aka 'long') [-Wsign-compare] if (SizeInBytes != StoreStrideInt && SizeInBytes != -StoreStrideInt) { ~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~ /data/users/zhuhan/server-llvm/llvm-project/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp:871:52: warning: comparison of integers of different signs: 'uint64_t' (aka 'unsigned long') and 'int64_t' (aka 'long') [-Wsign-compare] if (SizeInBytes != StoreStrideInt && SizeInBytes != -StoreStrideInt) { ~~~~~~~~~~~ ^ ~~~~~~~~~~~~~~~ 2 warnings generated. And I think this code should be correct because the != operator of APInt is properly overloaded: inline bool operator!=(uint64_t V1, const APInt &V2) { return V2 != V1; } and bool operator!=(uint64_t Val) const { return !((this) == Val); } and finally bool operator==(uint64_t Val) const { return (isSingleWord() \|\| getActiveBits() <= 64) && getZExtValue() == Val; } zhuhan0:* I tried that earlier and there was a warning when compiling this code: ``` [1284/2576] Building…
		ORE.emit([&]() {
		lebedev.riUnsubmitted Done Reply Inline Actions This doesn't make sense. Strides of load and store must match exactly. Doesn't this miscompile the case where load goes forward and store backward or vice verse? lebedev.ri: This doesn't make sense. Strides of load and store must match exactly. Doesn't this miscompile…
		lebedev.riUnsubmitted Done Reply Inline Actions Also, i would like to see better test coverage: negative stride load/store stride sign mismatch lebedev.ri: Also, i would like to see better test coverage: 1. negative stride 2. load/store stride sign…
		zhuhan0AuthorUnsubmitted Done Reply Inline Actions Also, i would like to see better test coverage: negative stride load/store stride sign mismatch This is reasonable. I'll add those. zhuhan0: > Also, i would like to see better test coverage: > 1. negative stride > 2. load/store stride…
		zhuhan0AuthorUnsubmitted Done Reply Inline Actions This doesn't make sense. Strides of load and store must match exactly. Doesn't this miscompile the case where load goes forward and store backward or vice verse? The strides of load and store can be different if the memcpy source and destination are of different types. See my test `memcpy-intrinsic-different-types.ll`, which was derived from the build failure we saw earlier. I printed out `LoadIntStride` and `StoreIntStride`: LoadIntStride: 32 StoreIntStride: 12 Line 869-870 check for this case and fix the build failure. zhuhan0: > This doesn't make sense. Strides of load and store must match exactly. > Doesn't this…
		lebedev.riUnsubmitted Done Reply Inline Actions I believe, you have answered a question different from the one i asked. lebedev.ri: I believe, you have answered a question different from the one i asked.
		zhuhan0AuthorUnsubmitted Done Reply Inline Actions You're right. That was a silly error. Fixed. zhuhan0: You're right. That was a silly error. Fixed.
		return OptimizationRemarkMissed(DEBUG_TYPE, "SizeStrideUnequal", MCI)
		<< ore::NV("Inst", "memcpy") << " in "
		<< ore::NV("Function", MCI->getFunction())
		<< " function will not be hoised: "
		<< ore::NV("Reason", "memcpy size is not equal to stride");
		});
		return false;
		}

		int64_t StoreStrideInt = StoreStrideValue.getSExtValue();
		int64_t LoadStrideInt = LoadStrideValue.getSExtValue();
		// Check if the load stride matches the store stride.
		if (StoreStrideInt != LoadStrideInt)
		return false;

		return processLoopStoreOfLoopLoad(Dest, Source, (unsigned)SizeInBytes,
		MCI->getDestAlign(), MCI->getSourceAlign(),
		MCI, MCI, StoreEv, LoadEv, BECount);
		}

/// processLoopMemSet - See if this memset can be promoted to a large memset.		/// processLoopMemSet - See if this memset can be promoted to a large memset.
bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,		bool LoopIdiomRecognize::processLoopMemSet(MemSetInst *MSI,
const SCEV *BECount) {		const SCEV *BECount) {
// We can only handle non-volatile memsets with a constant size.		// We can only handle non-volatile memsets with a constant size.
if (MSI->isVolatile() \|\| !isa<ConstantInt>(MSI->getLength()))		if (MSI->isVolatile() \|\| !isa<ConstantInt>(MSI->getLength()))
return false;		return false;

// If we're not allowed to hack on memset, we fail.		// If we're not allowed to hack on memset, we fail.
if (!HasMemset)		if (!HasMemset \|\| DisableLIRP::Memset)
return false;		return false;

Value *Pointer = MSI->getDest();		Value *Pointer = MSI->getDest();

// See if the pointer expression is an AddRec like {base,+,1} on the current		// See if the pointer expression is an AddRec like {base,+,1} on the current
// loop, which indicates a strided store. If we have something else, it's a		// loop, which indicates a strided store. If we have something else, it's a
// random store we can't handle.		// random store we can't handle.
const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));		const SCEVAddRecExpr *Ev = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Pointer));
▲ Show 20 Lines • Show All 223 Lines • ▼ Show 20 Lines	bool LoopIdiomRecognize::processLoopStridedStore(

LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"		LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
<< " from store to: " << Ev << " at: " << TheStore		<< " from store to: " << Ev << " at: " << TheStore
<< "\n");		<< "\n");

ORE.emit([&]() {		ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",		return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
NewCall->getDebugLoc(), Preheader)		NewCall->getDebugLoc(), Preheader)
<< "Transformed loop-strided store into a call to "		<< "Transformed loop-strided store in "
		<< ore::NV("Function", TheStore->getFunction())
		<< " function into a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())		<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() function";		<< "() intrinsic";
});		});

// Okay, the memset has been formed. Zap the original store and anything that		// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.		// feeds into it.
for (auto *I : Stores) {		for (auto *I : Stores) {
if (MSSAU)		if (MSSAU)
MSSAU->removeMemoryAccess(I, true);		MSSAU->removeMemoryAccess(I, true);
deleteDeadInstruction(I);		deleteDeadInstruction(I);
▲ Show 20 Lines • Show All 71 Lines • ▼ Show 20 Lines	bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(
// for in just a textual dump of the IR. This is written as a variable, even		// for in just a textual dump of the IR. This is written as a variable, even
// though statically all the places this dominates could be replaced with		// though statically all the places this dominates could be replaced with
// 'true', with the hope that anyone trying to be clever / "more precise" with		// 'true', with the hope that anyone trying to be clever / "more precise" with
// the return value will read this comment, and leave them alone.		// the return value will read this comment, and leave them alone.
Changed = true;		Changed = true;

SmallPtrSet<Instruction *, 1> Stores;		SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(TheStore);		Stores.insert(TheStore);

		bool IsMemCpy = isa<MemCpyInst>(TheStore);
		const StringRef InstRemark = IsMemCpy ? "memcpy" : "load and store";
		hoyUnsubmitted Done Reply Inline Actions Nit: Use 'StringRef` to avoid extra mem allocation? hoy: Nit: Use 'StringRef` to avoid extra mem allocation?

if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,		if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
StoreSize, *AA, Stores))		StoreSize, *AA, Stores)) {
		ORE.emit([&]() {
		return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessStore",
		TheStore)
		<< ore::NV("Inst", InstRemark) << " in "
		<< ore::NV("Function", TheStore->getFunction())
		<< " function will not be hoisted: "
		<< ore::NV("Reason", "The loop may access store location");
		});
return Changed;		return Changed;
		}

const SCEV *LdStart = LoadEv->getStart();		const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();		unsigned LdAS = SourcePtr->getType()->getPointerAddressSpace();

// Handle negative strided loops.		// Handle negative strided loops.
if (NegStride)		if (NegStride)
LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);		LdStart = getStartForNegStride(LdStart, BECount, IntIdxTy, StoreSize, SE);

// For a memcpy, we have to make sure that the input array is not being		// For a memcpy, we have to make sure that the input array is not being
// mutated by the loop.		// mutated by the loop.
Value *LoadBasePtr = Expander.expandCodeFor(		Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());		LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());

		// If the store is a memcpy instruction, we must check if it will write to
		// the load memory locations. So remove it from the ignored stores.
		if (IsMemCpy)
		Stores.erase(TheStore);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,		if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
StoreSize, *AA, Stores))		StoreSize, *AA, Stores)) {
		ORE.emit([&]() {
		return OptimizationRemarkMissed(DEBUG_TYPE, "LoopMayAccessLoad", TheLoad)
		<< ore::NV("Inst", InstRemark) << " in "
		<< ore::NV("Function", TheStore->getFunction())
		<< " function will not be hoisted: "
		<< ore::NV("Reason", "The loop may access load location");
		});
return Changed;		return Changed;
		}

if (avoidLIRForMultiBlockLoop())		if (avoidLIRForMultiBlockLoop())
return Changed;		return Changed;

// Okay, everything is safe, we can transform this!		// Okay, everything is safe, we can transform this!

const SCEV *NumBytesS =		const SCEV *NumBytesS =
getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);		getNumBytes(BECount, IntIdxTy, StoreSize, CurLoop, DL, SE);
▲ Show 20 Lines • Show All 44 Lines • ▼ Show 20 Lines	LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
<< " from store ptr=" << StoreEv << " at: " << TheStore		<< " from store ptr=" << StoreEv << " at: " << TheStore
<< "\n");		<< "\n");

ORE.emit([&]() {		ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",		return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStoreOfLoopLoad",
NewCall->getDebugLoc(), Preheader)		NewCall->getDebugLoc(), Preheader)
<< "Formed a call to "		<< "Formed a call to "
<< ore::NV("NewFunction", NewCall->getCalledFunction())		<< ore::NV("NewFunction", NewCall->getCalledFunction())
<< "() function";		<< "() intrinsic from " << ore::NV("Inst", InstRemark)
		<< " instruction in " << ore::NV("Function", TheStore->getFunction())
		<< " function";
});		});

// Okay, the memcpy has been formed. Zap the original store and anything that		// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.		// feeds into it.
if (MSSAU)		if (MSSAU)
MSSAU->removeMemoryAccess(TheStore, true);		MSSAU->removeMemoryAccess(TheStore, true);
deleteDeadInstruction(TheStore);		deleteDeadInstruction(TheStore);
if (MSSAU && VerifyMemorySSA)		if (MSSAU && VerifyMemorySSA)
▲ Show 20 Lines • Show All 1,052 Lines • Show Last 20 Lines

llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -basic-aa -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 \| FileCheck %s			; RUN: opt -basic-aa -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 \| FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Check that everything still works when debuginfo is present, and that it is reasonably propagated.			; Check that everything still works when debuginfo is present, and that it is reasonably propagated.

	; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() function			; CHECK: remark: <stdin>:6:1: Formed a call to llvm.memcpy.p0i8.p0i8.i64() intrinsic from load and store instruction in test6_dest_align function

	define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {			define void @test6_dest_align(i32* noalias align 1 %Base, i32* noalias align 4 %Dest, i64 %Size) nounwind ssp {
	; CHECK-LABEL: @test6_dest_align(			; CHECK-LABEL: @test6_dest_align(
	; CHECK-NEXT: bb.nph:			; CHECK-NEXT: bb.nph:
	; CHECK-NEXT: [[DEST1:%.]] = bitcast i32 [[DEST:%.]] to i8			; CHECK-NEXT: [[DEST1:%.]] = bitcast i32 [[DEST:%.]] to i8
	; CHECK-NEXT: [[BASE2:%.]] = bitcast i32 [[BASE:%.]] to i8			; CHECK-NEXT: [[BASE2:%.]] = bitcast i32 [[BASE:%.]] to i8
	; CHECK-NEXT: [[TMP0:%.]] = shl nuw i64 [[SIZE:%.]], 2, !dbg !18			; CHECK-NEXT: [[TMP0:%.]] = shl nuw i64 [[SIZE:%.]], 2, !dbg !18
	; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST1]], i8* align 1 [[BASE2]], i64 [[TMP0]], i1 false), !dbg !19			; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[DEST1]], i8* align 1 [[BASE2]], i64 [[TMP0]], i1 false), !dbg !19
	Show All 34 Lines

llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -loop-idiom < %s -S \| FileCheck %s

				; #include <vector>
				;
				; class SDValue {
				; int A;
				; int B;
				; unsigned C;
				; };
				;
				; class SDUse {
				; SDValue Val;
				; SDUse **Prev = nullptr;
				; SDUse *Next = nullptr;
				;
				; public:
				; operator const SDValue&() const { return Val; }
				; };
				;
				; void foo(SDUse *S, int N) {
				; // Should not hoist memcpy because source and destination are of different types
				; std::vector<SDValue> Ops(S, S + N);
				; }

				; ModuleID = 'different_types.cpp'
				source_filename = "different_types.cpp"
				target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
				target triple = "x86_64-unknown-linux-gnu"

				; Layout mirrors the C++ above: SDUse = { SDValue Val; SDUse **Prev; SDUse *Next; }.
				%class.SDUse = type { %class.SDValue, %class.SDUse**, %class.SDUse* }
				%class.SDValue = type { i32, i32, i32 }

				declare dso_local i32 @__gxx_personality_v0(...)

				; Function Attrs: uwtable mustprogress
				; Negative test: source and destination advance by different element types, so the
				; CHECK lines expect the 12-byte memcpy to stay inside for.inc.
				define linkonce_odr dso_local %class.SDValue* @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(%class.SDUse* %__first, %class.SDUse* %__last, %class.SDValue* %__result) local_unnamed_addr #0 align 2 personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) {
				; CHECK-LABEL: @_ZNSt20__uninitialized_copyILb0EE13__uninit_copyIP5SDUseP7SDValueEET0_T_S7_S6_(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[CMP_NOT15:%.*]] = icmp eq %class.SDUse* [[__FIRST:%.*]], [[__LAST:%.*]]
				; CHECK-NEXT: br i1 [[CMP_NOT15]], label [[FOR_END:%.*]], label [[FOR_INC_PREHEADER:%.*]]
				; CHECK: for.inc.preheader:
				; CHECK-NEXT: br label [[FOR_INC:%.*]]
				; CHECK: for.inc:
				; CHECK-NEXT: [[__CUR_017:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1:%.*]], [[FOR_INC]] ], [ [[__RESULT:%.*]], [[FOR_INC_PREHEADER]] ]
				; CHECK-NEXT: [[__FIRST_ADDR_016:%.*]] = phi %class.SDUse* [ [[INCDEC_PTR:%.*]], [[FOR_INC]] ], [ [[__FIRST]], [[FOR_INC_PREHEADER]] ]
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast %class.SDValue* [[__CUR_017]] to i8*
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast %class.SDUse* [[__FIRST_ADDR_016]] to i8*
				; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) [[TMP0]], i8* noundef nonnull align 8 dereferenceable(12) [[TMP1]], i64 12, i1 false)
				; CHECK-NEXT: [[INCDEC_PTR]] = getelementptr inbounds [[CLASS_SDUSE:%.*]], %class.SDUse* [[__FIRST_ADDR_016]], i64 1
				; CHECK-NEXT: [[INCDEC_PTR1]] = getelementptr inbounds [[CLASS_SDVALUE:%.*]], %class.SDValue* [[__CUR_017]], i64 1
				; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq %class.SDUse* [[INCDEC_PTR]], [[__LAST]]
				; CHECK-NEXT: br i1 [[CMP_NOT]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC]]
				; CHECK: for.end.loopexit:
				; CHECK-NEXT: [[INCDEC_PTR1_LCSSA:%.*]] = phi %class.SDValue* [ [[INCDEC_PTR1]], [[FOR_INC]] ]
				; CHECK-NEXT: br label [[FOR_END]]
				; CHECK: for.end:
				; CHECK-NEXT: [[__CUR_0_LCSSA:%.*]] = phi %class.SDValue* [ [[__RESULT]], [[ENTRY:%.*]] ], [ [[INCDEC_PTR1_LCSSA]], [[FOR_END_LOOPEXIT]] ]
				; CHECK-NEXT: ret %class.SDValue* [[__CUR_0_LCSSA]]
				;
				entry:
				%cmp.not15 = icmp eq %class.SDUse* %__first, %__last
				br i1 %cmp.not15, label %for.end, label %for.inc.preheader

				for.inc.preheader: ; preds = %entry
				br label %for.inc

				for.inc: ; preds = %for.inc.preheader, %for.inc
				%__cur.017 = phi %class.SDValue* [ %incdec.ptr1, %for.inc ], [ %__result, %for.inc.preheader ]
				%__first.addr.016 = phi %class.SDUse* [ %incdec.ptr, %for.inc ], [ %__first, %for.inc.preheader ]
				%0 = bitcast %class.SDValue* %__cur.017 to i8*
				%1 = bitcast %class.SDUse* %__first.addr.016 to i8*
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %0, i8* noundef nonnull align 8 dereferenceable(12) %1, i64 12, i1 false)
				%incdec.ptr = getelementptr inbounds %class.SDUse, %class.SDUse* %__first.addr.016, i64 1
				%incdec.ptr1 = getelementptr inbounds %class.SDValue, %class.SDValue* %__cur.017, i64 1
				%cmp.not = icmp eq %class.SDUse* %incdec.ptr, %__last
				br i1 %cmp.not, label %for.end.loopexit, label %for.inc

				for.end.loopexit: ; preds = %for.inc
				%incdec.ptr1.lcssa = phi %class.SDValue* [ %incdec.ptr1, %for.inc ]
				br label %for.end

				for.end: ; preds = %for.end.loopexit, %entry
				%__cur.0.lcssa = phi %class.SDValue* [ %__result, %entry ], [ %incdec.ptr1.lcssa, %for.end.loopexit ]
				ret %class.SDValue* %__cur.0.lcssa
				}

				; Function Attrs: argmemonly nofree nosync nounwind willreturn
				declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -loop-idiom < %s -S \| FileCheck %s

				%struct.S = type { i32, i32, i8 }

				; unsigned copy_noalias(S* __restrict a, S *b, int n) {
				; for (int i = 0; i < n; i++) {
				; a[i] = b[i];
				; }
				; return sizeof(a[0]);
				; }

				; Function Attrs: nofree nounwind uwtable mustprogress
				; Positive test: destination is noalias, so the CHECK lines expect one memcpy of
				; 12 * n bytes in for.body.preheader and no memcpy left in for.body.
				define dso_local i32 @copy_noalias(%struct.S* noalias nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.S* [[A:%.*]] to i8*
				; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.S* [[B:%.*]] to i8*
				; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
				; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
				; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 12
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[A1]], i8* align 4 [[B2]], i64 [[TMP1]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.cond.cleanup.loopexit:
				; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret i32 12
				; CHECK: for.body:
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
				; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
				;
				entry:
				%cmp7 = icmp sgt i32 %n, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.body
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret i32 12

				for.body: ; preds = %for.body.preheader, %for.body
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
				%idxprom = zext i32 %i.08 to i64
				%arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
				%arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
				%0 = bitcast %struct.S* %arrayidx2 to i8*
				%1 = bitcast %struct.S* %arrayidx to i8*
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
				%inc = add nuw nsw i32 %i.08, 1
				%cmp = icmp slt i32 %inc, %n
				br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
				}

				; unsigned copy_may_alias(S *a, S *b, int n) {
				; for (int i = 0; i < n; i++) {
				; a[i] = b[i];
				; }
				; return sizeof(a[0]);
				; }

				; Function Attrs: nofree nounwind uwtable mustprogress
				; Negative test: neither pointer is noalias, so the CHECK lines expect the
				; per-iteration 12-byte memcpy to remain in for.body.
				define dso_local i32 @copy_may_alias(%struct.S* nocapture %a, %struct.S* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_may_alias(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
				; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.cond.cleanup.loopexit:
				; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret i32 12
				; CHECK: for.body:
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[B:%.*]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[A:%.*]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[TMP0:%.*]] = bitcast %struct.S* [[ARRAYIDX2]] to i8*
				; CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.S* [[ARRAYIDX]] to i8*
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) [[TMP0]], i8* nonnull align 4 dereferenceable(12) [[TMP1]], i64 12, i1 false)
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
				; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
				;
				entry:
				%cmp7 = icmp sgt i32 %n, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.body
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret i32 12

				for.body: ; preds = %for.body.preheader, %for.body
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
				%idxprom = zext i32 %i.08 to i64
				%arrayidx = getelementptr inbounds %struct.S, %struct.S* %b, i64 %idxprom
				%arrayidx2 = getelementptr inbounds %struct.S, %struct.S* %a, i64 %idxprom
				%0 = bitcast %struct.S* %arrayidx2 to i8*
				%1 = bitcast %struct.S* %arrayidx to i8*
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 4 dereferenceable(12) %0, i8* nonnull align 4 dereferenceable(12) %1, i64 12, i1 false)
				%inc = add nuw nsw i32 %i.08, 1
				%cmp = icmp slt i32 %inc, %n
				br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
				}

				%struct.R = type <{ i8, i32, i8 }>

				; void copy_noalias_read(S* __restrict x, S* __restrict y, int n, int &s) {
				; for (int i = 0; i < n; i++) {
				; x[i] = y[i];
				; s += y[i].b;
				; }
				; }

				; Function Attrs: nofree nounwind uwtable mustprogress
				; Positive test: the loop also loads from the memcpy source. The CHECK lines expect
				; the memcpy hoisted to for.body.lr.ph (6 * n bytes) while the load stays in for.body.
				define dso_local void @copy_noalias_read(%struct.R* noalias nocapture %x, %struct.R* noalias nocapture readonly %y, i32 %n, i32* nocapture nonnull align 4 dereferenceable(4) %s) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias_read(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[X1:%.*]] = bitcast %struct.R* [[X:%.*]] to i8*
				; CHECK-NEXT: [[Y2:%.*]] = bitcast %struct.R* [[Y:%.*]] to i8*
				; CHECK-NEXT: [[CMP11:%.*]] = icmp sgt i32 [[N:%.*]], 0
				; CHECK-NEXT: br i1 [[CMP11]], label [[FOR_BODY_LR_PH:%.*]], label [[FOR_COND_CLEANUP:%.*]]
				; CHECK: for.body.lr.ph:
				; CHECK-NEXT: [[S_PROMOTED:%.*]] = load i32, i32* [[S:%.*]], align 4
				; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
				; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 6
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[X1]], i8* align 1 [[Y2]], i64 [[TMP1]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.cond.for.cond.cleanup_crit_edge:
				; CHECK-NEXT: [[ADD_LCSSA:%.*]] = phi i32 [ [[ADD:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: store i32 [[ADD_LCSSA]], i32* [[S]], align 4
				; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret void
				; CHECK: for.body:
				; CHECK-NEXT: [[ADD13:%.*]] = phi i32 [ [[S_PROMOTED]], [[FOR_BODY_LR_PH]] ], [ [[ADD]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[I_012:%.*]] = phi i32 [ 0, [[FOR_BODY_LR_PH]] ], [ [[INC:%.*]], [[FOR_BODY]] ]
				; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_012]] to i64
				; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds [[STRUCT_R:%.*]], %struct.R* [[X]], i64 [[IDXPROM]], i32 0
				; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 0
				; CHECK-NEXT: [[B:%.*]] = getelementptr inbounds [[STRUCT_R]], %struct.R* [[Y]], i64 [[IDXPROM]], i32 1
				; CHECK-NEXT: [[TMP4:%.*]] = load i32, i32* [[B]], align 1
				; CHECK-NEXT: [[ADD]] = add nsw i32 [[ADD13]], [[TMP4]]
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_012]], 1
				; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
				; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_FOR_COND_CLEANUP_CRIT_EDGE:%.*]]
				;
				entry:
				%cmp11 = icmp sgt i32 %n, 0
				br i1 %cmp11, label %for.body.lr.ph, label %for.cond.cleanup

				for.body.lr.ph: ; preds = %entry
				%s.promoted = load i32, i32* %s, align 4
				br label %for.body

				for.cond.for.cond.cleanup_crit_edge: ; preds = %for.body
				%add.lcssa = phi i32 [ %add, %for.body ]
				store i32 %add.lcssa, i32* %s, align 4
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.for.cond.cleanup_crit_edge, %entry
				ret void

				for.body: ; preds = %for.body.lr.ph, %for.body
				%add13 = phi i32 [ %s.promoted, %for.body.lr.ph ], [ %add, %for.body ]
				%i.012 = phi i32 [ 0, %for.body.lr.ph ], [ %inc, %for.body ]
				%idxprom = zext i32 %i.012 to i64
				%0 = getelementptr inbounds %struct.R, %struct.R* %x, i64 %idxprom, i32 0
				%1 = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 0
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(6) %0, i8* nonnull align 1 dereferenceable(6) %1, i64 6, i1 false)
				%b = getelementptr inbounds %struct.R, %struct.R* %y, i64 %idxprom, i32 1
				%2 = load i32, i32* %b, align 1
				%add = add nsw i32 %add13, %2
				%inc = add nuw nsw i32 %i.012, 1
				%cmp = icmp slt i32 %inc, %n
				br i1 %cmp, label %for.body, label %for.cond.for.cond.cleanup_crit_edge
				}

				; unsigned copy_noalias_negative_stride(S* __restrict__ a, S* b, int n) {
				; for (int i = n; i >= 0; i--) {
				; a[i] = b[i];
				; }
				; return sizeof(a[0]);
				; }

				; Function Attrs: nofree nosync nounwind uwtable mustprogress
				; Positive test with a descending index (i = n .. 0): the CHECK lines expect one
				; preheader memcpy of 12 * n + 12 bytes and no memcpy left in the loop block.
				define dso_local i32 @copy_noalias_negative_stride(%struct.S* noalias nocapture %0, %struct.S* nocapture readonly %1, i32 %2) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias_negative_stride(
				; CHECK-NEXT: [[TMP4:%.*]] = bitcast %struct.S* [[TMP0:%.*]] to i8*
				; CHECK-NEXT: [[TMP5:%.*]] = bitcast %struct.S* [[TMP1:%.*]] to i8*
				; CHECK-NEXT: [[TMP6:%.*]] = icmp sgt i32 [[TMP2:%.*]], -1
				; CHECK-NEXT: br i1 [[TMP6]], label [[TMP7:%.*]], label [[TMP12:%.*]]
				; CHECK: 7:
				; CHECK-NEXT: [[TMP8:%.*]] = zext i32 [[TMP2]] to i64
				; CHECK-NEXT: [[TMP9:%.*]] = mul nuw nsw i64 [[TMP8]], 12
				; CHECK-NEXT: [[TMP10:%.*]] = add nuw nsw i64 [[TMP9]], 12
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 [[TMP4]], i8* align 4 [[TMP5]], i64 [[TMP10]], i1 false)
				; CHECK-NEXT: br label [[TMP13:%.*]]
				; CHECK: 11:
				; CHECK-NEXT: br label [[TMP12]]
				; CHECK: 12:
				; CHECK-NEXT: ret i32 12
				; CHECK: 13:
				; CHECK-NEXT: [[TMP14:%.*]] = phi i32 [ [[TMP20:%.*]], [[TMP13]] ], [ [[TMP2]], [[TMP7]] ]
				; CHECK-NEXT: [[TMP15:%.*]] = zext i32 [[TMP14]] to i64
				; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1]], i64 [[TMP15]]
				; CHECK-NEXT: [[TMP17:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0]], i64 [[TMP15]]
				; CHECK-NEXT: [[TMP18:%.*]] = bitcast %struct.S* [[TMP17]] to i8*
				; CHECK-NEXT: [[TMP19:%.*]] = bitcast %struct.S* [[TMP16]] to i8*
				; CHECK-NEXT: [[TMP20]] = add nsw i32 [[TMP14]], -1
				; CHECK-NEXT: [[TMP21:%.*]] = icmp sgt i32 [[TMP14]], 0
				; CHECK-NEXT: br i1 [[TMP21]], label [[TMP13]], label [[TMP11:%.*]]
				;
				%4 = icmp sgt i32 %2, -1
				br i1 %4, label %5, label %7

				5: ; preds = %3
				br label %8

				6: ; preds = %8
				br label %7

				7: ; preds = %6, %3
				ret i32 12

				8: ; preds = %5, %8
				%9 = phi i32 [ %15, %8 ], [ %2, %5 ]
				%10 = zext i32 %9 to i64
				%11 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %10
				%12 = getelementptr inbounds %struct.S, %struct.S* %0, i64 %10
				%13 = bitcast %struct.S* %12 to i8*
				%14 = bitcast %struct.S* %11 to i8*
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %13, i8* noundef nonnull align 4 dereferenceable(12) %14, i64 12, i1 false)
				%15 = add nsw i32 %9, -1
				%16 = icmp sgt i32 %9, 0
				br i1 %16, label %8, label %6
				}

				; unsigned copy_noalias_opposite_stride(S* __restrict__ a, S* b, int n) {
				; for (int i = 0, j = n; i < n && j >= 0; i++, j--) {
				; a[i] = b[j];
				; }
				; return sizeof(a[0]);
				; }

				; Function Attrs: nofree nosync nounwind uwtable mustprogress
				; Negative test: the store index ascends while the load index descends, so the
				; CHECK lines expect the per-iteration memcpy to remain in the loop block.
				define dso_local i32 @copy_noalias_opposite_stride(%struct.S* noalias nocapture %0, %struct.S* nocapture readonly %1, i32 %2) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias_opposite_stride(
				; CHECK-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[TMP2:%.*]], 0
				; CHECK-NEXT: br i1 [[TMP4]], label [[TMP5:%.*]], label [[TMP7:%.*]]
				; CHECK: 5:
				; CHECK-NEXT: br label [[TMP8:%.*]]
				; CHECK: 6:
				; CHECK-NEXT: br label [[TMP7]]
				; CHECK: 7:
				; CHECK-NEXT: ret i32 12
				; CHECK: 8:
				; CHECK-NEXT: [[TMP9:%.*]] = phi i32 [ [[TMP18:%.*]], [[TMP8]] ], [ [[TMP2]], [[TMP5]] ]
				; CHECK-NEXT: [[TMP10:%.*]] = phi i32 [ [[TMP17:%.*]], [[TMP8]] ], [ 0, [[TMP5]] ]
				; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP9]] to i64
				; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds [[STRUCT_S:%.*]], %struct.S* [[TMP1:%.*]], i64 [[TMP11]]
				; CHECK-NEXT: [[TMP13:%.*]] = zext i32 [[TMP10]] to i64
				; CHECK-NEXT: [[TMP14:%.*]] = getelementptr inbounds [[STRUCT_S]], %struct.S* [[TMP0:%.*]], i64 [[TMP13]]
				; CHECK-NEXT: [[TMP15:%.*]] = bitcast %struct.S* [[TMP14]] to i8*
				; CHECK-NEXT: [[TMP16:%.*]] = bitcast %struct.S* [[TMP12]] to i8*
				; CHECK-NEXT: tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) [[TMP15]], i8* noundef nonnull align 4 dereferenceable(12) [[TMP16]], i64 12, i1 false)
				; CHECK-NEXT: [[TMP17]] = add nuw nsw i32 [[TMP10]], 1
				; CHECK-NEXT: [[TMP18]] = add nsw i32 [[TMP9]], -1
				; CHECK-NEXT: [[TMP19:%.*]] = icmp slt i32 [[TMP17]], [[TMP2]]
				; CHECK-NEXT: [[TMP20:%.*]] = icmp sgt i32 [[TMP9]], 0
				; CHECK-NEXT: [[TMP21:%.*]] = and i1 [[TMP19]], [[TMP20]]
				; CHECK-NEXT: br i1 [[TMP21]], label [[TMP8]], label [[TMP6:%.*]]
				;
				%4 = icmp sgt i32 %2, 0
				br i1 %4, label %5, label %7

				5: ; preds = %3
				br label %8

				6: ; preds = %8
				br label %7

				7: ; preds = %6, %3
				ret i32 12

				8: ; preds = %5, %8
				%9 = phi i32 [ %18, %8 ], [ %2, %5 ]
				%10 = phi i32 [ %17, %8 ], [ 0, %5 ]
				%11 = zext i32 %9 to i64
				%12 = getelementptr inbounds %struct.S, %struct.S* %1, i64 %11
				%13 = zext i32 %10 to i64
				%14 = getelementptr inbounds %struct.S, %struct.S* %0, i64 %13
				%15 = bitcast %struct.S* %14 to i8*
				%16 = bitcast %struct.S* %12 to i8*
				tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* noundef nonnull align 4 dereferenceable(12) %15, i8* noundef nonnull align 4 dereferenceable(12) %16, i64 12, i1 false)
				%17 = add nuw nsw i32 %10, 1
				%18 = add nsw i32 %9, -1
				%19 = icmp slt i32 %17, %2
				%20 = icmp sgt i32 %9, 0
				%21 = and i1 %19, %20
				br i1 %21, label %8, label %6
				}

				%struct.SPacked = type <{ i32, i32, i8 }>

				; Function Attrs: nofree nounwind uwtable mustprogress
				; Positive test on a packed 9-byte struct: the CHECK lines expect one preheader
				; memcpy of 9 * n bytes with align 1 and no memcpy left in for.body.
				define dso_local i32 @copy_noalias_packed(%struct.SPacked* noalias nocapture %a, %struct.SPacked* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias_packed(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SPacked* [[A:%.*]] to i8*
				; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SPacked* [[B:%.*]] to i8*
				; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
				; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
				; CHECK-NEXT: [[TMP1:%.*]] = mul nuw nsw i64 [[TMP0]], 9
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 1 [[A1]], i8* align 1 [[B2]], i64 [[TMP1]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.cond.cleanup.loopexit:
				; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret i32 9
				; CHECK: for.body:
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SPACKED:%.*]], %struct.SPacked* [[B]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SPACKED]], %struct.SPacked* [[A]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX2]] to i8*
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SPacked* [[ARRAYIDX]] to i8*
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
				; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
				;
				entry:
				%cmp7 = icmp sgt i32 %n, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.body
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret i32 9

				for.body: ; preds = %for.body.preheader, %for.body
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
				%idxprom = zext i32 %i.08 to i64
				%arrayidx = getelementptr inbounds %struct.SPacked, %struct.SPacked* %b, i64 %idxprom
				%arrayidx2 = getelementptr inbounds %struct.SPacked, %struct.SPacked* %a, i64 %idxprom
				%0 = bitcast %struct.SPacked* %arrayidx2 to i8*
				%1 = bitcast %struct.SPacked* %arrayidx to i8*
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 1 dereferenceable(9) %0, i8* nonnull align 1 dereferenceable(9) %1, i64 9, i1 false)
				%inc = add nuw nsw i32 %i.08, 1
				%cmp = icmp slt i32 %inc, %n
				br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
				}

				%struct.SAligned = type { i32, i32, i8, [7 x i8] }

				; Positive test on a 16-byte struct copied with align 16: the CHECK lines expect one
				; preheader memcpy of n << 4 bytes that keeps align 16, and no memcpy left in for.body.
				define dso_local i32 @copy_noalias_aligned(%struct.SAligned* noalias nocapture %a, %struct.SAligned* nocapture readonly %b, i32 %n) local_unnamed_addr #0 {
				; CHECK-LABEL: @copy_noalias_aligned(
				; CHECK-NEXT: entry:
				; CHECK-NEXT: [[A1:%.*]] = bitcast %struct.SAligned* [[A:%.*]] to i8*
				; CHECK-NEXT: [[B2:%.*]] = bitcast %struct.SAligned* [[B:%.*]] to i8*
				; CHECK-NEXT: [[CMP7:%.*]] = icmp sgt i32 [[N:%.*]], 0
				; CHECK-NEXT: br i1 [[CMP7]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]]
				; CHECK: for.body.preheader:
				; CHECK-NEXT: [[TMP0:%.*]] = zext i32 [[N]] to i64
				; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 4
				; CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 16 [[A1]], i8* align 16 [[B2]], i64 [[TMP1]], i1 false)
				; CHECK-NEXT: br label [[FOR_BODY:%.*]]
				; CHECK: for.cond.cleanup.loopexit:
				; CHECK-NEXT: br label [[FOR_COND_CLEANUP]]
				; CHECK: for.cond.cleanup:
				; CHECK-NEXT: ret i32 16
				; CHECK: for.body:
				; CHECK-NEXT: [[I_08:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_BODY]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
				; CHECK-NEXT: [[IDXPROM:%.*]] = zext i32 [[I_08]] to i64
				; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED:%.*]], %struct.SAligned* [[B]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds [[STRUCT_SALIGNED]], %struct.SAligned* [[A]], i64 [[IDXPROM]]
				; CHECK-NEXT: [[TMP2:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX2]] to i8*
				; CHECK-NEXT: [[TMP3:%.*]] = bitcast %struct.SAligned* [[ARRAYIDX]] to i8*
				; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_08]], 1
				; CHECK-NEXT: [[CMP:%.*]] = icmp slt i32 [[INC]], [[N]]
				; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
				;
				entry:
				%cmp7 = icmp sgt i32 %n, 0
				br i1 %cmp7, label %for.body.preheader, label %for.cond.cleanup

				for.body.preheader: ; preds = %entry
				br label %for.body

				for.cond.cleanup.loopexit: ; preds = %for.body
				br label %for.cond.cleanup

				for.cond.cleanup: ; preds = %for.cond.cleanup.loopexit, %entry
				ret i32 16

				for.body: ; preds = %for.body.preheader, %for.body
				%i.08 = phi i32 [ %inc, %for.body ], [ 0, %for.body.preheader ]
				%idxprom = zext i32 %i.08 to i64
				%arrayidx = getelementptr inbounds %struct.SAligned, %struct.SAligned* %b, i64 %idxprom
				%arrayidx2 = getelementptr inbounds %struct.SAligned, %struct.SAligned* %a, i64 %idxprom
				%0 = bitcast %struct.SAligned* %arrayidx2 to i8*
				%1 = bitcast %struct.SAligned* %arrayidx to i8*
				call void @llvm.memcpy.p0i8.p0i8.i64(i8* nonnull align 16 dereferenceable(16) %0, i8* nonnull align 16 dereferenceable(16) %1, i64 16, i1 false)
				%inc = add nuw nsw i32 %i.08, 1
				%cmp = icmp slt i32 %inc, %n
				br i1 %cmp, label %for.body, label %for.cond.cleanup.loopexit
				}

				; Function Attrs: argmemonly nofree nosync nounwind willreturn
				declare void @llvm.memcpy.p0i8.p0i8.i64(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i64, i1 immarg) #1

llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll

	; NOTE: Assertions have been autogenerated by utils/update_test_checks.py			; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
	; RUN: opt -basic-aa -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 \| FileCheck %s			; RUN: opt -basic-aa -debugify -loop-idiom -pass-remarks=loop-idiom -pass-remarks-analysis=loop-idiom -verify -verify-each -verify-dom-info -verify-loop-info < %s -S 2>&1 \| FileCheck %s

	target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"			target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
	target triple = "x86_64-unknown-linux-gnu"			target triple = "x86_64-unknown-linux-gnu"

	; Check that everything still works when debuginfo is present, and that it is reasonably propagated.			; Check that everything still works when debuginfo is present, and that it is reasonably propagated.

	; void my_basic_memset(char* begin, char* end, char value) {			; void my_basic_memset(char* begin, char* end, char value) {
	; for( ; begin != end; ++begin)			; for( ; begin != end; ++begin)
	; *begin = value;			; *begin = value;
	; }			; }

	; CHECK: remark: <stdin>:4:1: Transformed loop-strided store into a call to llvm.memset.p0i8.i64() function			; CHECK: remark: <stdin>:4:1: Transformed loop-strided store in _Z15my_basic_memsetPcS_c function into a call to llvm.memset.p0i8.i64() intrinsic

	define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) {			define void @_Z15my_basic_memsetPcS_c(i8* %ptr, i8* %end, i8 %value) {
	; CHECK-LABEL: @_Z15my_basic_memsetPcS_c(			; CHECK-LABEL: @_Z15my_basic_memsetPcS_c(
	; CHECK-NEXT: entry:			; CHECK-NEXT: entry:
	; CHECK-NEXT: [[PTR1:%.]] = ptrtoint i8 [[PTR:%.*]] to i64			; CHECK-NEXT: [[PTR1:%.]] = ptrtoint i8 [[PTR:%.*]] to i64
	; CHECK-NEXT: [[CMP3:%.]] = icmp eq i8 [[PTR]], [[END:%.*]], !dbg !15			; CHECK-NEXT: [[CMP3:%.]] = icmp eq i8 [[PTR]], [[END:%.*]], !dbg !15
	; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP3]], metadata !9, metadata !DIExpression()), !dbg !15			; CHECK-NEXT: call void @llvm.dbg.value(metadata i1 [[CMP3]], metadata !9, metadata !DIExpression()), !dbg !15
	; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END:%.]], label [[FOR_BODY_PREHEADER:%.]], !dbg !16			; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END:%.]], label [[FOR_BODY_PREHEADER:%.]], !dbg !16
	Show All 33 Lines

This is an archive of the discontinued LLVM Phabricator instance.

[loop-idiom] Hoist loop memcpys to loop preheaderClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 342913

llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp

llvm/test/Transforms/LoopIdiom/memcpy-debugify-remarks.ll

llvm/test/Transforms/LoopIdiom/memcpy-intrinsic-different-types.ll

llvm/test/Transforms/LoopIdiom/memcpy-intrinsic.ll

llvm/test/Transforms/LoopIdiom/memset-debugify-remarks.ll

[loop-idiom] Hoist loop memcpys to loop preheader
ClosedPublic