Download Raw Diff

Details

Reviewers

majnemer
hfinkel
lattner
nikic
spatel
lebedev.ri

Commits

rG302b80abf036: [InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation…

Summary

Elimination of bitcasts with void pointer arguments results in GEPs with pure byte indexes. These GEPs do not preserve struct/array information and interrupt phi address translation in later pipeline stages.

Here is the original motivation for this patch:

#include<stdio.h>
#include<malloc.h>

typedef struct __Node{

  double f;
  struct __Node *next;

} Node;


void foo () {
  Node *a = (Node*) malloc (sizeof(Node));
  a->next = NULL;
  a->f = 11.5f;

  Node *ptr = a;
  double sum = 0.0f;
  while (ptr) {
    sum += ptr->f;
    ptr = ptr->next;
  }
  printf("%f\n", sum);
}

By explicit assignment a->next = NULL, we can infer the length of the link list is 1. In this case we can eliminate while loop traversal entirely. This elimination is supposed to be performed by GVN/MemoryDependencyAnalysis/PhiTranslation .

The final IR before this patch:

define dso_local void @foo(i32* nocapture readnone %r) local_unnamed_addr #0 {
entry:
  %call = tail call noalias dereferenceable_or_null(16) i8* @malloc(i64 16) #2
  %next = getelementptr inbounds i8, i8* %call, i64 8
  %0 = bitcast i8* %next to %struct.__Node**
  store %struct.__Node* null, %struct.__Node** %0, align 8, !tbaa !2
  %f = bitcast i8* %call to double*
  store double 1.150000e+01, double* %f, align 8, !tbaa !8
  %tobool12 = icmp eq i8* %call, null
  br i1 %tobool12, label %while.end, label %while.body.lr.ph

while.body.lr.ph:                                 ; preds = %entry
  %1 = bitcast i8* %call to %struct.__Node*
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %sum.014 = phi double [ 0.000000e+00, %while.body.lr.ph ], [ %add, %while.body ]
  %ptr.013 = phi %struct.__Node* [ %1, %while.body.lr.ph ], [ %3, %while.body ]
  %f1 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 0
  %2 = load double, double* %f1, align 8, !tbaa !8
  %add = fadd contract double %sum.014, %2
  %next2 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 1
  %3 = load %struct.__Node*, %struct.__Node** %next2, align 8, !tbaa !2
  %tobool = icmp eq %struct.__Node* %3, null
  br i1 %tobool, label %while.end, label %while.body

while.end:                                        ; preds = %while.body, %entry
  %sum.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add, %while.body ]
  %call3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %sum.0.lcssa)
  ret void
}

Final IR after this patch:

; Function Attrs: nofree nounwind
define dso_local void @foo(i32* nocapture readnone %r) local_unnamed_addr #0 {
while.end:
  %call3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double 1.150000e+01)
  ret void
}

IR before GVN before this patch:

define dso_local void @foo(i32* nocapture readnone %r) local_unnamed_addr #0 {
entry:
  %call = tail call noalias dereferenceable_or_null(16) i8* @malloc(i64 16) #2
  %next = getelementptr inbounds i8, i8* %call, i64 8
  %0 = bitcast i8* %next to %struct.__Node**
  store %struct.__Node* null, %struct.__Node** %0, align 8, !tbaa !2
  %f = bitcast i8* %call to double*
  store double 1.150000e+01, double* %f, align 8, !tbaa !8
  %tobool12 = icmp eq i8* %call, null
  br i1 %tobool12, label %while.end, label %while.body.lr.ph

while.body.lr.ph:                                 ; preds = %entry
  %1 = bitcast i8* %call to %struct.__Node*
  br label %while.body

while.body:                                       ; preds = %while.body.lr.ph, %while.body
  %sum.014 = phi double [ 0.000000e+00, %while.body.lr.ph ], [ %add, %while.body ]
  %ptr.013 = phi %struct.__Node* [ %1, %while.body.lr.ph ], [ %3, %while.body ]
  %f1 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 0
  %2 = load double, double* %f1, align 8, !tbaa !8
  %add = fadd contract double %sum.014, %2
  %next2 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 1
  %3 = load %struct.__Node*, %struct.__Node** %next2, align 8, !tbaa !2
  %tobool = icmp eq %struct.__Node* %3, null
  br i1 %tobool, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  %add.lcssa = phi double [ %add, %while.body ]
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %sum.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
  %call3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %sum.0.lcssa)
  ret void
}

IR before GVN after this patch:

define dso_local void @foo(i32* nocapture readnone %r) local_unnamed_addr #0 {
entry:
  %call = tail call noalias dereferenceable_or_null(16) i8* @malloc(i64 16) #2
  %0 = bitcast i8* %call to %struct.__Node*
  %next = getelementptr inbounds %struct.__Node, %struct.__Node* %0, i64 0, i32 1
  store %struct.__Node* null, %struct.__Node** %next, align 8, !tbaa !2
  %f = getelementptr inbounds %struct.__Node, %struct.__Node* %0, i64 0, i32 0
  store double 1.150000e+01, double* %f, align 8, !tbaa !8
  %tobool12 = icmp eq i8* %call, null
  br i1 %tobool12, label %while.end, label %while.body.preheader

while.body.preheader:                             ; preds = %entry
  br label %while.body

while.body:                                       ; preds = %while.body.preheader, %while.body
  %sum.014 = phi double [ %add, %while.body ], [ 0.000000e+00, %while.body.preheader ]
  %ptr.013 = phi %struct.__Node* [ %2, %while.body ], [ %0, %while.body.preheader ]
  %f1 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 0
  %1 = load double, double* %f1, align 8, !tbaa !8
  %add = fadd contract double %sum.014, %1
  %next2 = getelementptr inbounds %struct.__Node, %struct.__Node* %ptr.013, i64 0, i32 1
  %2 = load %struct.__Node*, %struct.__Node** %next2, align 8, !tbaa !2
  %tobool = icmp eq %struct.__Node* %2, null
  br i1 %tobool, label %while.end.loopexit, label %while.body

while.end.loopexit:                               ; preds = %while.body
  %add.lcssa = phi double [ %add, %while.body ]
  br label %while.end

while.end:                                        ; preds = %while.end.loopexit, %entry
  %sum.0.lcssa = phi double [ 0.000000e+00, %entry ], [ %add.lcssa, %while.end.loopexit ]
  %call3 = tail call i32 (i8*, ...) @printf(i8* nonnull dereferenceable(1) getelementptr inbounds ([4 x i8], [4 x i8]* @.str, i64 0, i64 0), double %sum.0.lcssa)
  ret void
}

The phi translation fails before this patch and it prevents GVN to remove the loop. The reason for this failure is in InstCombine. When the Instruction combining pass decides to convert:

%call = tail call noalias dereferenceable_or_null(16) i8* @malloc(i64 16)
 %0 = bitcast i8* %call to %struct.__Node*
 %next = getelementptr inbounds %struct.__Node, %struct.__Node* %0, i64 0, i32 1
 store %struct.__Node* null, %struct.__Node** %next

%call = tail call noalias dereferenceable_or_null(16) i8* @malloc(i64 16) 
  %next = getelementptr inbounds i8, i8* %call, i64 8
  %0 = bitcast i8* %next to %struct.__Node**
  store %struct.__Node* null, %struct.__Node** %0

GEP instructions with pure byte indexes (e.g. getelementptr inbounds i8, i8* %call, i64 8) are obstacles for address translation. address translation is looking for structural similarity between GEPs and these GEPs usually do not match since they have different structure.

This change will cause couple of failures in LLVM-tests. However, in all cases we need to change expected result by the test. I will update those tests as soon as I get green light on this patch.

Diff Detail

Unit TestsFailed

	Time	Test
	60 ms	x64 debian > Clang.CodeGen::arm64_32-vaarg.c
	200 ms	x64 debian > Clang.CodeGen::builtins-ppc-pair-mma.c
	70 ms	x64 debian > LLVM.Transforms/Coroutines::ArgAddr.ll
	130 ms	x64 debian > LLVM.Transforms/Coroutines::coro-async.ll
	90 ms	x64 debian > LLVM.Transforms/Coroutines::coro-retcon-alloca.ll
		View Full Test Results (28 Failed)

Event Timeline

hjooybar2 created this revision.Feb 17 2021, 10:46 AM

Herald added subscribers: kosarev, hiraditya. · View Herald TranscriptFeb 17 2021, 10:46 AM

hjooybar2 requested review of this revision.Feb 17 2021, 10:46 AM

Herald added a project: Restricted Project. · View Herald TranscriptFeb 17 2021, 10:46 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

Haven't looked into the motivating case yet, but what this patch does is definitely not correct, because it is not permitted to make optimization decisions based on pointer element types.

This revision now requires changes to proceed.Feb 17 2021, 11:27 AM

Harbormaster completed remote builds in B89571: Diff 324319.Feb 17 2021, 11:53 AM

In D96881#2569216, @nikic wrote:

Haven't looked into the motivating case yet, but what this patch does is definitely not correct, because it is not permitted to make optimization decisions based on pointer element types.

+1.

I have updated my implementation base on reviewer's comments. The only reason that I had used void* pointer detection instead of Alloc like function was to cover more cases. However, since using pointer size in optimization passes are prohibited, I changed the implementation back to Alloc like function

Thanks! Could you please include the necessary test changes, so it's possible to see the impact of the patch? Updating tests should just be a matter of running llvm/utils/update_test_checks.py --opt-bin build/bin/opt llvm/test/Transforms/InstCombine/test_name.ll.

Those problems should be treated as problems in phi translation and MemoryDependencyAnalysis.
Pointer types are going away, so even if this papers over them for now, it will break again.

See e.g. https://lists.llvm.org/pipermail/llvm-dev/2019-December/137684.html

This revision now requires changes to proceed.Feb 22 2021, 11:51 PM

hjooybar2 updated this revision to Diff 329457.Mar 9 2021, 1:23 PM

@nikic By making a little modification in my changeset, now none of existing tests are failing. I added a new test to show how the code is supposed to look like.

Here is how newly added test would have looked like with the main branch:

; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[CALL:%.*]] = call noalias dereferenceable_or_null(16) i8* @malloc(i64 16)
; CHECK-NEXT:    [[G3:%.*]] = getelementptr i8, i8* [[CALL]], i64 12
; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i8* [[G3]] to i32*
; CHECK-NEXT:    [[A_C:%.*]] = load i32, i32* [[TMP0]], align 4
; CHECK-NEXT:    ret i32 [[A_C]]
;

My modification causes the InstCombine to behave differently between Alloca and malloc like functions. It actually makes sense, since Alloca usually has type but malloc is always void*

In conclusion, by latest update, all tests passed successfully and a new test is added to show how the optimization looks like.

@lebedev.ri
There is nothing wrong with phi translation and MemoryDependencyAnalysis to fix. This changeset is not introducing a new InstCombine to fix an issue in other passes. Current implementation of InstCombine is carelessly (without doing necessary checks) combine instructions. This changeset merely prevent some of these unprofitable combines. It doesn't make sense to change other passes when the instCombine itself is introducing the problem. Even without considering MemoryDependencyAnalysis there is no benefit in combining bitcast and gep in the testcase I added. This fusion will create a GEP that uses pure byte index instead of element index in a struct.

Harbormaster completed remote builds in B92952: Diff 329457.Mar 10 2021, 3:02 AM

I'm not really sure what to do here. I see @lebedev.ri's point that ideally phi translation would be capable of detecting that an "equivalent" GEP exists, but this doesn't really fit into the current model of the phi translation analysis, so this seems like a pragmatic workaround.

The real solution here of course is to remove both pointer element types and GEP types and thus obviate the problem, but that's still far away.

In D96881#2615088, @hjooybar2 wrote:

<...>

(unless i'm not aware of a RFC newer than https://lists.llvm.org/pipermail/llvm-dev/2019-December/137684.html)
I think my main point is being ignored:
This fix is temporary. After the pointer types are gone from IR,
the problem will reappear, because the GEP's will have pointer offsets,
they won't know anything about structs,
see http://lists.llvm.org/pipermail/llvm-dev/attachments/20191218/761e8893/attachment.obj:

define i8* @gep(p0 %ptr) {
  %res = getelementptr { i8, i32 }, p0 %ptr, i32 5, i32 0
  ret i8* %res
}

So it would be most future-proof to work towards making that transition as easy as possible,
by enhancing whatever passes aren't happy about this existing transformation.

All that being said, i understand that it is a *much* bigger endeavor than just adjusting InstCombine.

llvm/test/Transforms/InstCombine/getelementptr.ll
1275 ↗	(On Diff #329457)	For my own reference, the current result is: https://godbolt.org/z/TTE3nM

In D96881#2617663, @lebedev.ri wrote:

In D96881#2615088, @hjooybar2 wrote:

<...>

(unless i'm not aware of a RFC newer than https://lists.llvm.org/pipermail/llvm-dev/2019-December/137684.html)
I think my main point is being ignored:
This fix is temporary. After the pointer types are gone from IR,
the problem will reappear, because the GEP's will have pointer offsets,
they won't know anything about structs,

Why would this problem reappear? The current transform is what violates opaque pointers, and this patch removes one instance where the transform is applied. The transform uses the origin pointer element type to construct a new GEP, which would not be possible if there are no pointer element types. The whole transform would go away if pointer element types are dropped.

The core problem in this particular case is that GEP's in LLVM are type based, instead of being simple arithmetic with provenance, which means that there is an infinite number of ways to write the same GEP, if you add some pointer casts around it. We use heuristics to pick the used GEP representation, and in this case the choice ends up being unfavorable (though possibly the new choice after this patch is unfavorable in other circumstances -- this is a problem we cannot really resolve without fundamentally changing the GEP representation.)

lebedev.ri resigned from this revision.Mar 10 2021, 10:35 PM

lebedev.ri retitled this revision from Avoid Bitcast-GEP fusion for void pointers to [InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation functions.

@nikic As you said, as soon as the GEPs are being updated to simple arithmetic operations, PHI Translation would never be confused by GEP polymorphs. So this changeset along with this whole instruction-combining is going to be out of the picture. But still, for up until that point, I would suggest to fix this unprofitable instCombine

Is there anything else I can do to help proceeding this changeset?

LGTM per above discussion. It's not ideal, but I think it's the best we can do right now.

This revision is now accepted and ready to land.Mar 15 2021, 2:10 PM

Could you please provide Your Name <your@email> to use for the commit?

hjooybar2 updated this revision to Diff 331079.Mar 16 2021, 1:19 PM

Harbormaster completed remote builds in B94115: Diff 331079.Mar 16 2021, 2:01 PM

Closed by commit rG302b80abf036: [InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation… (authored by hjooybar2, committed by Asif). · Explain WhyMar 16 2021, 2:04 PM

This revision was automatically updated to reflect the committed changes.

Asif added a commit: rG302b80abf036: [InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation….

In D96881#2627389, @nikic wrote:

LGTM per above discussion. It's not ideal, but I think it's the best we can do right now.

Hi All,

I'm seeing a rather numerous set of performance degredations and code size increases on x86 with this patch. I think that's a little unexpected, but do we have benchmarks and runs showing that this is a profitable change to happen in general?

Thanks!

-eric

In D96881#2646035, @echristo wrote:

In D96881#2627389, @nikic wrote:

LGTM per above discussion. It's not ideal, but I think it's the best we can do right now.

Hi All,

I'm seeing a rather numerous set of performance degredations and code size increases on x86 with this patch. I think that's a little unexpected, but do we have benchmarks and runs showing that this is a profitable change to happen in general?

If you're seeing widespread regressions, let's revert this change. My assumption here was that it would fix the motivational case without impacting anything else much, but apparently that's not the case.

If you have any information on the reason for the regression(s), that would of course the great.

nikic mentioned this in D99121: [IR][InstCombine] IntToPtr Produces Typeless Pointer To Byte.Mar 25 2021, 6:00 AM

This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation functions
ClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 324319

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

This is an archive of the discontinued LLVM Phabricator instance.

[InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation functionsClosedPublic

Details

Diff Detail

Unit TestsFailed

Event Timeline

Revision Contents

Diff 324319

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp

[InstCombine] Avoid Bitcast-GEP fusion for pointers directly from allocation functions
ClosedPublic