Page MenuHomePhabricator

[InstCombine] Rewrite (switch (zext X)) as (switch X).
Needs ReviewPublic

Authored by wecing on Sun, Dec 27, 9:53 PM.

Details

Reviewers
spatel
lebedev.ri
Summary

InstCombine today rewrites (zext X == zext Y) as (X == Y), but the same
mechanism is missing for switch; instead, it tries to shrink the switch
condition type to be as small as possible. This difference in behavior
reduces the effectiveness of global value numbering.

Diff Detail

Unit TestsFailed

TimeTest
460 msx64 debian > AddressSanitizer-x86_64-linux.TestCases::strcmp.c
Script: -- : 'RUN: at line 1'; /mnt/disks/ssd0/agent/llvm-project/build/./bin/clang -fsanitize=address -mno-omit-leaf-frame-pointer -fno-omit-frame-pointer -fno-optimize-sibling-calls -gline-tables-only -m64 /mnt/disks/ssd0/agent/llvm-project/compiler-rt/test/asan/TestCases/strcmp.c -o /mnt/disks/ssd0/agent/llvm-project/build/projects/compiler-rt/test/asan/X86_64LinuxConfig/TestCases/Output/strcmp.c.tmp

Event Timeline

wecing created this revision.Sun, Dec 27, 9:53 PM
wecing requested review of this revision.Sun, Dec 27, 9:53 PM
Herald added a project: Restricted Project. · View Herald TranscriptSun, Dec 27, 9:53 PM

Here is a longer example showing the improvements of this change:

target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"

define i32 @f1({ i32, i32 }* %x, { i32, i32 }* %y) {
start:
  %0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
  %1 = load i32, i32* %0, align 4, !range !5
  %_1 = zext i32 %1 to i64
  %2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
  %3 = load i32, i32* %2, align 4, !range !5
  %_3 = zext i32 %3 to i64
  %cond1 = icmp eq i64 %_1, %_3
  br i1 %cond1, label %bb1, label %bb41

bb1:
  switch i64 %_1, label %bb41 [
    i64 0, label %bb2
    i64 1, label %bb5
    i64 2, label %bb7
    i64 3, label %bb9
  ]

bb2:
  ret i32 100

bb5:
  ret i32 101

bb7:
  ret i32 102

bb9:
  %4 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
  %5 = load i32, i32* %4, align 4, !range !5
  %_5 = zext i32 %5 to i64
  %cond2 = icmp eq i64 %_5, 3
  br i1 %cond2, label %bb10, label %bb11

bb10:
  ret i32 1001

bb11:
  ret i32 1002

bb41:
  unreachable
}

!5 = !{i32 0, i32 4}

Without this patch, opt -S -O3 no.ll produces:

define i32 @f1({ i32, i32 }* nocapture readonly %x, { i32, i32 }* nocapture readonly %y) local_unnamed_addr #0 {
start:
  %0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
  %1 = load i32, i32* %0, align 4, !range !0
  %_1 = zext i32 %1 to i64
  %2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
  %3 = load i32, i32* %2, align 4, !range !0
  %cond1 = icmp eq i32 %1, %3
  tail call void @llvm.assume(i1 %cond1)
  switch i64 %_1, label %bb41 [
    i64 0, label %bb2
    i64 1, label %bb5
    i64 2, label %bb7
    i64 3, label %bb9
  ]

bb2:                                              ; preds = %bb9, %bb7, %bb5, %start
  %merge = phi i32 [ 100, %start ], [ 101, %bb5 ], [ 102, %bb7 ], [ %., %bb9 ]
  ret i32 %merge

bb5:                                              ; preds = %start
  br label %bb2

bb7:                                              ; preds = %start
  br label %bb2

bb9:                                              ; preds = %start
  %cond2 = icmp eq i32 %1, 3
  %. = select i1 %cond2, i32 1001, i32 1002
  br label %bb2

bb41:                                             ; preds = %start
  unreachable
}

!0 = !{i32 0, i32 4}

Note that the icmp eq i32 %1, 3 in bb9 is actually unnecessary, because in the input, icmp eq i64 %_1, %_3 from the start block must be true.

With this fix, the output becomes:

define i32 @f1({ i32, i32 }* nocapture readonly %x, { i32, i32 }* nocapture readonly %y) local_unnamed_addr #0 {
start:
  %0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
  %1 = load i32, i32* %0, align 4, !range !0
  %2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
  %3 = load i32, i32* %2, align 4, !range !0
  %cond1 = icmp eq i32 %1, %3
  tail call void @llvm.assume(i1 %cond1)
  switch i32 %1, label %bb41 [
    i32 0, label %bb2
    i32 1, label %bb5
    i32 2, label %bb7
    i32 3, label %bb9
  ]

bb2:                                              ; preds = %bb9, %bb7, %bb5, %start
  %merge = phi i32 [ 100, %start ], [ 101, %bb5 ], [ 102, %bb7 ], [ 1001, %bb9 ]
  ret i32 %merge

bb5:                                              ; preds = %start
  br label %bb2

bb7:                                              ; preds = %start
  br label %bb2

bb9:                                              ; preds = %start
  br label %bb2

bb41:                                             ; preds = %start
  unreachable
}

!0 = !{i32 0, i32 4}

The difference is also reflected in output assembly.

lebedev.ri added a subscriber: lebedev.ri.
lebedev.ri added inline comments.
llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
2952–2953

Why do we need this?

wecing marked an inline comment as done.Mon, Dec 28, 11:32 AM
wecing added inline comments.
llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
2952–2953

This handles cases like this:

%zx = zext i32 %x to i64
switch i64 %zx, label %bb2 [
  i64 0x7fff_ffff_ffff_ffff, label %bb1 ; exceeds i32 range
]

The 0x7fff_ffff_ffff_ffff has 1 leading zero, so LeadingKnownZeros will be 1, and NewWidth will be 64-1 = 63, which exceeds the range of i32.

wecing marked an inline comment as done.Mon, Dec 28, 11:40 AM

Aha, i see. I think the existing fold should be fixed instead,
because if it doesn't fire (because shouldChangeType() said so?),
how do we know we can ignore that hook here?

@lebedev.ri , which "hook" did you mean?

I think the existing fold is too aggressive. Please take a look at the example I gave in an earlier comment -- doing a less strict fold first gives GVN a chance to optimize the IR.
When the new fold is triggered, the existing more aggressive fold would be triggered (if NewWidth happens to be a standard integer type, e.g. i8, so that shouldChangeType() returns true) the next time InstCombine gets executed.

@lebedev.ri , which "hook" did you mean?

The shouldChangeType()

I think the existing fold is too aggressive.

Please take a look at the example I gave in an earlier comment -- doing a less strict fold first gives GVN a chance to optimize the IR.

Could you please post a godbolt example showing *how* GVN would optimize it? https://godbolt.org/z/cq5M5f
But that sounds like a GVN bug to me.

When the new fold is triggered, the existing more aggressive fold would be triggered (if NewWidth happens to be a standard integer type, e.g. i8, so that shouldChangeType() returns true) the next time InstCombine gets executed.

Ah yes, sorry, so the example would already be optimized, just to a different width.
https://godbolt.org/z/37jMeM

@lebedev.ri , here is what GVN does for the current output, and this is the new behavior. Note that the first one does

%cond2 = icmp eq i32 %1, 3
br i1 %cond2, label %bb10, label %bb11

while in the second one it's just

br i1 true, label %bb10, label %bb11

that sounds like a GVN bug to me

Probably, but I am not sure whether GVN or InstCombine (or both?) is supposed to know that if zext X == 3 then X == 3.

Ah yes, sorry, so the example would already be optimized, just to a different width.

Not if you include the target datalayout.

I see, thank you.
This reinforces my point, this really looks like a GVN bug.

@lebedev.ri , I made another patch by changing only GVN -- please take a look at D93888. If it could be approved, I will close this PR ("diff"?).