InstCombine today rewrites (zext X == zext Y) as (X == Y), but the same
mechanism is missing for switch; instead, InstCombine tries to shrink the
switch condition to as narrow a type as possible. This difference in
behavior reduces the effectiveness of global value numbering.
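The soundness of the icmp fold mentioned above can be sketched in a few lines of Python: zero-extension is injective, so comparing the widened values is equivalent to comparing the originals. The bit widths below are illustrative, not taken from the patch.

```python
# Minimal model of zero-extension on unsigned values: the numeric value
# is unchanged, only the bit width grows.
def zext(value: int, from_bits: int, to_bits: int) -> int:
    assert 0 <= value < (1 << from_bits) and to_bits >= from_bits
    return value  # the new high bits are zero, so the value is unchanged

# (zext X == zext Y) holds exactly when (X == Y) holds.
for x in range(8):
    for y in range(8):
        assert (zext(x, 3, 8) == zext(y, 3, 8)) == (x == y)
print("zext equality fold verified for all 3-bit pairs")
```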
Details
- Reviewers
spatel lebedev.ri
Diff Detail
- Repository
- rG LLVM Github Monorepo
Event Timeline
Here is a longer example showing the improvements of this change:
target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
define i32 @f1({ i32, i32 }* %x, { i32, i32 }* %y) {
start:
%0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
%1 = load i32, i32* %0, align 4, !range !5
%_1 = zext i32 %1 to i64
%2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
%3 = load i32, i32* %2, align 4, !range !5
%_3 = zext i32 %3 to i64
%cond1 = icmp eq i64 %_1, %_3
br i1 %cond1, label %bb1, label %bb41
bb1:
switch i64 %_1, label %bb41 [
i64 0, label %bb2
i64 1, label %bb5
i64 2, label %bb7
i64 3, label %bb9
]
bb2:
ret i32 100
bb5:
ret i32 101
bb7:
ret i32 102
bb9:
%4 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
%5 = load i32, i32* %4, align 4, !range !5
%_5 = zext i32 %5 to i64
%cond2 = icmp eq i64 %_5, 3
br i1 %cond2, label %bb10, label %bb11
bb10:
ret i32 1001
bb11:
ret i32 1002
bb41:
unreachable
}
!5 = !{i32 0, i32 4}
Without this patch, opt -S -O3 no.ll produces:
define i32 @f1({ i32, i32 }* nocapture readonly %x, { i32, i32 }* nocapture readonly %y) local_unnamed_addr #0 {
start:
%0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
%1 = load i32, i32* %0, align 4, !range !0
%_1 = zext i32 %1 to i64
%2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
%3 = load i32, i32* %2, align 4, !range !0
%cond1 = icmp eq i32 %1, %3
tail call void @llvm.assume(i1 %cond1)
switch i64 %_1, label %bb41 [
i64 0, label %bb2
i64 1, label %bb5
i64 2, label %bb7
i64 3, label %bb9
]
bb2: ; preds = %bb9, %bb7, %bb5, %start
%merge = phi i32 [ 100, %start ], [ 101, %bb5 ], [ 102, %bb7 ], [ %., %bb9 ]
ret i32 %merge
bb5: ; preds = %start
br label %bb2
bb7: ; preds = %start
br label %bb2
bb9: ; preds = %start
%cond2 = icmp eq i32 %1, 3
%. = select i1 %cond2, i32 1001, i32 1002
br label %bb2
bb41: ; preds = %start
unreachable
}
!0 = !{i32 0, i32 4}
Note that the icmp eq i32 %1, 3 in bb9 is actually unnecessary, because in the input, icmp eq i64 %_1, %_3 from the start block must be true.
With this fix, the output becomes:
define i32 @f1({ i32, i32 }* nocapture readonly %x, { i32, i32 }* nocapture readonly %y) local_unnamed_addr #0 {
start:
%0 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %x, i64 0, i32 0
%1 = load i32, i32* %0, align 4, !range !0
%2 = getelementptr inbounds { i32, i32 }, { i32, i32 }* %y, i64 0, i32 0
%3 = load i32, i32* %2, align 4, !range !0
%cond1 = icmp eq i32 %1, %3
tail call void @llvm.assume(i1 %cond1)
switch i32 %1, label %bb41 [
i32 0, label %bb2
i32 1, label %bb5
i32 2, label %bb7
i32 3, label %bb9
]
bb2: ; preds = %bb9, %bb7, %bb5, %start
%merge = phi i32 [ 100, %start ], [ 101, %bb5 ], [ 102, %bb7 ], [ 1001, %bb9 ]
ret i32 %merge
bb5: ; preds = %start
br label %bb2
bb7: ; preds = %start
br label %bb2
bb9: ; preds = %start
br label %bb2
bb41: ; preds = %start
unreachable
}
!0 = !{i32 0, i32 4}
The difference is also reflected in the output assembly.
llvm/lib/Transforms/InstCombine/InstructionCombining.cpp, lines 2952–2953:
Why do we need this?

llvm/lib/Transforms/InstCombine/InstructionCombining.cpp, lines 2952–2953:
This handles cases like this:
%zx = zext i32 %x to i64
switch i64 %zx, label %bb2 [
i64 0x7fff_ffff_ffff_ffff, label %bb1 ; exceeds i32 range
]
The 0x7fff_ffff_ffff_ffff has 1 leading zero, so LeadingKnownZeros will be 1, and NewWidth will be 64-1 = 63, which exceeds the range of i32.
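The width arithmetic in that comment can be checked directly. This is an illustrative Python sketch of the NewWidth computation being discussed, not the actual InstCombine code; the helper name is hypothetical.

```python
# Sketch of the width check: the switch shrink derives NewWidth from the
# known leading zeros of the i64 condition, and must not narrow to a type
# that cannot hold every case constant.
def new_width(bit_width: int, leading_known_zeros: int) -> int:
    # Same arithmetic as the review comment: BitWidth - LeadingKnownZeros.
    return bit_width - leading_known_zeros

case_value = 0x7FFF_FFFF_FFFF_FFFF          # i64 case constant from the example
leading_zeros = 64 - case_value.bit_length()  # 1 leading zero
# NewWidth = 64 - 1 = 63, which is wider than i32, so the fold must bail out.
print(new_width(64, leading_zeros))
```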
Aha, I see. I think the existing fold should be fixed instead,
because if it doesn't fire (because shouldChangeType() said so?),
how do we know we can ignore that hook here?
@lebedev.ri , which "hook" did you mean?
I think the existing fold is too aggressive. Please take a look at the example I gave in an earlier comment -- doing a less strict fold first gives GVN a chance to optimize the IR.
When the new fold is triggered, the existing more aggressive fold would be triggered (if NewWidth happens to be a standard integer type, e.g. i8, so that shouldChangeType() returns true) the next time InstCombine gets executed.
I think the existing fold is too aggressive.
Please take a look at the example I gave in an earlier comment -- doing a less strict fold first gives GVN a chance to optimize the IR.
Could you please post a godbolt example showing *how* GVN would optimize it? https://godbolt.org/z/cq5M5f
But that sounds like a GVN bug to me.
When the new fold is triggered, the existing more aggressive fold would be triggered (if NewWidth happens to be a standard integer type, e.g. i8, so that shouldChangeType() returns true) the next time InstCombine gets executed.
Ah yes, sorry, so the example would already be optimized, just to a different width.
https://godbolt.org/z/37jMeM
@lebedev.ri , here is what GVN does for the current output, and this is the new behavior. Note that the first one does
%cond2 = icmp eq i32 %1, 3
br i1 %cond2, label %bb10, label %bb11
while in the second one it's just
br i1 true, label %bb10, label %bb11
that sounds like a GVN bug to me
Probably, but I am not sure whether GVN or InstCombine (or both?) is supposed to know that if zext X == 3 then X == 3.
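The claim "if zext X == 3 then X == 3" can be sanity-checked with a small model: zero-extending an unsigned value does not change its numeric value, so an equality against a constant that fits in the narrow type can always be narrowed. The range bound below comes from the !range !{i32 0, i32 4} metadata in the example; the widths are illustrative.

```python
# Model zero-extension as the identity on the unsigned numeric value.
def zext(x: int, from_bits: int) -> int:
    assert 0 <= x < (1 << from_bits)
    return x  # zero-extension leaves the unsigned value unchanged

# For every value the !range metadata allows, the wide comparison
# (zext x == 3) agrees with the narrow comparison (x == 3).
for x in range(4):
    assert (zext(x, 32) == 3) == (x == 3)
print("narrowing the comparison is safe for all in-range values")
```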
Ah yes, sorry, so the example would already be optimized, just to a different width.
@lebedev.ri , I made another patch that changes only GVN -- please take a look at D93888. If it is approved, I will close this PR ("diff"?).