This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AMDGPU/
-
Target/
-
AMDGPU/
2/2
AMDGPUISelLowering.cpp
-
test/CodeGen/AMDGPU/
-
CodeGen/
-
AMDGPU/
-
partial-shift-shrink.ll

Differential D136059

[AMDGPU][DAG] Fix trunc/shift combine condition
ClosedPublic

Authored by Pierre-vh on Oct 17 2022, 12:56 AM.

Download Raw Diff

Details

Reviewers

arsenm
foad

Commits

rG824dd811be42: [AMDGPU][DAG] Fix trunc/shift combine condition

Summary

The condition needs to be different for right-shifts, else we may lose information in some cases.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

Pierre-vh created this revision.Oct 17 2022, 12:56 AM

Herald added a project: Restricted Project. · View Herald TranscriptOct 17 2022, 12:56 AM

Herald added subscribers: kosarev, foad, kerbowa and 7 others. · View Herald Transcript

Pierre-vh requested review of this revision.Oct 17 2022, 12:56 AM

Herald added a project: Restricted Project. · View Herald TranscriptOct 17 2022, 12:56 AM

Herald added subscribers: llvm-commits, wdng. · View Herald Transcript

Harbormaster completed remote builds in B192441: Diff 468128.Oct 17 2022, 1:42 AM

foad added inline comments.Oct 17 2022, 2:43 AM

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
3248–3255	The whole `if` condition can be replaced with: `Known.getMaxValue().ule(MaxCstSize)`
3253–3254	For left shifts you can do it so long as the new shift amount is still valid, so ShiftAmt < 32 (aka <= 31). So this can be: `... ? 31 : 32 - Size`

Comments

Update comments, remove dead variable

Harbormaster completed remote builds in B192916: Diff 468791.Oct 19 2022, 12:22 AM

LGTM, thanks!

This revision is now accepted and ready to land.Oct 19 2022, 1:27 AM

Can you check the equivalent combine in globalisel? I think it's just handling the left shifts but should be generalized

In D136059#3868436, @arsenm wrote:

Can you check the equivalent combine in globalisel? I think it's just handling the left shifts but should be generalized

Sure, I'll check it in a different patch. Can this one land in the meantime?

In D136059#3868436, @arsenm wrote:

Can you check the equivalent combine in globalisel? I think it's just handling the left shifts but should be generalized

I took a look at it and it seems like it's a different combine. This one explicitly tries to reduce 64-bit shifts to 32-bit shifts, but matchCombineTruncOfShl/applyCombineTruncOfShl simply move the trunc into the operand of the shift:

// Fold trunc (shl x, K) -> shl (trunc x), K 
//    => K < VT.getScalarSizeInBits()

I don't think this can work on right shifts unless we add another trunc in front of the shl again.

Maybe another combine (AMDGPU-specific, since I think it only benefits us?) would be better for reducing 64-bit right shifts? I wrote it in D136319.

In D136059#3870167, @Pierre-vh wrote:

In D136059#3868436, @arsenm wrote:

Can you check the equivalent combine in globalisel? I think it's just handling the left shifts but should be generalized

Sure, I'll check it in a different patch. Can this one land in the meantime?

Yes, not relevant to this one

Closed by commit rG824dd811be42: [AMDGPU][DAG] Fix trunc/shift combine condition (authored by Pierre-vh). · Explain WhyOct 20 2022, 11:36 PM

This revision was automatically updated to reflect the committed changes.

Pierre-vh added a commit: rG824dd811be42: [AMDGPU][DAG] Fix trunc/shift combine condition.

Revision Contents

Path

Size

llvm/

lib/

Target/

AMDGPU/

AMDGPUISelLowering.cpp

11 lines

test/

CodeGen/

AMDGPU/

partial-shift-shrink.ll

20 lines

Diff 468791

llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp

Show First 20 Lines • Show All 3,239 Lines • ▼ Show 20 Lines	SDValue AMDGPUTargetLowering::performTruncateCombine(
if (VT.getScalarSizeInBits() < 32) {		if (VT.getScalarSizeInBits() < 32) {
EVT SrcVT = Src.getValueType();		EVT SrcVT = Src.getValueType();
if (SrcVT.getScalarSizeInBits() > 32 &&		if (SrcVT.getScalarSizeInBits() > 32 &&
(Src.getOpcode() == ISD::SRL \|\|		(Src.getOpcode() == ISD::SRL \|\|
Src.getOpcode() == ISD::SRA \|\|		Src.getOpcode() == ISD::SRA \|\|
Src.getOpcode() == ISD::SHL)) {		Src.getOpcode() == ISD::SHL)) {
SDValue Amt = Src.getOperand(1);		SDValue Amt = Src.getOperand(1);
KnownBits Known = DAG.computeKnownBits(Amt);		KnownBits Known = DAG.computeKnownBits(Amt);
unsigned Size = VT.getScalarSizeInBits();
if ((Known.isConstant() && Known.getConstant().ule(Size)) \|\|		// - For left shifts, do the transform as long as the shift
(Known.countMaxActiveBits() <= Log2_32(Size))) {		// amount is still legal for i32, so when ShiftAmt < 32 (<= 31)
		// - For right shift, do it if ShiftAmt <= (32 - Size) to avoid
		// losing information stored in the high bits when truncating.
		const unsigned MaxCstSize =
		(Src.getOpcode() == ISD::SHL) ? 31 : (32 - VT.getScalarSizeInBits());
		foadUnsubmitted Done Reply Inline Actions For left shifts you can do it so long as the new shift amount is still valid, so ShiftAmt < 32 (aka <= 31). So this can be: `... ? 31 : 32 - Size` foad: For left shifts you can do it so long as the new shift amount is still valid, so ShiftAmt < 32…
		if (Known.getMaxValue().ule(MaxCstSize)) {
		foadUnsubmitted Done Reply Inline Actions The whole `if` condition can be replaced with: `Known.getMaxValue().ule(MaxCstSize)` foad: The whole `if` condition can be replaced with: `Known.getMaxValue().ule(MaxCstSize)`
EVT MidVT = VT.isVector() ?		EVT MidVT = VT.isVector() ?
EVT::getVectorVT(*DAG.getContext(), MVT::i32,		EVT::getVectorVT(*DAG.getContext(), MVT::i32,
VT.getVectorNumElements()) : MVT::i32;		VT.getVectorNumElements()) : MVT::i32;

EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());		EVT NewShiftVT = getShiftAmountTy(MidVT, DAG.getDataLayout());
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,		SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MidVT,
Src.getOperand(0));		Src.getOperand(0));
DCI.AddToWorklist(Trunc.getNode());		DCI.AddToWorklist(Trunc.getNode());
▲ Show 20 Lines • Show All 1,578 Lines • Show Last 20 Lines

llvm/test/CodeGen/AMDGPU/partial-shift-shrink.ll

Show First 20 Lines • Show All 122 Lines • ▼ Show 20 Lines	; GCN-NEXT: s_setpc_b64 s[30:31]
%trunc = trunc i64 %shift to i16		%trunc = trunc i64 %shift to i16
ret i16 %trunc		ret i16 %trunc
}		}

define i16 @trunc_srl_i64_var_mask16_to_i16(i64 %x, i64 %amt) {		define i16 @trunc_srl_i64_var_mask16_to_i16(i64 %x, i64 %amt) {
; GCN-LABEL: trunc_srl_i64_var_mask16_to_i16:		; GCN-LABEL: trunc_srl_i64_var_mask16_to_i16:
; GCN: ; %bb.0:		; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, 16, v2		; GCN-NEXT: v_and_b32_e32 v1, 16, v2
; GCN-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1]		; GCN-NEXT: v_lshrrev_b32_e32 v0, v1, v0
; GCN-NEXT: s_setpc_b64 s[30:31]		; GCN-NEXT: s_setpc_b64 s[30:31]
%amt.masked = and i64 %amt, 16		%amt.masked = and i64 %amt, 16
%shift = lshr i64 %x, %amt.masked		%shift = lshr i64 %x, %amt.masked
%trunc = trunc i64 %shift to i16		%trunc = trunc i64 %shift to i16
ret i16 %trunc		ret i16 %trunc
}		}

define i16 @trunc_srl_i64_var_mask31_to_i16(i64 %x, i64 %amt) {		define i16 @trunc_srl_i64_var_mask31_to_i16(i64 %x, i64 %amt) {
; GCN-LABEL: trunc_srl_i64_var_mask31_to_i16:		; GCN-LABEL: trunc_srl_i64_var_mask31_to_i16:
; GCN: ; %bb.0:		; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)		; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_and_b32_e32 v2, 31, v2		; GCN-NEXT: v_and_b32_e32 v2, 31, v2
; GCN-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1]		; GCN-NEXT: v_lshrrev_b64 v[0:1], v2, v[0:1]
; GCN-NEXT: s_setpc_b64 s[30:31]		; GCN-NEXT: s_setpc_b64 s[30:31]
%amt.masked = and i64 %amt, 31		%amt.masked = and i64 %amt, 31
%shift = lshr i64 %x, %amt.masked		%shift = lshr i64 %x, %amt.masked
%trunc = trunc i64 %shift to i16		%trunc = trunc i64 %shift to i16
ret i16 %trunc		ret i16 %trunc
}		}

		define i32 @trunc_srl_i64_25_to_i26(i64 %x) {
		; GCN-LABEL: trunc_srl_i64_25_to_i26:
		; GCN: ; %bb.0:
		; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
		; GCN-NEXT: v_and_b32_e32 v0, 0xa000000, v0
		; GCN-NEXT: v_alignbit_b32 v0, 0, v0, 25
		; GCN-NEXT: v_add_u32_e32 v0, 55, v0
		; GCN-NEXT: s_setpc_b64 s[30:31]
		%value.knownbits2 = and i64 %x, 167772160 ; 0xA000000
		%shift = lshr i64 %value.knownbits2, 25
		%trunc = trunc i64 %shift to i26
		%add = add i26 %trunc, 55
		%ext = zext i26 %add to i32
		ret i32 %ext
		}