This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine] Improve (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) folding
ClosedPublic

Authored by RKSimon on Jul 21 2017, 5:42 AM.

Download Raw Diff

Details

Reviewers

craig.topper
andrew.zhogin
spatel
andreadb

Commits

rG5113b48798f9: [DAGCombine] Improve (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) folding
rL340010: [DAGCombine] Improve (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) folding

Summary

Add support for cases where only some c1+c2 results exceed the max bitshift, clamping accordingly.

Diff Detail

Repository: rL LLVM

Event Timeline

RKSimon created this revision. Jul 21 2017, 5:42 AM

Isn't this a slight regression for scalar types?

N1.getValueType() == N0.getOperand(1).getValueType()

Instead of

zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);

There is no vector analog of zeroExtendToMatch currently, right?

Btw, I don't think I have rights to accept the revision.

Resurrecting this old patch, refactored to correctly handle overflows of the sum of shifts.

spatel added inline comments. Aug 16 2018, 9:34 AM

test/CodeGen/X86/combine-sra.ll
133–134 ↗	(On Diff #160079)	To provide better coverage, could you add a test (or adjust this one) where both of the component shifts are below the bitwidth, but their sum exceeds the bitwidth?

RKSimon added inline comments. Aug 16 2018, 10:24 AM

test/CodeGen/X86/combine-sra.ll
133–134 ↗	(On Diff #160079)	See @combine_vec_ashr_ashr2 above

LGTM. I notice we don't fold the non-splat case of this pattern in IR. Do you think it's worth adding there too, or is the backend good enough?

This revision is now accepted and ready to land. Aug 16 2018, 12:28 PM

In D35722#1202997, @spatel wrote:

LGTM. I notice we don't fold the non-splat case of this pattern in IR. Do you think it's worth adding there too, or is the backend good enough?

Add it if you can, properly supporting vectors is always a good idea ;-)

Is there an equivalent logical shifts -> zero combine?

In D35722#1203007, @RKSimon wrote:

In D35722#1202997, @spatel wrote:

LGTM. I notice we don't fold the non-splat case of this pattern in IR. Do you think it's worth adding there too, or is the backend good enough?

Add it if you can, properly supporting vectors is always a good idea ;-)

Is there an equivalent logical shifts -> zero combine?

AFAIK, no. I think splats are well supported now in instcombine, but we haven't generalized many transforms for non-splats.

Closed by commit rL340010: [DAGCombine] Improve (sra (sra x, c1), c2) -> (sra x, (add c1, c2)) folding (authored by RKSimon). · Explain Why · Aug 17 2018, 3:53 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

trunk/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

33 lines

test/

CodeGen/

X86/

combine-sra.ll

13 lines

Diff 161203

llvm/trunk/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 6,585 Lines • ▼ Show 20 Lines	if (VT.isVector())
ExtVT, VT.getVectorNumElements());		ExtVT, VT.getVectorNumElements());
if ((!LegalOperations \|\|		if ((!LegalOperations \|\|
TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))		TLI.isOperationLegal(ISD::SIGN_EXTEND_INREG, ExtVT)))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,		return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
N0.getOperand(0), DAG.getValueType(ExtVT));		N0.getOperand(0), DAG.getValueType(ExtVT));
}		}

// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))		// fold (sra (sra x, c1), c2) -> (sra x, (add c1, c2))
		// clamp (add c1, c2) to max shift.
if (N0.getOpcode() == ISD::SRA) {		if (N0.getOpcode() == ISD::SRA) {
SDLoc DL(N);		SDLoc DL(N);
EVT ShiftVT = N1.getValueType();		EVT ShiftVT = N1.getValueType();
		EVT ShiftSVT = ShiftVT.getScalarType();
		SmallVector<SDValue, 16> ShiftValues;

auto MatchOutOfRange = [OpSizeInBits](ConstantSDNode *LHS,		auto SumOfShifts = [&](ConstantSDNode LHS, ConstantSDNode RHS) {
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).uge(OpSizeInBits);
};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchOutOfRange))
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0),
DAG.getConstant(OpSizeInBits - 1, DL, ShiftVT));

auto MatchInRange = [OpSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
APInt c1 = LHS->getAPIntValue();		APInt c1 = LHS->getAPIntValue();
APInt c2 = RHS->getAPIntValue();		APInt c2 = RHS->getAPIntValue();
zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);		zeroExtendToMatch(c1, c2, 1 /* Overflow Bit */);
return (c1 + c2).ult(OpSizeInBits);		APInt Sum = c1 + c2;
		unsigned ShiftSum =
		Sum.uge(OpSizeInBits) ? (OpSizeInBits - 1) : Sum.getZExtValue();
		ShiftValues.push_back(DAG.getConstant(ShiftSum, DL, ShiftSVT));
		return true;
};		};
if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), MatchInRange)) {		if (ISD::matchBinaryPredicate(N1, N0.getOperand(1), SumOfShifts)) {
SDValue Sum = DAG.getNode(ISD::ADD, DL, ShiftVT, N1, N0.getOperand(1));		SDValue ShiftValue;
return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), Sum);		if (VT.isVector())
		ShiftValue = DAG.getBuildVector(ShiftVT, DL, ShiftValues);
		else
		ShiftValue = ShiftValues[0];
		return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), ShiftValue);
}		}
}		}

// fold (sra (shl X, m), (sub result_size, n))		// fold (sra (shl X, m), (sub result_size, n))
// -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for		// -> (sign_extend (trunc (shl X, (sub (sub result_size, n), m)))) for
// result_size - n != m.		// result_size - n != m.
// If truncate is free for the target sext(shl) is likely to result in better		// If truncate is free for the target sext(shl) is likely to result in better
// code.		// code.
▲ Show 20 Lines • Show All 12,057 Lines • Show Last 20 Lines

llvm/trunk/test/CodeGen/X86/combine-sra.ll

	Show First 20 Lines • Show All 114 Lines • ▼ Show 20 Lines
	}			}

	define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {			define <4 x i32> @combine_vec_ashr_ashr3(<4 x i32> %x) {
	; SSE-LABEL: combine_vec_ashr_ashr3:			; SSE-LABEL: combine_vec_ashr_ashr3:
	; SSE: # %bb.0:			; SSE: # %bb.0:
	; SSE-NEXT: movdqa %xmm0, %xmm1			; SSE-NEXT: movdqa %xmm0, %xmm1
	; SSE-NEXT: psrad $27, %xmm1			; SSE-NEXT: psrad $27, %xmm1
	; SSE-NEXT: movdqa %xmm0, %xmm2			; SSE-NEXT: movdqa %xmm0, %xmm2
	; SSE-NEXT: psrad $5, %xmm2			; SSE-NEXT: psrad $15, %xmm2
	; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]			; SSE-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm1[4,5,6,7]
	; SSE-NEXT: movdqa %xmm0, %xmm1
	; SSE-NEXT: psrad $31, %xmm1
	; SSE-NEXT: psrad $1, %xmm0
	; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7]
	; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
	; SSE-NEXT: movdqa %xmm0, %xmm1
	; SSE-NEXT: psrad $10, %xmm1
	; SSE-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm0[4,5,6,7]
	; SSE-NEXT: psrad $31, %xmm0			; SSE-NEXT: psrad $31, %xmm0
	; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3],xmm0[4,5],xmm1[6,7]			; SSE-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
	; SSE-NEXT: retq			; SSE-NEXT: retq
	;			;
	; AVX-LABEL: combine_vec_ashr_ashr3:			; AVX-LABEL: combine_vec_ashr_ashr3:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0			; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: vpsravd {{.*}}(%rip), %xmm0, %xmm0
	; AVX-NEXT: retq			; AVX-NEXT: retq
	%1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>			%1 = ashr <4 x i32> %x, <i32 1, i32 5, i32 50, i32 27>
	%2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>			%2 = ashr <4 x i32> %1, <i32 33, i32 10, i32 33, i32 0>
	ret <4 x i32> %2			ret <4 x i32> %2
	}			}

	; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).			; fold (sra x, (trunc (and y, c))) -> (sra x, (and (trunc y), (trunc c))).
	define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {			define <4 x i32> @combine_vec_ashr_trunc_and(<4 x i32> %x, <4 x i64> %y) {
	▲ Show 20 Lines • Show All 168 Lines • Show Last 20 Lines