This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X, undef/constant), (concat Y, undef/constant) -> concat (VBinOp X, Y), VecC
ClosedPublic

Authored by craig.topper on Aug 23 2019, 1:30 PM.

Download Raw Diff

Details

Reviewers

spatel
RKSimon

Commits

rG846429de7410: [DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X…
rL369937: [DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X…

Summary

This improves the combine I included in D66504 to handle constants in the upper operands of the concat. If we can constant fold them away we can pull the concat after the bin op. This helps with chains of madd reductions on X86 from loop unrolling. The loop madd reduction pattern creates pmaddwd with half the width of the add that follows it using zeroes to fill the upper bits. If we have two of these added together we can pull the zeroes through the accumulating add and then shrink it.

Diff Detail

Repository

rG LLVM Github Monorepo

Build Status

Buildable 37254
Build 37253: arc lint + arc unit

Event Timeline

craig.topper created this revision. Aug 23 2019, 1:30 PM

Herald added a project: Restricted Project. · View Herald TranscriptAug 23 2019, 1:30 PM

Herald added subscribers: dmgreen, hiraditya. · View Herald Transcript

craig.topper marked an inline comment as done. Aug 23 2019, 1:32 PM

craig.topper added inline comments.

llvm/test/CodeGen/X86/madd.ll
2761	We were just barely able to prove the bits were all 0 and therefore disjoint, but weren't able to remove the ADD/OR completely because the disjoint bits code gets to go one level deeper in computeKnownBits than SimplifyDemandedBits. This is because the disjoint check starts at a depth of 0 for the operands, but SimplifyDemandedBits starts at a depth of 0 for the OR node itself.

Rebase after D66504.

Harbormaster completed remote builds in B37254: Diff 217071.Aug 25 2019, 9:01 PM

LGTM - cheers

This revision is now accepted and ready to land. Aug 26 2019, 7:13 AM

Closed by commit rL369937: [DAGCombiner][X86] Teach SimplifyVBinOp to fold VBinOp (concat X… (authored by ctopper). · Explain WhyAug 26 2019, 11:08 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

36 lines

test/

CodeGen/

X86/

madd.ll

67 lines

Diff 217071

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 19,531 Lines • ▼ Show 20 Lines	if (NarrowVT == Y.getValueType() &&
SDLoc DL(N);		SDLoc DL(N);
SDValue VecC =		SDValue VecC =
DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));		DAG.getNode(Opcode, DL, VT, DAG.getUNDEF(VT), DAG.getUNDEF(VT));
SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);		SDValue NarrowBO = DAG.getNode(Opcode, DL, NarrowVT, X, Y);
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);		return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, VecC, NarrowBO, Z);
}		}
}		}

// Make sure all but the first op are undef.		// Make sure all but the first op are undef or constant.
auto ConcatWithUndef = [](SDValue Concat) {		auto ConcatWithConstantOrUndef = [](SDValue Concat) {
return Concat.getOpcode() == ISD::CONCAT_VECTORS &&		return Concat.getOpcode() == ISD::CONCAT_VECTORS &&
std::all_of(std::next(Concat->op_begin()), Concat->op_end(),		std::all_of(std::next(Concat->op_begin()), Concat->op_end(),
[](const SDValue &Op) {		[](const SDValue &Op) {
return Op.isUndef();		return Op.isUndef() \|\|
		ISD::isBuildVectorOfConstantSDNodes(Op.getNode());
});		});
};		};

// The following pattern is likely to emerge with vector reduction ops. Moving		// The following pattern is likely to emerge with vector reduction ops. Moving
// the binary operation ahead of the concat may allow using a narrower vector		// the binary operation ahead of the concat may allow using a narrower vector
// instruction that has better performance than the wide version of the op:		// instruction that has better performance than the wide version of the op:
// VBinOp (concat X, undef), (concat Y, undef) --> concat (VBinOp X, Y), VecC		// VBinOp (concat X, undef/constant), (concat Y, undef/constant) -->
if (ConcatWithUndef(LHS) && ConcatWithUndef(RHS) &&		// concat (VBinOp X, Y), VecC
		if (ConcatWithConstantOrUndef(LHS) && ConcatWithConstantOrUndef(RHS) &&
(LHS.hasOneUse() \|\| RHS.hasOneUse())) {		(LHS.hasOneUse() \|\| RHS.hasOneUse())) {
SDValue X = LHS.getOperand(0);		EVT NarrowVT = LHS.getOperand(0).getValueType();
SDValue Y = RHS.getOperand(0);		if (NarrowVT == RHS.getOperand(0).getValueType() &&
EVT NarrowVT = X.getValueType();
if (NarrowVT == Y.getValueType() &&
TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {		TLI.isOperationLegalOrCustomOrPromote(Opcode, NarrowVT)) {
// (binop undef, undef) may not return undef, so compute that result.
SDLoc DL(N);		SDLoc DL(N);
SDValue VecC =		unsigned NumOperands = LHS.getNumOperands();
DAG.getNode(Opcode, DL, NarrowVT, DAG.getUNDEF(NarrowVT),		SmallVector<SDValue, 4> Ops;
DAG.getUNDEF(NarrowVT));		for (unsigned i = 0; i != NumOperands; ++i) {
SmallVector<SDValue, 4> Ops(LHS.getNumOperands(), VecC);		// This constant fold for operands 1 and up.
Ops[0] = DAG.getNode(Opcode, DL, NarrowVT, X, Y);		Ops.push_back(DAG.getNode(Opcode, DL, NarrowVT, LHS.getOperand(i),
		RHS.getOperand(i)));
		}

return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);		return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
}		}
}		}

if (SDValue V = scalarizeBinOpOfSplats(N, DAG))		if (SDValue V = scalarizeBinOpOfSplats(N, DAG))
return V;		return V;

return SDValue();		return SDValue();
▲ Show 20 Lines • Show All 1,174 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/madd.ll

	Show First 20 Lines • Show All 2,714 Lines • ▼ Show 20 Lines
	; SSE2-NEXT: paddd %xmm1, %xmm2			; SSE2-NEXT: paddd %xmm1, %xmm2
	; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]			; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,0,1]
	; SSE2-NEXT: paddd %xmm2, %xmm0			; SSE2-NEXT: paddd %xmm2, %xmm0
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; SSE2-NEXT: paddd %xmm0, %xmm1			; SSE2-NEXT: paddd %xmm0, %xmm1
	; SSE2-NEXT: movd %xmm1, %eax			; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; AVX1-LABEL: madd_quad_reduction:			; AVX-LABEL: madd_quad_reduction:
	; AVX1: # %bb.0:			; AVX: # %bb.0:
	; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10			; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
	; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax			; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; AVX1-NEXT: vmovdqu (%rdi), %xmm0			; AVX-NEXT: vmovdqu (%rdi), %xmm0
	; AVX1-NEXT: vmovdqu (%rdx), %xmm1			; AVX-NEXT: vmovdqu (%rdx), %xmm1
	; AVX1-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1			; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
	; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0			; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
	; AVX1-NEXT: vmovdqu (%r8), %xmm2			; AVX-NEXT: vmovdqu (%r8), %xmm2
	; AVX1-NEXT: vpmaddwd (%r9), %xmm2, %xmm2			; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
	; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1			; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
	; AVX1-NEXT: vmovdqu (%rax), %xmm2			; AVX-NEXT: vmovdqu (%rax), %xmm2
	; AVX1-NEXT: vpmaddwd (%r10), %xmm2, %xmm2			; AVX-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
	; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1			; AVX-NEXT: vpaddd %xmm2, %xmm1, %xmm1
	; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]			; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX1-NEXT: vmovd %xmm0, %eax			; AVX-NEXT: vmovd %xmm0, %eax
	; AVX1-NEXT: retq			; AVX-NEXT: retq
	;
	; AVX256-LABEL: madd_quad_reduction:
	; AVX256: # %bb.0:
	; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %r10
	; AVX256-NEXT: movq {{[0-9]+}}(%rsp), %rax
	; AVX256-NEXT: vmovdqu (%rdi), %xmm0
	; AVX256-NEXT: vmovdqu (%rdx), %xmm1
	; AVX256-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
	; AVX256-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
	; AVX256-NEXT: vmovdqu (%r8), %xmm2
	; AVX256-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
	; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1
	; AVX256-NEXT: vmovdqu (%rax), %xmm2
	; AVX256-NEXT: vpmaddwd (%r10), %xmm2, %xmm2
	; AVX256-NEXT: vpaddd %ymm2, %ymm1, %ymm1
	; AVX256-NEXT: vpaddd %ymm1, %ymm0, %ymm0
	; AVX256-NEXT: vextracti128 $1, %ymm0, %xmm1
	; AVX256-NEXT: vpor %xmm1, %xmm0, %xmm0
	craig.topperAuthorUnsubmitted Done Reply Inline Actions We were just barely able to prove the bits were all 0 and therefore disjoint, but weren't able to remove the ADD/OR completely because the disjoint bits code gets to go one level deeper in computeKnownBits than SimplifyDemandedBits. This is because the disjoint check starts at a depth of 0 for the operands, but SimplifyDemandedBits starts at a depth of 0 for the OR node itself. craig.topper: We were just barely able to prove the bits were all 0 and therefore disjoint, but weren't able…
	; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,0,1]
	; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX256-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]
	; AVX256-NEXT: vpaddd %xmm1, %xmm0, %xmm0
	; AVX256-NEXT: vmovd %xmm0, %eax
	; AVX256-NEXT: vzeroupper
	; AVX256-NEXT: retq
	%tmp = load <8 x i16>, <8 x i16>* %arg, align 1			%tmp = load <8 x i16>, <8 x i16>* %arg, align 1
	%tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1			%tmp6 = load <8 x i16>, <8 x i16>* %arg1, align 1
	%tmp7 = sext <8 x i16> %tmp to <8 x i32>			%tmp7 = sext <8 x i16> %tmp to <8 x i32>
	%tmp17 = sext <8 x i16> %tmp6 to <8 x i32>			%tmp17 = sext <8 x i16> %tmp6 to <8 x i32>
	%tmp19 = mul nsw <8 x i32> %tmp7, %tmp17			%tmp19 = mul nsw <8 x i32> %tmp7, %tmp17
	%tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1			%tmp20 = load <8 x i16>, <8 x i16>* %arg2, align 1
	%tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1			%tmp21 = load <8 x i16>, <8 x i16>* %arg3, align 1
	%tmp22 = sext <8 x i16> %tmp20 to <8 x i32>			%tmp22 = sext <8 x i16> %tmp20 to <8 x i32>
	▲ Show 20 Lines • Show All 159 Lines • Show Last 20 Lines