This is an archive of the discontinued LLVM Phabricator instance.

[X86] Use PSADBW for v8i8 addition reductions.
ClosedPublic

Authored by craig.topper on Aug 11 2019, 11:03 PM.

Download Raw Diff

Details

Reviewers

RKSimon
spatel

Commits

rG3e44d961705f: [X86] Use PSADBW for v8i8 addition reductions.
rL368864: [X86] Use PSADBW for v8i8 addition reductions.

Summary

Improves the 8 byte case from PR42674.

Diff Detail

Repository

rG LLVM Github Monorepo

Build Status

Buildable 36580
Build 36579: arc lint + arc unit

Event Timeline

craig.topper created this revision. · Aug 11 2019, 11:03 PM

Herald added a project: Restricted Project. · View Herald Transcript · Aug 11 2019, 11:03 PM

Herald added a subscriber: hiraditya. · View Herald Transcript

Harbormaster completed remote builds in B36580: Diff 214576. · Aug 11 2019, 11:03 PM

RKSimon added inline comments. · Aug 12 2019, 3:36 AM

llvm/lib/Target/X86/X86ISelLowering.cpp
Line 35451 — RKSimon: we can easily support 2i8/4i8 as well by replacing this with an insertion into a zero v16i8 vector

That doesn’t seem profitable for v2i8. We’d be better off extracting both elements and doing a scalar add. For v4i8, I’m not sure. PSADBW is 5 cycles on some CPUs, if I remember right; the normal expansion is probably faster on those CPUs.

OK, let's just go for v8i8

This revision is now accepted and ready to land. · Aug 14 2019, 7:38 AM

Closed by commit rL368864: [X86] Use PSADBW for v8i8 addition reductions. (authored by ctopper). · Explain Why · Aug 14 2019, 8:56 AM

This revision was automatically updated to reflect the committed changes.

Revision Contents

Path

Size

llvm/

lib/

Target/

X86/

X86ISelLowering.cpp

14 lines

test/

CodeGen/

X86/

vector-reduce-add.ll

36 lines

Diff 214576

llvm/lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 32,759 Lines • ▼ Show 20 Lines
	assert(isNullConstant(Index) &&			assert(isNullConstant(Index) &&
	"Reduction doesn't end in an extract from index 0");			"Reduction doesn't end in an extract from index 0");

	EVT VT = ExtElt->getValueType(0);			EVT VT = ExtElt->getValueType(0);
	EVT VecVT = Rdx.getValueType();			EVT VecVT = Rdx.getValueType();
	if (VecVT.getScalarType() != VT)			if (VecVT.getScalarType() != VT)
	return SDValue();			return SDValue();

				SDLoc DL(ExtElt);

				if (VecVT == MVT::v8i8) {
				// Pad with undef.
				Rdx = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v16i8, Rdx,
				DAG.getUNDEF(VecVT));
				[Inline comment — RKSimon (Unsubmitted, Not Done): we can easily support 2i8/4i8 as well by replacing this with an insertion into a zero v16i8 vector]
				Rdx = DAG.getNode(X86ISD::PSADBW, DL, MVT::v2i64, Rdx,
				DAG.getConstant(0, DL, MVT::v16i8));
				Rdx = DAG.getBitcast(MVT::v16i8, Rdx);
				return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Rdx, Index);
				}

	// Must be a >=128-bit vector with pow2 elements.			// Must be a >=128-bit vector with pow2 elements.
	if ((VecVT.getSizeInBits() % 128) != 0 ||			if ((VecVT.getSizeInBits() % 128) != 0 ||
	!isPowerOf2_32(VecVT.getVectorNumElements()))			!isPowerOf2_32(VecVT.getVectorNumElements()))
	return SDValue();			return SDValue();

	SDLoc DL(ExtElt);

	// vXi8 reduction - sum lo/hi halves then use PSADBW.			// vXi8 reduction - sum lo/hi halves then use PSADBW.
	if (VT == MVT::i8) {			if (VT == MVT::i8) {
	while (Rdx.getValueSizeInBits() > 128) {			while (Rdx.getValueSizeInBits() > 128) {
	unsigned HalfSize = VecVT.getSizeInBits() / 2;			unsigned HalfSize = VecVT.getSizeInBits() / 2;
	unsigned HalfElts = VecVT.getVectorNumElements() / 2;			unsigned HalfElts = VecVT.getVectorNumElements() / 2;
	SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);			SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
	SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);			SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
	Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);			Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
	▲ Show 20 Lines • Show All 9,436 Lines • Show Last 20 Lines

llvm/test/CodeGen/X86/vector-reduce-add.ll

	Show First 20 Lines • Show All 1,024 Lines • ▼ Show 20 Lines
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)			%1 = call i8 @llvm.experimental.vector.reduce.add.v4i8(<4 x i8> %a0)
	ret i8 %1			ret i8 %1
	}			}

	define i8 @test_v8i8(<8 x i8> %a0) {			define i8 @test_v8i8(<8 x i8> %a0) {
	; SSE2-LABEL: test_v8i8:			; SSE2-LABEL: test_v8i8:
	; SSE2: # %bb.0:			; SSE2: # %bb.0:
	; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE2-NEXT: pxor %xmm1, %xmm1
	; SSE2-NEXT: paddb %xmm0, %xmm1			; SSE2-NEXT: psadbw %xmm0, %xmm1
	; SSE2-NEXT: movdqa %xmm1, %xmm0
	; SSE2-NEXT: psrld $16, %xmm0
	; SSE2-NEXT: paddb %xmm1, %xmm0
	; SSE2-NEXT: movdqa %xmm0, %xmm1
	; SSE2-NEXT: psrlw $8, %xmm1
	; SSE2-NEXT: paddb %xmm0, %xmm1
	; SSE2-NEXT: movd %xmm1, %eax			; SSE2-NEXT: movd %xmm1, %eax
	; SSE2-NEXT: # kill: def $al killed $al killed $eax			; SSE2-NEXT: # kill: def $al killed $al killed $eax
	; SSE2-NEXT: retq			; SSE2-NEXT: retq
	;			;
	; SSE41-LABEL: test_v8i8:			; SSE41-LABEL: test_v8i8:
	; SSE41: # %bb.0:			; SSE41: # %bb.0:
	; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; SSE41-NEXT: pxor %xmm1, %xmm1
	; SSE41-NEXT: paddb %xmm0, %xmm1			; SSE41-NEXT: psadbw %xmm0, %xmm1
	; SSE41-NEXT: movdqa %xmm1, %xmm0
	; SSE41-NEXT: psrld $16, %xmm0
	; SSE41-NEXT: paddb %xmm1, %xmm0
	; SSE41-NEXT: movdqa %xmm0, %xmm1
	; SSE41-NEXT: psrlw $8, %xmm1
	; SSE41-NEXT: paddb %xmm0, %xmm1
	; SSE41-NEXT: pextrb $0, %xmm1, %eax			; SSE41-NEXT: pextrb $0, %xmm1, %eax
	; SSE41-NEXT: # kill: def $al killed $al killed $eax			; SSE41-NEXT: # kill: def $al killed $al killed $eax
	; SSE41-NEXT: retq			; SSE41-NEXT: retq
	;			;
	; AVX-LABEL: test_v8i8:			; AVX-LABEL: test_v8i8:
	; AVX: # %bb.0:			; AVX: # %bb.0:
	; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0			; AVX-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpsrld $16, %xmm0, %xmm1
	; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpsrlw $8, %xmm0, %xmm1
	; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX-NEXT: vpextrb $0, %xmm0, %eax			; AVX-NEXT: vpextrb $0, %xmm0, %eax
	; AVX-NEXT: # kill: def $al killed $al killed $eax			; AVX-NEXT: # kill: def $al killed $al killed $eax
	; AVX-NEXT: retq			; AVX-NEXT: retq
	;			;
	; AVX512-LABEL: test_v8i8:			; AVX512-LABEL: test_v8i8:
	; AVX512: # %bb.0:			; AVX512: # %bb.0:
	; AVX512-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,2,3]			; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0			; AVX512-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
	; AVX512-NEXT: vpsrld $16, %xmm0, %xmm1
	; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX512-NEXT: vpsrlw $8, %xmm0, %xmm1
	; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0
	; AVX512-NEXT: vpextrb $0, %xmm0, %eax			; AVX512-NEXT: vpextrb $0, %xmm0, %eax
	; AVX512-NEXT: # kill: def $al killed $al killed $eax			; AVX512-NEXT: # kill: def $al killed $al killed $eax
	; AVX512-NEXT: retq			; AVX512-NEXT: retq
	%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)			%1 = call i8 @llvm.experimental.vector.reduce.add.v8i8(<8 x i8> %a0)
	ret i8 %1			ret i8 %1
	}			}

	define i8 @test_v16i8(<16 x i8> %a0) {			define i8 @test_v16i8(<16 x i8> %a0) {
	▲ Show 20 Lines • Show All 300 Lines • Show Last 20 Lines