This is an archive of the discontinued LLVM Phabricator instance.

generate extract_subvector node to avoid disastrous shuffle vector codegen
AbandonedPublic

Authored by spatel on Dec 11 2014, 11:27 AM.

Download Raw Diff

Details

Reviewers

chandlerc
andreadb
mkuper
hfinkel

Summary

This is a partial fix for PR21711 ( http://llvm.org/bugs/show_bug.cgi?id=21711 ). When we extract multiple consecutive elements from a vector to create a build_vector, we should try to form an extract_subvector instead of relying solely on getVectorShuffle().

The difference in output for the simplest v4f64 test case looks like this:

vextractf128	$1, %ymm0, %xmm0
vpermilpd	$1, %xmm0, %xmm1 ## xmm1 = xmm0[1,0]
vunpcklpd	%xmm1, %xmm0, %xmm0 ## xmm0 = xmm0[0],xmm1[0]
vmovapd	%xmm0, (%rdi)
vzeroupper
retq

Becomes:

vextractf128	$1, %ymm0, (%rdi)
vzeroupper
retq

We should still fix the shuffle problem in the x86 backend, but I thought it was best to solve the higher-level problem first. There's also a bug in the x86 backend dealing with arbitrary indexing and lowering the EXTRACT_SUBVECTOR node, so I've limited this patch to firing on the (most common?) case of half-vector extractions. This pattern emerges in particular on SandyBridge because it cracks 32-byte memops in half causing mismatches in vector sizes.

Diff Detail

Event Timeline

spatel updated this revision to Diff 17183.Dec 11 2014, 11:27 AM

spatel retitled this revision from to generate extract_subvector node to avoid disastrous shuffle vector codegen.

spatel updated this object.

spatel edited the test plan for this revision. (Show Details)

spatel added reviewers: mkuper, chandlerc, andreadb, hfinkel.

spatel added a subscriber: Unknown Object (MLST).

I looked at PR15872 yesterday, and ended up with something very similar, although more general.
Too bad I didn't really look at this review before starting. :-\

Then again, maybe I ended up too general, although I don't think I'm running into the miscompile.
Will add you to the review for that in a few minutes, but I'm fine with either version, or some merge of them, as long as it solves both PRs (and I'd still like someone more experienced with this to give a LGTM :-) ).

spatel mentioned this in D6678: Generate better code for shuffles.Dec 16 2014, 10:42 AM

Abandoning patch in favor of D6678.

Revision Contents

Path

Size

lib/

CodeGen/

SelectionDAG/

DAGCombiner.cpp

50 lines

test/

CodeGen/

X86/

vec_extract-avx.ll

81 lines

Diff 17183

lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

	Show First 20 Lines • Show All 9,991 Lines • ▼ Show 20 Lines
	continue;			continue;
	}			}

	// If extracting from the first vector, just use the index directly.			// If extracting from the first vector, just use the index directly.
	SDValue Extract = N->getOperand(i);			SDValue Extract = N->getOperand(i);
	SDValue ExtVal = Extract.getOperand(1);			SDValue ExtVal = Extract.getOperand(1);
	unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();			unsigned ExtIndex = cast<ConstantSDNode>(ExtVal)->getZExtValue();
	if (Extract.getOperand(0) == VecIn1) {			if (Extract.getOperand(0) == VecIn1) {
	if (ExtIndex > VT.getVectorNumElements())
	return SDValue();

	Mask.push_back(ExtIndex);			Mask.push_back(ExtIndex);
	continue;			continue;
	}			}

	// Otherwise, use InIdx + VecSize			// Otherwise, use InIdx + VecSize
	Mask.push_back(NumInScalars+ExtIndex);			Mask.push_back(NumInScalars+ExtIndex);
	}			}

	// Avoid introducing illegal shuffles with zero.			// Avoid introducing illegal shuffles with zero.
	if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))			if (UsesZeroVector && !TLI.isVectorClearMaskLegal(Mask, VT))
	return SDValue();			return SDValue();

	// We can't generate a shuffle node with mismatched input and output types.			// We can't generate a shuffle node with mismatched input and output types.
	// Attempt to transform a single input vector to the correct type.			// Attempt to transform a single input vector to the correct type.
	if ((VT != VecIn1.getValueType())) {			EVT VecIn1VT = VecIn1.getValueType();
	// We don't support shuffeling between TWO values of different types.			if ((VT != VecIn1VT)) {
				// We don't support shuffling between TWO values of different types.
	if (VecIn2.getNode())			if (VecIn2.getNode())
	return SDValue();			return SDValue();

				// If the input vector type has a different base type to the output
				// vector type, bail out.
				if (VecIn1VT.getVectorElementType() != VT.getVectorElementType())
				return SDValue();

				// See if this is a vector extraction from a larger vector.
				// Ignore single element build vectors (just 1 mask element) because
				// that's better handled as a scalar, not a vector.

				// TODO: We should be able to allow an arbitrarily larger source vector to
				// be extracted into a smaller vector, but this may cause silently wrong
				// codegen in the x86 backend at least. For now, limit the transform to
				// a simple upper/lower half-size extraction.
				if (VecIn1VT.getSizeInBits() == (VT.getSizeInBits() * 2) &&
				Mask.size() > 1) {
				int StartIdx = Mask[0];
				bool IsExtract = true;
				// The mask must specify consecutive elements from the source vector.
				for (int i = 0, e = Mask.size(); i < e; i++) {
				if (Mask[i] != (i + StartIdx)) {
				IsExtract = false;
				break;
				}
				}
				if (IsExtract)
				// TODO: See comment above; we should be able to remove this check.
				if (StartIdx == 0 \|\| StartIdx == (signed) VT.getVectorNumElements()) {
				SDValue VecIdx = DAG.getIntPtrConstant(StartIdx);
				return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, VecIn1, VecIdx);
				}
				}

	// We only support widening of vectors which are half the size of the			// We only support widening of vectors which are half the size of the
	// output registers. For example XMM->YMM widening on X86 with AVX.			// output registers. For example XMM->YMM widening on X86 with AVX.
	if (VecIn1.getValueType().getSizeInBits()*2 != VT.getSizeInBits())			if (VecIn1VT.getSizeInBits() * 2 != VT.getSizeInBits())
	return SDValue();

	// If the input vector type has a different base type to the output
	// vector type, bail out.
	if (VecIn1.getValueType().getVectorElementType() !=
	VT.getVectorElementType())
	return SDValue();			return SDValue();

	// Widen the input vector by adding undef values.			// Widen the input vector by adding undef values.
	VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,			VecIn1 = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
	VecIn1, DAG.getUNDEF(VecIn1.getValueType()));			VecIn1, DAG.getUNDEF(VecIn1VT));
	}			}

	if (UsesZeroVector)			if (UsesZeroVector)
	VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) :			VecIn2 = VT.isInteger() ? DAG.getConstant(0, VT) :
	DAG.getConstantFP(0.0, VT);			DAG.getConstantFP(0.0, VT);
	else			else
	// If VecIn2 is unused then change it to undef.			// If VecIn2 is unused then change it to undef.
	VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);			VecIn2 = VecIn2.getNode() ? VecIn2 : DAG.getUNDEF(VT);
	▲ Show 20 Lines • Show All 1,720 Lines • Show Last 20 Lines

test/CodeGen/X86/vec_extract-avx.ll

				; RUN: llc < %s -march=x86-64 -mattr=+avx | FileCheck %s

				; When extracting multiple consecutive elements from a larger
				; vector into a smaller one, do it efficiently. We should use
				; an EXTRACT_SUBVECTOR node internally rather than a bunch of
				; single element extractions.

				; Extracting the low elements only requires using the right kind of store.
				;
				; Reads elements 0..3 (the lower half) of the <8 x float> argument one at a
				; time, reinserts them in the same order into a <4 x float>, and stores the
				; result through %ptr with 16-byte alignment.
				;
				; The FileCheck directives after the return expect a vmovaps immediately
				; followed by vzeroupper and retq, i.e. no other instruction may appear
				; between the store and the function epilogue.
				define void @low_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
				%ext0 = extractelement <8 x float> %v, i32 0
				%ext1 = extractelement <8 x float> %v, i32 1
				%ext2 = extractelement <8 x float> %v, i32 2
				%ext3 = extractelement <8 x float> %v, i32 3
				%ins0 = insertelement <4 x float> undef, float %ext0, i32 0
				%ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
				%ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
				%ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
				store <4 x float> %ins3, <4 x float>* %ptr, align 16
				ret void

				; CHECK-LABEL: low_v8f32_to_v4f32
				; CHECK: vmovaps
				; CHECK-NEXT: vzeroupper
				; CHECK-NEXT: retq
				}

				; Extracting the high elements requires just one AVX instruction.
				;
				; Reads elements 4..7 (the upper half) of the <8 x float> argument one at a
				; time, reinserts them in the same order into a <4 x float>, and stores the
				; result through %ptr with 16-byte alignment.
				;
				; The FileCheck directives after the return expect a vextractf128
				; immediately followed by vzeroupper and retq, i.e. no separate shuffle or
				; store instruction may follow the extraction.
				define void @high_v8f32_to_v4f32(<8 x float> %v, <4 x float>* %ptr) {
				%ext0 = extractelement <8 x float> %v, i32 4
				%ext1 = extractelement <8 x float> %v, i32 5
				%ext2 = extractelement <8 x float> %v, i32 6
				%ext3 = extractelement <8 x float> %v, i32 7
				%ins0 = insertelement <4 x float> undef, float %ext0, i32 0
				%ins1 = insertelement <4 x float> %ins0, float %ext1, i32 1
				%ins2 = insertelement <4 x float> %ins1, float %ext2, i32 2
				%ins3 = insertelement <4 x float> %ins2, float %ext3, i32 3
				store <4 x float> %ins3, <4 x float>* %ptr, align 16
				ret void

				; CHECK-LABEL: high_v8f32_to_v4f32
				; CHECK: vextractf128
				; CHECK-NEXT: vzeroupper
				; CHECK-NEXT: retq
				}

				; Make sure element type doesn't alter the codegen. Note that
				; if we were actually using the vector in this function and
				; have AVX2, we should generate vextracti128 (the int version).
				;
				; Integer counterpart of the upper-half float test: reads elements 4..7 of
				; the <8 x i32> argument, reinserts them in order into a <4 x i32>, and
				; stores the result through %ptr with 16-byte alignment.
				;
				; The FileCheck directives after the return expect the floating-point form,
				; vextractf128, immediately followed by vzeroupper and retq.
				define void @high_v8i32_to_v4i32(<8 x i32> %v, <4 x i32>* %ptr) {
				%ext0 = extractelement <8 x i32> %v, i32 4
				%ext1 = extractelement <8 x i32> %v, i32 5
				%ext2 = extractelement <8 x i32> %v, i32 6
				%ext3 = extractelement <8 x i32> %v, i32 7
				%ins0 = insertelement <4 x i32> undef, i32 %ext0, i32 0
				%ins1 = insertelement <4 x i32> %ins0, i32 %ext1, i32 1
				%ins2 = insertelement <4 x i32> %ins1, i32 %ext2, i32 2
				%ins3 = insertelement <4 x i32> %ins2, i32 %ext3, i32 3
				store <4 x i32> %ins3, <4 x i32>* %ptr, align 16
				ret void

				; CHECK-LABEL: high_v8i32_to_v4i32
				; CHECK: vextractf128
				; CHECK-NEXT: vzeroupper
				; CHECK-NEXT: retq
				}

				; Make sure that element size doesn't alter the codegen.
				;
				; Same upper-half pattern with 64-bit elements: reads elements 2 and 3 of
				; the <4 x double> argument, reinserts them in order into a <2 x double>,
				; and stores the result through %ptr with 16-byte alignment.
				;
				; The FileCheck directives after the return expect a vextractf128
				; immediately followed by vzeroupper and retq.
				define void @high_v4f64_to_v2f64(<4 x double> %v, <2 x double>* %ptr) {
				%ext0 = extractelement <4 x double> %v, i32 2
				%ext1 = extractelement <4 x double> %v, i32 3
				%ins0 = insertelement <2 x double> undef, double %ext0, i32 0
				%ins1 = insertelement <2 x double> %ins0, double %ext1, i32 1
				store <2 x double> %ins1, <2 x double>* %ptr, align 16
				ret void

				; CHECK-LABEL: high_v4f64_to_v2f64
				; CHECK: vextractf128
				; CHECK-NEXT: vzeroupper
				; CHECK-NEXT: retq
				}