This is an archive of the discontinued LLVM Phabricator instance.

Differential D7898

Use vmovss to handle inserting an element into index 0 of a v8f32 vector of zeros.
ClosedPublic

Authored by craig.topper on Feb 25 2015, 11:22 PM.

Download Raw Diff

Details

Reviewers

chandlerc

Summary

This fixes the following case from PR22685 so that it is handled by vmovss:

define <8 x float> @mov00(float* %ptr) {

%val = load float* %ptr
%vec = insertelement <8 x float> zeroinitializer, float %val, i32 0
ret <8 x float> %vec

}

instead of the following sequence:

vxorps %xmm0, %xmm0, %xmm0
vinsertps $0, (%rdi), %xmm0, %xmm0 ## xmm0 = mem[0],xmm0[1,2,3]
vxorps %ymm1, %ymm1, %ymm1
vinsertf128 $0, %xmm0, %ymm1, %ymm0
retq

Diff Detail

Event Timeline

craig.topper updated this revision to Diff 20731. Feb 25 2015, 11:22 PM

craig.topper retitled this revision from to Use vmovss to handle inserting an element into index 0 of a v8f32 vector of zeros..

craig.topper updated this object.

craig.topper edited the test plan for this revision. (Show Details)

craig.topper added a reviewer: chandlerc.

craig.topper added a subscriber: Unknown Object (MLST).

Looks fine. Add the floating point test cases as well?

This revision is now accepted and ready to land. Feb 25 2015, 11:30 PM

Can we move the mask check into lower256BitVectorShuffle() ?

Otherwise, we'll need to duplicate the logic to catch the following cases:

define <4 x i64> @mov_v4i64(i64* %ptr) {
  %val = load i64, i64* %ptr
  %i0 = insertelement <4 x i64> zeroinitializer, i64 %val, i32 0
  ret <4 x i64> %i0
}

define <8 x i32> @mov_v8i32(i32* %ptr) {
  %val = load i32, i32* %ptr
  %i0 = insertelement <8 x i32> zeroinitializer, i32 %val, i32 0
  ret <8 x i32> %i0
}

define <16 x i16> @mov_v16i16(i16* %ptr) {
  %val = load i16, i16* %ptr
  %i0 = insertelement <16 x i16> zeroinitializer, i16 %val, i32 0
  ret <16 x i16> %i0
}

define <32 x i8> @mov_v32i8(i8* %ptr) {
  %val = load i8, i8* %ptr
  %i0 = insertelement <32 x i8> zeroinitializer, i8 %val, i32 0
  ret <32 x i8> %i0
}

I added more test cases to the bug:
http://llvm.org/bugs/show_bug.cgi?id=22685#c4

This may require more than one patch to get right, but I think that we should be handling all subtypes of 256-bit vectors.

Committed my initial change in r23135. Still need to review the additional test cases.

spatel mentioned this in D8341: try to lowerVectorShuffleAsElementInsertion() for all 256-bit vector sub-types [X86, AVX]. Mar 14 2015, 8:02 AM

spatel mentioned this in rL233704: [X86, AVX] try to lowerVectorShuffleAsElementInsertion() for all 256-bit vector…. Mar 31 2015, 9:35 AM

craig.topper closed this revision. Oct 20 2015, 8:36 AM

Revision Contents

Path

Size

lib/

Target/

X86/

X86ISelLowering.cpp

12 lines

test/

CodeGen/

X86/

vector-shuffle-256-v8.ll

10 lines

Diff 20731

lib/Target/X86/X86ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,623 Lines • ▼ Show 20 Lines	if (NumNonZero == 1) {
// the rest of the elements. This will be matched as movd/movq/movss/movsd		// the rest of the elements. This will be matched as movd/movq/movss/movsd
// depending on what the source datatype is.		// depending on what the source datatype is.
if (Idx == 0) {		if (Idx == 0) {
if (NumZero == 0)		if (NumZero == 0)
return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);		return DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);

if (ExtVT == MVT::i32 \|\| ExtVT == MVT::f32 \|\| ExtVT == MVT::f64 \|\|		if (ExtVT == MVT::i32 \|\| ExtVT == MVT::f32 \|\| ExtVT == MVT::f64 \|\|
(ExtVT == MVT::i64 && Subtarget->is64Bit())) {		(ExtVT == MVT::i64 && Subtarget->is64Bit())) {
if (VT.is256BitVector() \|\| VT.is512BitVector()) {		if (VT.is512BitVector()) {
SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);		SDValue ZeroVec = getZeroVector(VT, Subtarget, DAG, dl);
return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,		return DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, VT, ZeroVec,
Item, DAG.getIntPtrConstant(0));		Item, DAG.getIntPtrConstant(0));
}		}
assert(VT.is128BitVector() && "Expected an SSE value type!");
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);		Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Item);
// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.		// Turn it into a MOVL (i.e. movss, movsd, or movd) to a zero vector.
return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);		return getShuffleVectorZeroOrUndef(Item, 0, true, Subtarget, DAG);
}		}

if (ExtVT == MVT::i16 \|\| ExtVT == MVT::i8) {		if (ExtVT == MVT::i16 \|\| ExtVT == MVT::i8) {
Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);		Item = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, Item);
Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);		Item = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Item);
▲ Show 20 Lines • Show All 3,664 Lines • ▼ Show 20 Lines	static SDValue lowerV8F32VectorShuffle(SDValue Op, SDValue V1, SDValue V2,
SelectionDAG &DAG) {		SelectionDAG &DAG) {
SDLoc DL(Op);		SDLoc DL(Op);
assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");		assert(V1.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");		assert(V2.getSimpleValueType() == MVT::v8f32 && "Bad operand type!");
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);		ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
ArrayRef<int> Mask = SVOp->getMask();		ArrayRef<int> Mask = SVOp->getMask();
assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");		assert(Mask.size() == 8 && "Unexpected mask size for v8 shuffle!");

		// If we have a single input to the zero element, insert that into V1 if we
		// can do so cheaply.
		int NumV2Elements =
		std::count_if(Mask.begin(), Mask.end(), [](int M) { return M >= 8; });
		if (NumV2Elements == 1 && Mask[0] >= 8)
		if (SDValue Insertion = lowerVectorShuffleAsElementInsertion(
		MVT::v8f32, DL, V1, V2, Mask, Subtarget, DAG))
		return Insertion;

if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,		if (SDValue Blend = lowerVectorShuffleAsBlend(DL, MVT::v8f32, V1, V2, Mask,
Subtarget, DAG))		Subtarget, DAG))
return Blend;		return Blend;

// Check for being able to broadcast a single element.		// Check for being able to broadcast a single element.
if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,		if (SDValue Broadcast = lowerVectorShuffleAsBroadcast(MVT::v8f32, DL, V1,
Mask, Subtarget, DAG))		Mask, Subtarget, DAG))
return Broadcast;		return Broadcast;
▲ Show 20 Lines • Show All 14,852 Lines • Show Last 20 Lines

test/CodeGen/X86/vector-shuffle-256-v8.ll

	Show First 20 Lines • Show All 125 Lines • ▼ Show 20 Lines
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8f32_70000000:			; AVX2-LABEL: shuffle_v8f32_70000000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; AVX2-NEXT: movl $7, %eax			; AVX2-NEXT: movl $7, %eax
	; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1			; AVX2-NEXT: vmovd %eax, %xmm1
	; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2			; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
	; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1			; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
	; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x float> %shuffle			ret <8 x float> %shuffle
	}			}

	define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {			define <8 x float> @shuffle_v8f32_01014545(<8 x float> %a, <8 x float> %b) {
	; ALL-LABEL: shuffle_v8f32_01014545:			; ALL-LABEL: shuffle_v8f32_01014545:
	▲ Show 20 Lines • Show All 809 Lines • ▼ Show 20 Lines
	; AVX1: # BB#0:			; AVX1: # BB#0:
	; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]			; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm0[2,3,0,1]
	; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]			; AVX1-NEXT: vblendpd {{.*#+}} ymm0 = ymm0[0],ymm1[1,2,3]
	; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]			; AVX1-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[3,0,0,0,4,4,4,4]
	; AVX1-NEXT: retq			; AVX1-NEXT: retq
	;			;
	; AVX2-LABEL: shuffle_v8i32_70000000:			; AVX2-LABEL: shuffle_v8i32_70000000:
	; AVX2: # BB#0:			; AVX2: # BB#0:
	; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
	; AVX2-NEXT: movl $7, %eax			; AVX2-NEXT: movl $7, %eax
	; AVX2-NEXT: vpinsrd $0, %eax, %xmm1, %xmm1			; AVX2-NEXT: vmovd %eax, %xmm1
	; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2			; AVX2-NEXT: vpxor %ymm2, %ymm2, %ymm2
	; AVX2-NEXT: vinserti128 $0, %xmm1, %ymm2, %ymm1			; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0],ymm2[1,2,3,4,5,6,7]
	; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0			; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0
	; AVX2-NEXT: retq			; AVX2-NEXT: retq
	%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>			%shuffle = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 7, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0>
	ret <8 x i32> %shuffle			ret <8 x i32> %shuffle
	}			}

	define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {			define <8 x i32> @shuffle_v8i32_01014545(<8 x i32> %a, <8 x i32> %b) {
	; AVX1-LABEL: shuffle_v8i32_01014545:			; AVX1-LABEL: shuffle_v8i32_01014545:
	▲ Show 20 Lines • Show All 1,118 Lines • Show Last 20 Lines