Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -6710,36 +6710,78 @@
 /// as many lanes with this technique as possible to simplify the remaining
 /// shuffle.
 static SmallBitVector computeZeroableShuffleElements(ArrayRef<int> Mask,
-                                                     SDValue V1, SDValue V2) {
+                                                     SDValue V1, SDValue V2,
+                                                     unsigned Depth = 0) {
+  // Determine the zeroable elements of a single input.
+  auto GetSubZeroable = [&](SDValue V) {
+    MVT VT = V.getSimpleValueType();
+    unsigned NumElts = VT.isVector() ? VT.getVectorNumElements() : 1;
+
+    // BUILD_VECTOR - check each operand.
+    if (V.getOpcode() == ISD::BUILD_VECTOR) {
+      SmallBitVector SubZeroable(NumElts, false);
+      for (unsigned i = 0; i != NumElts; ++i) {
+        SDValue Op = V.getOperand(i);
+        SubZeroable[i] = (Op.getOpcode() == ISD::UNDEF || X86::isZeroNode(Op));
+      }
+      return SubZeroable;
+    }
+
+    // SHUFFLE_VECTOR - recursive call to computeZeroableShuffleElements.
+    if (ShuffleVectorSDNode *S = dyn_cast<ShuffleVectorSDNode>(V))
+      return computeZeroableShuffleElements(S->getMask(), S->getOperand(0),
+                                            S->getOperand(1), Depth + 1);
+
+    return SmallBitVector(NumElts, false);
+  };
+
   SmallBitVector Zeroable(Mask.size(), false);
 
+  if (Depth == 6)
+    return Zeroable; // Limit search depth.
+
   while (V1.getOpcode() == ISD::BITCAST)
     V1 = V1->getOperand(0);
   while (V2.getOpcode() == ISD::BITCAST)
     V2 = V2->getOperand(0);
 
-  bool V1IsZero = ISD::isBuildVectorAllZeros(V1.getNode());
-  bool V2IsZero = ISD::isBuildVectorAllZeros(V2.getNode());
+  SmallBitVector V1Zeroables = GetSubZeroable(V1);
+  SmallBitVector V2Zeroables = GetSubZeroable(V2);
 
   for (int i = 0, Size = Mask.size(); i < Size; ++i) {
     int M = Mask[i];
-    // Handle the easy cases.
-    if (M < 0 || (M >= 0 && M < Size && V1IsZero) || (M >= Size && V2IsZero)) {
+
+    // Handle the easy case.
+    if (M < 0) {
       Zeroable[i] = true;
       continue;
     }
 
-    // If this is an index into a build_vector node (which has the same number
-    // of elements), dig out the input value and use it.
-    SDValue V = M < Size ? V1 : V2;
-    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+    int Idx = M % Size;
+    SmallBitVector SubZeroable = M < Size ? V1Zeroables : V2Zeroables;
+    int SubSize = SubZeroable.size();
+
+    // If fewer (wider) elements - the entire element must be zero.
+    if (Size > SubSize) {
+      assert(0 == (Size % SubSize) && "Bad scale");
+      unsigned Scale = Size / SubSize;
+      Zeroable[i] = SubZeroable[Idx / Scale];
       continue;
+    }
 
-    SDValue Input = V.getOperand(M % Size);
-    // The UNDEF opcode check really should be dead code here, but not quite
-    // worth asserting on (it isn't invalid, just unexpected).
-    if (Input.getOpcode() == ISD::UNDEF || X86::isZeroNode(Input))
-      Zeroable[i] = true;
+    // If more (narrower) elements - all aliased 'sub-elements' must be zero.
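+    // (e.g. for a v4i32 mask element whose input is a v8i16 BUILD_VECTOR,
+    // the i32 lane aliases two i16 sub-elements, both of which must be zero.)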
+    if (Size < SubSize) {
+      assert(0 == (SubSize % Size) && "Bad scale");
+      unsigned Scale = SubSize / Size;
+      bool Zero = true;
+      for (unsigned j = 0; j != Scale; ++j)
+        Zero &= SubZeroable[(Idx * Scale) + j];
+      Zeroable[i] = Zero;
+      continue;
+    }
+
+    assert(Size == SubSize && "Unexpected mask size");
+    Zeroable[i] = SubZeroable[Idx];
   }
 
   return Zeroable;
Index: test/CodeGen/X86/sse41.ll
===================================================================
--- test/CodeGen/X86/sse41.ll
+++ test/CodeGen/X86/sse41.ll
@@ -509,14 +509,14 @@
 ; X32: ## BB#0:
 ; X32-NEXT:    xorps %xmm2, %xmm2
 ; X32-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X00A:
 ; X64: ## BB#0:
 ; X64-NEXT:    xorps %xmm2, %xmm2
 ; X64-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[0]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -548,13 +548,13 @@
 ; X32-LABEL: shuf_X0YC:
 ; X32: ## BB#0:
 ; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; X32-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; X32-NEXT:    retl
 ;
 ; X64-LABEL: shuf_X0YC:
 ; X64: ## BB#0:
 ; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; X64-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; X64-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
Index: test/CodeGen/X86/widen_load-2.ll
===================================================================
--- test/CodeGen/X86/widen_load-2.ll
+++ test/CodeGen/X86/widen_load-2.ll
@@ -178,8 +178,8 @@
 %i8vec3pack = type { <3 x i8>, i8 }
 define void @rot(%i8vec3pack* nocapture sret %result, %i8vec3pack* %X, %i8vec3pack* %rot) nounwind {
 ; CHECK-LABEL: rot:
-; CHECK:         movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
-; CHECK-NEXT:    movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK:         movdqa {{.*}}, %[[SHUFFLE_MASK:xmm[0-9]+]]
+; CHECK-NEXT:    movdqa {{.*}}, %[[CONSTANT0:xmm[0-9]+]]
 ; CHECK-NEXT:    pshufb %[[SHUFFLE_MASK]], %[[CONSTANT0]]
 ; CHECK-NEXT:    pmovzxwq %[[CONSTANT0]], %[[CONSTANT0]]
 ; CHECK-NEXT:    movd %[[CONSTANT0]], %e[[R0:[abcd]]]x
@@ -187,8 +187,8 @@
 ; CHECK-NEXT:    movb $-98, 2(%[[PTR0]])
 ; CHECK-NEXT:    movdqa {{.*}}, %[[CONSTANT1:xmm[0-9]+]]
 ; CHECK-NEXT:    pshufb %[[SHUFFLE_MASK]], %[[CONSTANT1]]
-; CHECK-NEXT:    pmovzxwq %[[CONSTANT1]], %[[CONSTANT1]]
-; CHECK-NEXT:    movd %[[CONSTANT1]], %e[[R1:[abcd]]]x
+; CHECK-NEXT:    pmovzxwq %[[CONSTANT1]], %[[SHUFFLE_MASK]]
+; CHECK-NEXT:    movd %[[SHUFFLE_MASK]], %e[[R1:[abcd]]]x
 ; CHECK-NEXT:    movw %[[R1]]x, (%[[PTR1:.*]])
 ; CHECK-NEXT:    movb $1, 2(%[[PTR1]])
 ; CHECK-NEXT:    pmovzxbd (%[[PTR0]]), %[[X0:xmm[0-9]+]]
@@ -196,7 +196,7 @@
 ; CHECK-NEXT:    psrld $1, %[[X1]]
 ; CHECK-NEXT:    pblendw $192, %[[X0]], %[[X1]]
 ; CHECK-NEXT:    pextrb $8, %[[X1]], 2(%{{.*}})
-; CHECK-NEXT:    pshufb %[[SHUFFLE_MASK]], %[[X1]]
+; CHECK-NEXT:    pshufb {{.*}}, %[[X1]]
 ; CHECK-NEXT:    pmovzxwq %[[X1]], %[[X3:xmm[0-9]+]]
 ; CHECK-NEXT:    movd %[[X3]], %e[[R0:[abcd]]]x
 ; CHECK-NEXT:    movw %[[R0]]x, (%{{.*}})
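
For readers who want to exercise the rescaling rule outside of SelectionDAG, here is a minimal standalone sketch of the same three cases. The isZeroableAt helper is hypothetical and std::vector<bool> stands in for SmallBitVector; this illustrates the technique, it is not the patched LLVM code:

  #include <cassert>
  #include <cstdio>
  #include <vector>

  // Hypothetical helper: given the "zeroable" bits of an input with SubSize
  // elements, decide whether mask lane Idx (in a Size-element index space) is
  // zeroable. Mirrors the Size > SubSize, Size < SubSize and Size == SubSize
  // branches of the patch.
  static bool isZeroableAt(const std::vector<bool> &SubZeroable, int Size,
                           int Idx) {
    int SubSize = (int)SubZeroable.size();

    // Fewer (wider) input elements: lane Idx reads part of one wide element,
    // so that whole element must be zeroable.
    if (Size > SubSize) {
      assert(Size % SubSize == 0 && "Bad scale");
      int Scale = Size / SubSize;
      return SubZeroable[Idx / Scale];
    }

    // More (narrower) input elements: lane Idx aliases Scale consecutive
    // sub-elements, all of which must be zeroable.
    if (Size < SubSize) {
      assert(SubSize % Size == 0 && "Bad scale");
      int Scale = SubSize / Size;
      bool Zero = true;
      for (int j = 0; j != Scale; ++j)
        Zero = Zero && SubZeroable[Idx * Scale + j];
      return Zero;
    }

    // Same granularity: direct lookup.
    return SubZeroable[Idx];
  }

  int main() {
    // A v8i16-style input where only sub-elements 2 and 3 are known zero...
    std::vector<bool> V8 = {false, false, true, true,
                            false, false, false, false};
    // ...seen through a v4i32-style mask: lane 1 covers sub-elements 2-3 and
    // is zeroable; lane 0 covers sub-elements 0-1 and is not.
    for (int i = 0; i != 4; ++i)
      std::printf("v4 lane %d zeroable: %d\n", i, (int)isZeroableAt(V8, 4, i));
    return 0;
  }

In the patch itself, SmallBitVector and the shuffle mask/operand plumbing replace the toy types above, and GetSubZeroable supplies the per-input bits by inspecting BUILD_VECTOR operands or recursing through nested shuffles.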