Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -5135,7 +5135,9 @@ if (M < 0) continue; + // Determine shuffle input and normalize the mask. SDValue V = M < Size ? V1 : V2; + M %= Size; // We are referencing an UNDEF input. if (V.isUndef()) { @@ -5143,12 +5145,77 @@ continue; } - // TODO - handle the Size != (int)V.getNumOperands() cases in future. - if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands()) + // Currently we can only search BUILD_VECTOR for UNDEF/ZERO elements. + if (V.getOpcode() != ISD::BUILD_VECTOR) + continue; + + // If the BUILD_VECTOR has fewer elements, then the (larger) source + // element must be UNDEF/ZERO. + // TODO: Is it worth testing the individual bits of a constant? + if ((Size % V.getNumOperands()) == 0) { + unsigned Scale = Size / V->getNumOperands(); + SDValue Op = V.getOperand(M / Scale); + if (Op.isUndef()) + Mask[i] = SM_SentinelUndef; + else if (X86::isZeroNode(Op)) + Mask[i] = SM_SentinelZero; continue; - if (!X86::isZeroNode(V.getOperand(M % Size))) + } + + // If the BUILD_VECTOR has more elements, then all the (smaller) source + // elements must be all UNDEF or all ZERO. + if ((V.getNumOperands() % Size) == 0) { + unsigned Scale = V->getNumOperands() / Size; + bool AllUndef = true; + bool AllZero = true; + for (unsigned j = 0; j != Scale; ++j) { + SDValue Op = V.getOperand((M * Scale) + j); + AllUndef &= Op.isUndef(); + AllZero &= X86::isZeroNode(Op); + } + if (AllUndef) + Mask[i] = SM_SentinelUndef; + else if (AllZero) + Mask[i] = SM_SentinelZero; continue; - Mask[i] = SM_SentinelZero; + } + } + + return true; +} + +/// Calls setTargetShuffleZeroElements to resolve a target shuffle mask's inputs +/// and set the SM_SentinelUndef and SM_SentinelZero values. 
Then check the +/// remaining input indices in case we now have a unary shuffle and adjust the +/// Op0/Op1 inputs accordingly. +/// Returns true if the target shuffle mask was decoded. +static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0, + SDValue &Op1, + SmallVectorImpl &Mask) { + if (!setTargetShuffleZeroElements(Op, Mask)) + return false; + + int NumElts = Mask.size(); + bool Op0InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { + return 0 <= Idx && Idx < NumElts; + }); + bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), + [NumElts](int Idx) { return NumElts <= Idx; }); + + Op0 = Op0InUse ? Op.getOperand(0) : SDValue(); + Op1 = Op1InUse ? Op.getOperand(1) : SDValue(); + IsUnary = !(Op0InUse && Op1InUse); + + if (!IsUnary) + return true; + + // We're only using Op1 - commute the mask and inputs. + if (!Op0InUse && Op1InUse) { + for (int &M : Mask) + if (NumElts <= M) + M -= NumElts; + Op0 = Op1; + Op1 = SDValue(); } return true; @@ -23309,7 +23376,7 @@ /// \brief Combine an arbitrary chain of shuffles into a single instruction if /// possible. /// -/// This is the leaf of the recursive combinine below. When we have found some +/// This is the leaf of the recursive combine below. When we have found some /// chain of single-use x86 shuffle instructions and accumulated the combined /// shuffle mask represented by them, this will try to pattern match that mask /// into either a single instruction if there is a special purpose instruction @@ -23470,13 +23537,19 @@ int NumBytes = VT.getSizeInBits() / 8; int Ratio = NumBytes / Mask.size(); for (int i = 0; i < NumBytes; ++i) { - if (Mask[i / Ratio] == SM_SentinelUndef) { + int M = Mask[i / Ratio]; + if (M == SM_SentinelUndef) { PSHUFBMask.push_back(DAG.getUNDEF(MVT::i8)); continue; } - int M = Mask[i / Ratio] != SM_SentinelZero - ? 
Ratio * Mask[i / Ratio] + i % Ratio - : 255; + if (M == SM_SentinelZero) { + PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8)); + continue; + } + M = Ratio * M + i % Ratio; + // Check that we are not crossing lanes. + if ((M / 16) != (i / 16)) + return false; PSHUFBMask.push_back(DAG.getConstant(M, DL, MVT::i8)); } MVT ByteVT = MVT::getVectorVT(MVT::i8, NumBytes); @@ -23549,13 +23622,15 @@ assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() && "Can only combine shuffles of the same vector register size."); - if (!isTargetShuffle(Op.getOpcode())) - return false; - SmallVector OpMask; + // Extract target shuffle mask and resolve sentinels and inputs. bool IsUnary; - bool HaveMask = getTargetShuffleMask(Op.getNode(), VT, true, OpMask, IsUnary); - // We only can combine unary shuffles which we can decode the mask for. - if (!HaveMask || !IsUnary) + SDValue Input0, Input1; + SmallVector OpMask; + if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask)) + return false; + + // At the moment we can only combine target shuffle unary cases. + if (!IsUnary) return false; assert(VT.getVectorNumElements() == OpMask.size() && @@ -23601,34 +23676,26 @@ RootMaskedIdx % OpRatio); } - // See if we can recurse into the operand to combine more things. - switch (Op.getOpcode()) { - case X86ISD::PSHUFB: - HasPSHUFB = true; - case X86ISD::PSHUFD: - case X86ISD::PSHUFHW: - case X86ISD::PSHUFLW: - if (Op.getOperand(0).hasOneUse() && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; - - case X86ISD::UNPCKL: - case X86ISD::UNPCKH: - assert(Op.getOperand(0) == Op.getOperand(1) && - "We only combine unary shuffles!"); - // We can't check for single use, we have to check that this shuffle is the - // only user. 
- if (Op->isOnlyUserOf(Op.getOperand(0).getNode()) && - combineX86ShufflesRecursively(Op.getOperand(0), Root, Mask, Depth + 1, - HasPSHUFB, DAG, DCI, Subtarget)) - return true; - break; + // Handle the all undef case early. + // TODO - should we handle zero/undef case as well? Widening the mask + // will lose information on undef elements, possibly reducing future + // combine possibilities. + if (std::all_of(Mask.begin(), Mask.end(), + [](int Idx) { return Idx == SM_SentinelUndef; })) { + DCI.CombineTo(Root.getNode(), DAG.getUNDEF(Root.getValueType())); + return true; } + HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB); + + // See if we can recurse into Input0 (if it's a target shuffle). + if (Input0 && Op->isOnlyUserOf(Input0.getNode()) && + combineX86ShufflesRecursively(Input0, Root, Mask, Depth + 1, HasPSHUFB, + DAG, DCI, Subtarget)) + return true; + // Minor canonicalization of the accumulated shuffle mask to make it easier - // to match below. All this does is detect masks with squential pairs of + // to match below. All this does is detect masks with sequential pairs of // elements, and shrink them to the half-width mask. It does this in a loop // so it will reduce the size of the mask to the minimal width mask which // performs an equivalent shuffle. 
Index: test/CodeGen/X86/pshufb-mask-comments.ll =================================================================== --- test/CodeGen/X86/pshufb-mask-comments.ll +++ test/CodeGen/X86/pshufb-mask-comments.ll @@ -31,7 +31,7 @@ define <16 x i8> @test4(<2 x i64>* %V) { ; CHECK-LABEL: test4 -; CHECK: pshufb {{.*}} +; CHECK-NOT: pshufb {{.*}} store <2 x i64> , <2 x i64>* %V, align 16 %1 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> undef, <16 x i8> ) ret <16 x i8> %1 Index: test/CodeGen/X86/vector-sext.ll =================================================================== --- test/CodeGen/X86/vector-sext.ll +++ test/CodeGen/X86/vector-sext.ll @@ -143,14 +143,12 @@ ; ; SSSE3-LABEL: sext_16i8_to_8i32: ; SSSE3: # BB#0: # %entry -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3] -; SSSE3-NEXT: psrad $24, %xmm2 -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,2,3] -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; SSSE3-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3] +; SSSE3-NEXT: movdqa %xmm0, %xmm1 +; SSSE3-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3] +; SSSE3-NEXT: psrad $24, %xmm0 +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[u,u,u,4,u,u,u,5,u,u,u,6,u,u,u,7] ; SSSE3-NEXT: psrad $24, %xmm1 -; SSSE3-NEXT: movdqa %xmm2, %xmm0 ; SSSE3-NEXT: retq ; ; SSE41-LABEL: sext_16i8_to_8i32: Index: test/CodeGen/X86/vector-zext.ll =================================================================== --- test/CodeGen/X86/vector-zext.ll +++ test/CodeGen/X86/vector-zext.ll @@ -1119,9 +1119,8 @@ ; SSSE3-LABEL: 
shuf_zext_8i8_to_8i32: ; SSSE3: # BB#0: # %entry ; SSSE3-NEXT: movdqa %xmm0, %xmm1 -; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] +; SSSE3-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[2],zero,xmm1[4],zero,xmm1[6],zero,xmm1[8],zero,xmm1[10],zero,xmm1[12],zero,xmm1[14],zero ; SSSE3-NEXT: pxor %xmm2, %xmm2 -; SSSE3-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1],xmm0[2],xmm2[2],xmm0[3],xmm2[3] ; SSSE3-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]