Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -4908,69 +4908,85 @@ /// single input multiple times, and in those cases it will /// adjust the mask to only have indices within that single input. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, + SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; + assert(Ops.empty() && "Clear Ops vector before calling getTargetShuffleMask"); + IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { case X86ISD::BLENDI: ImmN = N->getOperand(N->getNumOperands()-1); DecodeBLENDMask(VT, cast(ImmN)->getZExtValue(), Mask); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::SHUFP: ImmN = N->getOperand(N->getNumOperands()-1); DecodeSHUFPMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::INSERTPS: ImmN = N->getOperand(N->getNumOperands()-1); DecodeINSERTPSMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::UNPCKH: DecodeUNPCKHMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::UNPCKL: DecodeUNPCKLMask(VT, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::MOVHLPS: DecodeMOVHLPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::MOVLHPS: DecodeMOVLHPSMask(NumElems, Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::PALIGNR: ImmN = N->getOperand(N->getNumOperands()-1); DecodePALIGNRMask(VT, cast(ImmN)->getZExtValue(), Mask); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::PSHUFD: case X86ISD::VPERMILPI: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::PSHUFHW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFHWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::PSHUFLW: ImmN = N->getOperand(N->getNumOperands()-1); DecodePSHUFLWMask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::VZEXT_MOVL: DecodeZeroMoveLowMask(VT, Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::PSHUFB: { IsUnary = true; + Ops.append({N->getOperand(0)}); SDValue MaskNode = N->getOperand(1); SmallVector RawMask; if (getTargetShuffleMaskIndices(MaskNode, 8, RawMask)) { @@ -4987,27 +5003,33 @@ ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERMMask(cast(ImmN)->getZExtValue(), Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::MOVSS: case X86ISD::MOVSD: DecodeScalarMoveMask(VT, /* IsLoad */ false, Mask); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::VPERM2X128: ImmN = N->getOperand(N->getNumOperands()-1); DecodeVPERM2X128Mask(VT, cast(ImmN)->getZExtValue(), Mask); IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1); + Ops.append({N->getOperand(0), N->getOperand(1)}); break; case X86ISD::MOVSLDUP: DecodeMOVSLDUPMask(VT, Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::MOVSHDUP: DecodeMOVSHDUPMask(VT, Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::MOVDDUP: DecodeMOVDDUPMask(VT, Mask); IsUnary = true; + Ops.append({N->getOperand(0)}); break; case X86ISD::MOVLHPD: case X86ISD::MOVLPD: @@ -5016,6 +5038,7 @@ return false; case X86ISD::VPERMV: { IsUnary = true; + Ops.append({N->getOperand(1)}); SDValue MaskNode = N->getOperand(0); SmallVector RawMask; unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); @@ -5030,6 +5053,7 @@ return false; } case X86ISD::VPERMV3: { + Ops.append({N->getOperand(0)}); SDValue MaskNode = N->getOperand(1); SmallVector RawMask; @@ -5051,6 +5075,8 @@ if (Mask.empty()) return false; + assert(!Ops.empty() && "Should return operands but didn't."); + // Check if we're getting a shuffle mask with zero'd elements. if (!AllowSentinelZero) if (std::any_of(Mask.begin(), Mask.end(), @@ -5073,16 +5099,17 @@ /// (not just zeroable) from their inputs. /// Returns true if the target shuffle mask was decoded. static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl &Mask) { + SmallVectorImpl &Mask, + SmallVectorImpl &Ops) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; - if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask, - IsUnary)) + if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops, + Mask, IsUnary)) return false; - SDValue V1 = N.getOperand(0); - SDValue V2 = IsUnary ? V1 : N.getOperand(1); + SDValue V1 = Ops[0]; + SDValue V2 = IsUnary ? V1 : Ops[1]; while (V1.getOpcode() == ISD::BITCAST) V1 = V1->getOperand(0); @@ -5153,7 +5180,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0, SDValue &Op1, SmallVectorImpl &Mask) { - if (!setTargetShuffleZeroElements(Op, Mask)) + SmallVector Ops; + if (!setTargetShuffleZeroElements(Op, Mask, Ops)) return false; int NumElts = Mask.size(); @@ -5163,8 +5191,8 @@ bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { return NumElts <= Idx; }); - Op0 = Op0InUse ? Op.getOperand(0) : SDValue(); - Op1 = Op1InUse ? Op.getOperand(1) : SDValue(); + Op0 = Op0InUse ? Ops[0] : SDValue(); + Op1 = Op1InUse ? Ops[1] : SDValue(); IsUnary = !(Op0InUse && Op1InUse); if (!IsUnary) @@ -5212,9 +5240,10 @@ MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; + SmallVector ShuffleOps; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, true, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; @@ -5225,7 +5254,7 @@ return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); - SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); + SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -23369,15 +23398,15 @@ /// into either a single instruction if there is a special purpose instruction /// for this operation, or into a PSHUFB instruction which is a fully general /// instruction but should only be used to replace chains over a certain depth. -static bool combineX86ShuffleChain(SDValue Op, SDValue Root, ArrayRef Mask, - int Depth, bool HasPSHUFB, SelectionDAG &DAG, +static bool combineX86ShuffleChain(SDValue Input, SDValue Root, + ArrayRef Mask, int Depth, + bool HasPSHUFB, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, const X86Subtarget &Subtarget) { assert(!Mask.empty() && "Cannot combine an empty shuffle mask!"); // Find the operand that enters the chain. Note that multiple uses are OK // here, we're not going to remove the operand we find. - SDValue Input = Op.getOperand(0); while (Input.getOpcode() == ISD::BITCAST) Input = Input.getOperand(0); @@ -23618,6 +23647,15 @@ if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask)) return false; + // If the shuffle doesn't use any of its inputs, it just produces 0/undef. + // Return a 0 vector directly. + // FIXME: Look at the mask; produce UNDEF. + if (!Input0 && !Input1) { + DCI.CombineTo(Root.getNode(), getZeroVector(Root.getSimpleValueType(), + Subtarget, DAG, SDLoc(Root))); + return true; + } + // At the moment we can only combine target shuffle unary cases. if (!IsUnary) return false; @@ -23695,7 +23733,7 @@ WidenedMask.clear(); } - return combineX86ShuffleChain(Op, Root, Mask, Depth, HasPSHUFB, DAG, DCI, + return combineX86ShuffleChain(Input0, Root, Mask, Depth, HasPSHUFB, DAG, DCI, Subtarget); } @@ -23706,8 +23744,10 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; + SmallVector Ops; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); + bool HaveMask = + getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -24022,8 +24062,11 @@ // Determine which elements are known to be zero. SmallVector TargetMask; - if (!setTargetShuffleZeroElements(N, TargetMask)) - return SDValue(); + { + SmallVector Ops; + if (!setTargetShuffleZeroElements(N, TargetMask, Ops)) + return SDValue(); + } // Helper function to take inner insertps node and attempt to // merge the blend with zero into its zero mask. @@ -24076,7 +24119,8 @@ // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; - if (setTargetShuffleZeroElements(Op1, TargetMask1)) { + SmallVector Ops1; + if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. @@ -24087,14 +24131,15 @@ // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); - Op1 = Op1.getOperand(M < 4 ? 0 : 1); + Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0)) + SmallVector Ops0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; @@ -24125,10 +24170,10 @@ // referenced input directly. if (UseInput00 && !UseInput01) { Updated = true; - Op0 = Op0.getOperand(0); + Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; - Op0 = Op0.getOperand(1); + Op0 = Ops0[1]; } if (Updated) @@ -24428,9 +24473,10 @@ return SDValue(); SmallVector ShuffleMask; + SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, - ShuffleMask, UnaryShuffle)) + ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -24445,12 +24491,12 @@ return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) - : InVec.getOperand(1); + SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] + : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = InVec.getNumOperands() > 1 && - InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + unsigned AllowedUses = + (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. @@ -24484,10 +24530,9 @@ SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) - : InVec.getOperand(1); + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, - InVec.getOperand(0), Shuffle, + ShuffleOps[0], Shuffle, &ShuffleMask[0]); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, Index: test/CodeGen/X86/avx2-vperm-combining.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/avx2-vperm-combining.ll @@ -0,0 +1,17 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=x86-64 -mattr=+avx2 | FileCheck %s + +target triple = "x86_64-unknown-unknown" + +define <32 x i8> @shuffle_pshufb_vpermd(<8 x i32> %a) { +; CHECK-LABEL: shuffle_pshufb_vpermd: +; CHECK: # BB#0: +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] +; CHECK-NEXT: retq + %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> ) + %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8> + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + ret <32 x i8> %tmp2 +} + +declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>)