Index: llvm/trunk/lib/Target/X86/X86ISelLowering.h =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.h +++ llvm/trunk/lib/Target/X86/X86ISelLowering.h @@ -392,11 +392,21 @@ UNPCKH, VPERMILPV, VPERMILPI, + VPERMI, + VPERM2X128, + + // Variable Permute (VPERM) + // Res = VPERMV MaskV, V0 VPERMV, + + // 3-op Variable Permute (VPERMT2) + // Res = VPERMV3 V0, MaskV, V1 VPERMV3, + + // 3-op Variable Permute overwriting the index (VPERMI2) + // Res = VPERMIV3 V0, MaskV, V1 VPERMIV3, - VPERMI, - VPERM2X128, + // Bitwise ternary logic VPTERNLOG, // Fix Up Special Packed Float32/64 values Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -4936,15 +4936,21 @@ } /// Calculates the shuffle mask corresponding to the target-specific opcode. -/// Returns true if the Mask could be calculated. Sets IsUnary to true if only -/// uses one source. Note that this will set IsUnary for shuffles which use a -/// single input multiple times, and in those cases it will -/// adjust the mask to only have indices within that single input. +/// If the mask could be calculated, returns it in \p Mask, returns the shuffle +/// operands in \p Ops, and returns true. +/// Sets \p IsUnary to true if only one source is used. Note that this will set +/// IsUnary for shuffles which use a single input multiple times, and in those +/// cases it will adjust the mask to only have indices within that single input. +/// It is an error to call this with non-empty Mask/Ops vectors. static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero, + SmallVectorImpl &Ops, SmallVectorImpl &Mask, bool &IsUnary) { unsigned NumElems = VT.getVectorNumElements(); SDValue ImmN; + assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector"); + assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector"); + IsUnary = false; bool IsFakeUnary = false; switch(N->getOpcode()) { @@ -5049,6 +5055,8 @@ return false; case X86ISD::VPERMV: { IsUnary = true; + // Unlike most shuffle nodes, VPERMV's mask operand is operand 0. + Ops.push_back(N->getOperand(1)); SDValue MaskNode = N->getOperand(0); SmallVector RawMask; unsigned MaskLoBits = Log2_64(VT.getVectorNumElements()); @@ -5063,8 +5071,10 @@ return false; } case X86ISD::VPERMV3: { + // Unlike most shuffle nodes, VPERMV3's mask operand is the middle one. + Ops.push_back(N->getOperand(0)); + Ops.push_back(N->getOperand(2)); SDValue MaskNode = N->getOperand(1); - SmallVector RawMask; unsigned MaskLoBits = Log2_64(VT.getVectorNumElements() * 2); if (getTargetShuffleMaskIndices(MaskNode, MaskLoBits, RawMask)) { @@ -5098,6 +5108,14 @@ if (M >= (int)Mask.size()) M -= Mask.size(); + // If we didn't already add operands in the opcode-specific code, default to + // adding 1 or 2 operands starting at 0. + if (Ops.empty()) { + Ops.push_back(N->getOperand(0)); + if (!IsUnary || IsFakeUnary) + Ops.push_back(N->getOperand(1)); + } + return true; } @@ -5106,16 +5124,17 @@ /// (not just zeroable) from their inputs. /// Returns true if the target shuffle mask was decoded. static bool setTargetShuffleZeroElements(SDValue N, - SmallVectorImpl &Mask) { + SmallVectorImpl &Mask, + SmallVectorImpl &Ops) { bool IsUnary; if (!isTargetShuffle(N.getOpcode())) return false; - if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask, - IsUnary)) + if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Ops, + Mask, IsUnary)) return false; - SDValue V1 = N.getOperand(0); - SDValue V2 = IsUnary ? V1 : N.getOperand(1); + SDValue V1 = Ops[0]; + SDValue V2 = IsUnary ? V1 : Ops[1]; while (V1.getOpcode() == ISD::BITCAST) V1 = V1->getOperand(0); @@ -5186,7 +5205,8 @@ static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0, SDValue &Op1, SmallVectorImpl &Mask) { - if (!setTargetShuffleZeroElements(Op, Mask)) + SmallVector Ops; + if (!setTargetShuffleZeroElements(Op, Mask, Ops)) return false; int NumElts = Mask.size(); @@ -5196,8 +5216,8 @@ bool Op1InUse = std::any_of(Mask.begin(), Mask.end(), [NumElts](int Idx) { return NumElts <= Idx; }); - Op0 = Op0InUse ? Op.getOperand(0) : SDValue(); - Op1 = Op1InUse ? Op.getOperand(1) : SDValue(); + Op0 = Op0InUse ? Ops[0] : SDValue(); + Op1 = Op1InUse ? Ops[1] : SDValue(); IsUnary = !(Op0InUse && Op1InUse); if (!IsUnary) @@ -5245,9 +5265,10 @@ MVT ShufSVT = ShufVT.getVectorElementType(); int NumElems = (int)ShufVT.getVectorNumElements(); SmallVector ShuffleMask; + SmallVector ShuffleOps; bool IsUnary; - if (!getTargetShuffleMask(N, ShufVT, true, ShuffleMask, IsUnary)) + if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary)) return SDValue(); int Elt = ShuffleMask[Index]; @@ -5258,7 +5279,7 @@ return DAG.getUNDEF(ShufSVT); assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range"); - SDValue NewV = (Elt < NumElems) ? N->getOperand(0) : N->getOperand(1); + SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1]; return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1); } @@ -24044,8 +24065,10 @@ static SmallVector getPSHUFShuffleMask(SDValue N) { MVT VT = N.getSimpleValueType(); SmallVector Mask; + SmallVector Ops; bool IsUnary; - bool HaveMask = getTargetShuffleMask(N.getNode(), VT, false, Mask, IsUnary); + bool HaveMask = + getTargetShuffleMask(N.getNode(), VT, false, Ops, Mask, IsUnary); (void)HaveMask; assert(HaveMask); @@ -24360,7 +24383,8 @@ // Determine which elements are known to be zero. SmallVector TargetMask; - if (!setTargetShuffleZeroElements(N, TargetMask)) + SmallVector BlendOps; + if (!setTargetShuffleZeroElements(N, TargetMask, BlendOps)) return SDValue(); // Helper function to take inner insertps node and attempt to @@ -24414,7 +24438,8 @@ // Attempt to merge insertps Op1 with an inner target shuffle node. SmallVector TargetMask1; - if (setTargetShuffleZeroElements(Op1, TargetMask1)) { + SmallVector Ops1; + if (setTargetShuffleZeroElements(Op1, TargetMask1, Ops1)) { int M = TargetMask1[SrcIdx]; if (isUndefOrZero(M)) { // Zero/UNDEF insertion - zero out element and remove dependency. @@ -24425,14 +24450,15 @@ // Update insertps mask srcidx and reference the source input directly. assert(0 <= M && M < 8 && "Shuffle index out of range"); InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); - Op1 = Op1.getOperand(M < 4 ? 0 : 1); + Op1 = Ops1[M < 4 ? 0 : 1]; return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, DAG.getConstant(InsertPSMask, DL, MVT::i8)); } // Attempt to merge insertps Op0 with an inner target shuffle node. SmallVector TargetMask0; - if (!setTargetShuffleZeroElements(Op0, TargetMask0)) + SmallVector Ops0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0, Ops0)) return SDValue(); bool Updated = false; @@ -24463,10 +24489,10 @@ // referenced input directly. if (UseInput00 && !UseInput01) { Updated = true; - Op0 = Op0.getOperand(0); + Op0 = Ops0[0]; } else if (!UseInput00 && UseInput01) { Updated = true; - Op0 = Op0.getOperand(1); + Op0 = Ops0[1]; } if (Updated) @@ -24767,9 +24793,10 @@ return SDValue(); SmallVector ShuffleMask; + SmallVector ShuffleOps; bool UnaryShuffle; if (!getTargetShuffleMask(InVec.getNode(), CurrentVT.getSimpleVT(), true, - ShuffleMask, UnaryShuffle)) + ShuffleOps, ShuffleMask, UnaryShuffle)) return SDValue(); // Select the input vector, guarding against out of range extract vector. @@ -24784,12 +24811,12 @@ return DAG.getUNDEF(EltVT); assert(0 <= Idx && Idx < (int)(2 * NumElems) && "Shuffle index out of range"); - SDValue LdNode = (Idx < (int)NumElems) ? InVec.getOperand(0) - : InVec.getOperand(1); + SDValue LdNode = (Idx < (int)NumElems) ? ShuffleOps[0] + : ShuffleOps[1]; // If inputs to shuffle are the same for both ops, then allow 2 uses - unsigned AllowedUses = InVec.getNumOperands() > 1 && - InVec.getOperand(0) == InVec.getOperand(1) ? 2 : 1; + unsigned AllowedUses = + (ShuffleOps.size() > 1 && ShuffleOps[0] == ShuffleOps[1]) ? 2 : 1; if (LdNode.getOpcode() == ISD::BITCAST) { // Don't duplicate a load with other uses. @@ -24823,10 +24850,9 @@ SDLoc dl(N); // Create shuffle node taking into account the case that its a unary shuffle - SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) - : InVec.getOperand(1); + SDValue Shuffle = (UnaryShuffle) ? DAG.getUNDEF(CurrentVT) : ShuffleOps[1]; Shuffle = DAG.getVectorShuffle(CurrentVT, dl, - InVec.getOperand(0), Shuffle, + ShuffleOps[0], Shuffle, &ShuffleMask[0]); Shuffle = DAG.getBitcast(OriginalVT, Shuffle); return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, N->getValueType(0), Shuffle, Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll +++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-avx2.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s + +declare <8 x i32> @llvm.x86.avx2.permd(<8 x i32>, <8 x i32>) +declare <8 x float> @llvm.x86.avx2.permps(<8 x float>, <8 x i32>) + +define <32 x i8> @combine_pshufb_vpermd(<8 x i32> %a) { +; CHECK-LABEL: combine_pshufb_vpermd: +; CHECK: # BB#0: +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] +; CHECK-NEXT: retq + %tmp0 = call <8 x i32> @llvm.x86.avx2.permd(<8 x i32> %a, <8 x i32> ) + %tmp1 = bitcast <8 x i32> %tmp0 to <32 x i8> + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + ret <32 x i8> %tmp2 +} + +define <32 x i8> @combine_pshufb_vpermps(<8 x float> %a) { +; CHECK-LABEL: combine_pshufb_vpermps: +; CHECK: # BB#0: +; CHECK-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,16,17,18,18] +; CHECK-NEXT: retq + %tmp0 = call <8 x float> @llvm.x86.avx2.permps(<8 x float> %a, <8 x i32> ) + %tmp1 = bitcast <8 x float> %tmp0 to <32 x i8> + %tmp2 = shufflevector <32 x i8> %tmp1, <32 x i8> undef, <32 x i32> + ret <32 x i8> %tmp2 +}