Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp
@@ -5266,8 +5266,7 @@
 /// remaining input indices in case we now have a unary shuffle and adjust the
 /// Op0/Op1 inputs accordingly.
 /// Returns true if the target shuffle mask was decoded.
-static bool resolveTargetShuffleInputs(SDValue Op, bool &IsUnary, SDValue &Op0,
-                                       SDValue &Op1,
+static bool resolveTargetShuffleInputs(SDValue Op, SDValue &Op0, SDValue &Op1,
                                        SmallVectorImpl<int> &Mask) {
   SmallVector<SDValue, 2> Ops;
   if (!setTargetShuffleZeroElements(Op, Mask, Ops))
@@ -5282,10 +5281,6 @@
   Op0 = Op0InUse ? Ops[0] : SDValue();
   Op1 = Op1InUse ? Ops[1] : SDValue();
 
-  IsUnary = !(Op0InUse && Op1InUse);
-
-  if (!IsUnary)
-    return true;
 
   // We're only using Op1 - commute the mask and inputs.
   if (!Op0InUse && Op1InUse) {
@@ -24036,14 +24031,9 @@
          "Can only combine shuffles of the same vector register size.");
 
   // Extract target shuffle mask and resolve sentinels and inputs.
-  bool IsUnary;
   SDValue Input0, Input1;
   SmallVector<int, 64> OpMask;
-  if (!resolveTargetShuffleInputs(Op, IsUnary, Input0, Input1, OpMask))
-    return false;
-
-  // At the moment we can only combine target shuffle unary cases.
-  if (!IsUnary)
+  if (!resolveTargetShuffleInputs(Op, Input0, Input1, OpMask))
     return false;
 
   assert(VT.getVectorNumElements() == OpMask.size() &&
@@ -24103,8 +24093,24 @@
                                 Subtarget, DAG, SDLoc(Root)));
     return true;
   }
+
+  int MaskSize = Mask.size();
+  bool UseInput0 = std::any_of(Mask.begin(), Mask.end(),
+                  [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
+  bool UseInput1 = std::any_of(Mask.begin(), Mask.end(),
+                  [MaskSize](int Idx) { return MaskSize <= Idx; });
+
+  // At the moment we can only combine unary shuffle mask cases.
+  if (UseInput0 && UseInput1)
+    return false;
+  else if (UseInput1) {
+    std::swap(Input0, Input1);
+    ShuffleVectorSDNode::commuteMask(Mask);
+  }
+
+  assert(Input0 && "Shuffle with no inputs detected");
+
+  // TODO - generalize this to support any variable mask shuffle.
 
   HasPSHUFB |= (Op.getOpcode() == X86ISD::PSHUFB);
 
   // See if we can recurse into Input0 (if it's a target shuffle).
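Note on the change above: the new UseInput0/UseInput1 checks simply scan the combined
shuffle mask for lane indices referring to each operand. Indices in [0, MaskSize) read
from Input0, indices in [MaskSize, 2*MaskSize) read from Input1, and negative sentinel
values (undef/zero) read from neither. A minimal standalone sketch of that logic, with
plain ints standing in for SDValue operands, a hand-rolled rebase in place of
ShuffleVectorSDNode::commuteMask, and a hypothetical helper name detectUnaryAndCommute:

    #include <algorithm>
    #include <utility>
    #include <vector>

    // Indices in [0, MaskSize) read from Op0, [MaskSize, 2*MaskSize) read
    // from Op1, and negative values are undef/zero sentinels.
    static bool detectUnaryAndCommute(std::vector<int> &Mask, int &Op0, int &Op1) {
      const int MaskSize = static_cast<int>(Mask.size());
      const bool UseOp0 = std::any_of(Mask.begin(), Mask.end(),
          [MaskSize](int Idx) { return 0 <= Idx && Idx < MaskSize; });
      const bool UseOp1 = std::any_of(Mask.begin(), Mask.end(),
          [MaskSize](int Idx) { return MaskSize <= Idx; });

      // Both operands referenced: a binary shuffle, which the combine
      // above still bails out on.
      if (UseOp0 && UseOp1)
        return false;

      // Only Op1 referenced: swap the operands and rebase every in-range
      // index onto operand 0, mirroring ShuffleVectorSDNode::commuteMask.
      if (UseOp1) {
        std::swap(Op0, Op1);
        for (int &Idx : Mask) {
          if (Idx >= MaskSize)
            Idx -= MaskSize; // Former Op1 lane now reads the swapped Op0.
          else if (Idx >= 0)
            Idx += MaskSize; // Former Op0 lane now reads the swapped Op1.
        }
      }
      return true; // Unary (or no-input) shuffle.
    }

Once this returns true, only Input0 (possibly freshly swapped) needs to be recursed
into, which is why the combine can now also handle decoded masks that happen to touch
only the second operand.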
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-128-v2.ll
@@ -929,18 +929,14 @@
 ;
 ; SSE41-LABEL: shuffle_v2i64_bitcast_z123:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-NEXT:    pxor %xmm1, %xmm1
+; SSE41-NEXT:    pblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; SSE41-NEXT:    retq
 ;
 ; AVX1-LABEL: shuffle_v2i64_bitcast_z123:
 ; AVX1:       # BB#0:
-; AVX1-NEXT:    vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
-; AVX1-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX1-NEXT:    vblendps {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; AVX1-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX1-NEXT:    vpblendw {{.*#+}} xmm0 = xmm1[0,1],xmm0[2,3,4,5,6,7]
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: shuffle_v2i64_bitcast_z123:
Index: llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
===================================================================
--- llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
+++ llvm/trunk/test/CodeGen/X86/vector-shuffle-combining-ssse3.ll
@@ -63,13 +63,11 @@
 define <16 x i8> @combine_unpckl_arg0_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: combine_unpckl_arg0_pshufb:
 ; SSE:       # BB#0:
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_unpckl_arg0_pshufb:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero,xmm0[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
@@ -80,14 +78,13 @@
 define <16 x i8> @combine_unpckl_arg1_pshufb(<16 x i8> %a0, <16 x i8> %a1) {
 ; SSE-LABEL: combine_unpckl_arg1_pshufb:
 ; SSE:       # BB#0:
-; SSE-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
+; SSE-NEXT:    pshufb {{.*#+}} xmm1 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
+; SSE-NEXT:    movdqa %xmm1, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: combine_unpckl_arg1_pshufb:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero,xmm0[1],zero,zero,zero
+; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero,xmm1[0],zero,zero,zero
 ; AVX-NEXT:    retq
   %1 = shufflevector <16 x i8> %a0, <16 x i8> %a1, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23>
   %2 = tail call <16 x i8> @llvm.x86.ssse3.pshuf.b.128(<16 x i8> %1, <16 x i8> <i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1, i8 1, i8 -1, i8 -1, i8 -1>)
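For intuition on the combine_unpckl_arg1_pshufb change: punpcklbw interleaves the low
eight bytes of its two sources, so byte 2*i of the result is a0[i] and byte 2*i+1 is
a1[i]. A pshufb that only reads byte 1 of the interleave is therefore reading a1[0],
and the unpack can be dropped entirely, which is exactly what the updated CHECK lines
show. A quick standalone check of that index arithmetic, with plain byte arrays
standing in for the XMM registers:

    #include <array>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::array<uint8_t, 16> a0{}, a1{};
      for (int i = 0; i != 16; ++i) {
        a0[i] = static_cast<uint8_t>(i);
        a1[i] = static_cast<uint8_t>(100 + i);
      }

      // punpcklbw: interleave the low 8 bytes of a0 and a1.
      std::array<uint8_t, 16> unpck{};
      for (int i = 0; i != 8; ++i) {
        unpck[2 * i] = a0[i];
        unpck[2 * i + 1] = a1[i];
      }

      // The pshufb mask <1,-1,-1,-1,...> reads byte 1 of the interleave,
      // which is a1[0], so the combined code applies pshufb to a1 directly.
      assert(unpck[1] == a1[0]);
      // Likewise byte 0 is a0[0], which is why the arg0 test loses its
      // unpack as well.
      assert(unpck[0] == a0[0]);
      return 0;
    }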