Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -23703,6 +23703,52 @@
   return true;
 }

+/// Check a target shuffle mask's inputs to see if we can set any values to
+/// SM_SentinelZero - this is for elements that are known to be zero
+/// (not just zeroable) from their inputs.
+static bool setTargetShuffleZeroElements(SDValue N,
+                                         SmallVectorImpl<int> &Mask) {
+  bool IsUnary;
+  if (!isTargetShuffle(N.getOpcode()))
+    return false;
+  if (!getTargetShuffleMask(N.getNode(), N.getSimpleValueType(), true, Mask,
+                            IsUnary))
+    return false;
+
+  SDValue V1 = N.getOperand(0);
+  SDValue V2 = IsUnary ? V1 : N.getOperand(1);
+
+  while (V1.getOpcode() == ISD::BITCAST)
+    V1 = V1->getOperand(0);
+  while (V2.getOpcode() == ISD::BITCAST)
+    V2 = V2->getOperand(0);
+
+  for (int i = 0, Size = Mask.size(); i != Size; ++i) {
+    int M = Mask[i];
+
+    // Already decoded as SM_SentinelZero / SM_SentinelUndef.
+    if (M < 0)
+      continue;
+
+    SDValue V = M < Size ? V1 : V2;
+
+    // We are referencing an UNDEF input.
+    if (V.isUndef()) {
+      Mask[i] = SM_SentinelUndef;
+      continue;
+    }
+
+    // TODO - handle the Size != (int)V.getNumOperands() cases in the future.
+    if (V.getOpcode() != ISD::BUILD_VECTOR || Size != (int)V.getNumOperands())
+      continue;
+    if (!X86::isZeroNode(V.getOperand(M % Size)))
+      continue;
+    Mask[i] = SM_SentinelZero;
+  }
+
+  return true;
+}
+
 /// \brief Try to combine x86 target specific shuffles.
 static SDValue PerformTargetShuffleCombine(SDValue N, SelectionDAG &DAG,
                                            TargetLowering::DAGCombinerInfo &DCI,
@@ -23776,6 +23822,96 @@
       return DAG.getNode(X86ISD::BLENDI, DL, VT, V1, V0, NewMask);
     }

+    // Attempt to merge blend(insertps(x,y),zero).
+    if (V0.getOpcode() == X86ISD::INSERTPS ||
+        V1.getOpcode() == X86ISD::INSERTPS) {
+      assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+
+      // Determine which elements are known to be zero.
+      SmallVector<int, 4> TargetMask;
+      if (!setTargetShuffleZeroElements(N, TargetMask))
+        return SDValue();
+
+      // Helper function to take an inner insertps node and attempt to
+      // merge the blend with zero into its zero mask.
+      auto MergeInsertPSAndBlend = [&](SDValue V, int Offset) {
+        if (V.getOpcode() != X86ISD::INSERTPS)
+          return SDValue();
+        SDValue Op0 = V.getOperand(0);
+        SDValue Op1 = V.getOperand(1);
+        SDValue Op2 = V.getOperand(2);
+        unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+
+        // Check each element of the blend node's target mask - it must either
+        // be zeroable (and update the zero mask) or select the element from
+        // the inner insertps node.
+        for (int i = 0; i != 4; ++i)
+          if (TargetMask[i] < 0)
+            InsertPSMask |= (1u << i);
+          else if (TargetMask[i] != (i + Offset))
+            return SDValue();
+        return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+                           DAG.getConstant(InsertPSMask, DL, MVT::i8));
+      };
+
+      if (SDValue V = MergeInsertPSAndBlend(V0, 0))
+        return V;
+      if (SDValue V = MergeInsertPSAndBlend(V1, 4))
+        return V;
+    }
+    return SDValue();
+  }
+  case X86ISD::INSERTPS: {
+    assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
+    SDValue Op0 = N.getOperand(0);
+    SDValue Op1 = N.getOperand(1);
+    SDValue Op2 = N.getOperand(2);
+    unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+    unsigned DstIdx = (InsertPSMask >> 4) & 3;
+
+    // Attempt to merge insertps with an inner target shuffle node.
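+    // The insertps immediate encodes the source lane in bits [7:6], the
+    // destination lane (DstIdx) in bits [5:4] and a zero mask in bits [3:0],
+    // so setting bit i of InsertPSMask below forces result element i to zero.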
+    SmallVector<int, 4> TargetMask;
+    if (!setTargetShuffleZeroElements(Op0, TargetMask))
+      return SDValue();
+
+    bool Updated = false;
+    bool UseInput00 = false;
+    bool UseInput01 = false;
+    for (int i = 0; i != 4; ++i) {
+      int M = TargetMask[i];
+      if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) {
+        // No change if the element is already zero or the inserted element.
+        continue;
+      } else if (M < 0) {
+        // If the target mask is undef/zero then we must zero the element.
+        InsertPSMask |= (1u << i);
+        Updated = true;
+        continue;
+      }
+
+      // The input vector element must be inline.
+      if (M != i && M != (i + 4))
+        return SDValue();
+
+      // Determine which inputs of the target shuffle we're using.
+      UseInput00 |= (0 <= M && M < 4);
+      UseInput01 |= (4 <= M);
+    }
+
+    // If we're not using both inputs of the target shuffle then use the
+    // referenced input directly.
+    if (UseInput00 && !UseInput01) {
+      Updated = true;
+      Op0 = Op0.getOperand(0);
+    } else if (!UseInput00 && UseInput01) {
+      Updated = true;
+      Op0 = Op0.getOperand(1);
+    }
+
+    if (Updated)
+      return DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32, Op0, Op1,
+                         DAG.getConstant(InsertPSMask, DL, MVT::i8));
+    return SDValue();
   }
   default:
@@ -28133,6 +28269,7 @@
   case X86ISD::BRCOND:      return PerformBrCondCombine(N, DAG, DCI, Subtarget);
   case X86ISD::VZEXT:       return performVZEXTCombine(N, DAG, DCI, Subtarget);
   case X86ISD::SHUFP:       // Handle all target specific shuffles
+  case X86ISD::INSERTPS:
   case X86ISD::PALIGNR:
   case X86ISD::BLENDI:
   case X86ISD::UNPCKH:
Index: test/CodeGen/X86/insertps-combine.ll
===================================================================
--- test/CodeGen/X86/insertps-combine.ll
+++ test/CodeGen/X86/insertps-combine.ll
@@ -6,16 +6,12 @@
 define <4 x float> @shuffle_v4f32_0z27(<4 x float> %x, <4 x float> %a) {
 ; SSE-LABEL: shuffle_v4f32_0z27:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z27:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[2]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[2]
 ; AVX-NEXT:    retq
   %vecext = extractelement <4 x float> %x, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
@@ -50,16 +46,12 @@
 define <4 x float> @shuffle_v4f32_0z24(<4 x float> %xyzw, <4 x float> %abcd) {
 ; SSE-LABEL: shuffle_v4f32_0z24:
 ; SSE:       # BB#0:
-; SSE-NEXT:    xorps %xmm2, %xmm2
-; SSE-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; SSE-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
 ; SSE-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z24:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm2[1],xmm0[2,3]
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
 ; AVX-NEXT:    retq
   %vecext = extractelement <4 x float> %xyzw, i32 0
   %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
Index: test/CodeGen/X86/merge-consecutive-loads-128.ll
===================================================================
--- test/CodeGen/X86/merge-consecutive-loads-128.ll
+++ test/CodeGen/X86/merge-consecutive-loads-128.ll
@@ -159,15 +159,13 @@
 ;
 ; SSE41-LABEL: merge_4f32_f32_012u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_012u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
@@ -195,15 +193,13 @@
 ;
 ; SSE41-LABEL: merge_4f32_f32_019u:
 ; SSE41:       # BB#0:
-; SSE41-NEXT:    movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; SSE41-NEXT:    movq {{.*#+}} xmm0 = mem[0],zero
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: merge_4f32_f32_019u:
 ; AVX:       # BB#0:
-; AVX-NEXT:    vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
+; AVX-NEXT:    vmovq {{.*#+}} xmm0 = mem[0],zero
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0,1],mem[0],xmm0[3]
 ; AVX-NEXT:    retq
   %ptr0 = getelementptr inbounds float, float* %ptr, i64 0
Index: test/CodeGen/X86/vector-shuffle-128-v4.ll
===================================================================
--- test/CodeGen/X86/vector-shuffle-128-v4.ll
+++ test/CodeGen/X86/vector-shuffle-128-v4.ll
@@ -1080,15 +1080,11 @@
 ; SSE41-LABEL: shuffle_v4f32_0zz6:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0zz6:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,zero,xmm1[2]
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2],xmm0[3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 undef, i32 6>
   %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 2, i32 7>
@@ -1129,15 +1125,11 @@
 ; SSE41-LABEL: shuffle_v4f32_0z24:
 ; SSE41:       # BB#0:
 ; SSE41-NEXT:    insertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; SSE41-NEXT:    xorps %xmm1, %xmm1
-; SSE41-NEXT:    blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; SSE41-NEXT:    retq
 ;
 ; AVX-LABEL: shuffle_v4f32_0z24:
 ; AVX:       # BB#0:
 ; AVX-NEXT:    vinsertps {{.*#+}} xmm0 = xmm0[0],zero,xmm0[2],xmm1[0]
-; AVX-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX-NEXT:    vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2,3]
 ; AVX-NEXT:    retq
   %shuffle = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 undef, i32 2, i32 4>
   %shuffle1 = shufflevector <4 x float> zeroinitializer, <4 x float> %shuffle, <4 x i32> <i32 4, i32 1, i32 6, i32 7>