Index: llvm/trunk/lib/Target/X86/X86ISelLowering.cpp =================================================================== --- llvm/trunk/lib/Target/X86/X86ISelLowering.cpp +++ llvm/trunk/lib/Target/X86/X86ISelLowering.cpp @@ -4317,6 +4317,11 @@ return (Val < 0 || Val == CmpVal); } +/// Val is either the undef or zero sentinel value. +static bool isUndefOrZero(int Val) { + return (Val == SM_SentinelUndef || Val == SM_SentinelZero); +} + /// Return true if every element in Mask, beginning /// from position Pos and ending in Pos+Size, falls within the specified /// sequential range (Low, Low+Size]. or is undef. @@ -23989,6 +23994,7 @@ SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue(); + unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; @@ -24002,19 +24008,38 @@ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); - SmallVector<int, 8> TargetMask; - if (!setTargetShuffleZeroElements(Op0, TargetMask)) + // Attempt to merge insertps Op1 with an inner target shuffle node. + SmallVector<int, 8> TargetMask1; + if (setTargetShuffleZeroElements(Op1, TargetMask1)) { + int M = TargetMask1[SrcIdx]; + if (isUndefOrZero(M)) { + // Zero/UNDEF insertion - zero out element and remove dependency. + InsertPSMask |= (1u << DstIdx); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + // Update insertps mask srcidx and reference the source input directly. + assert(0 <= M && M < 8 && "Shuffle index out of range"); + InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); + Op1 = Op1.getOperand(M < 4 ? 0 : 1); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + + // Attempt to merge insertps Op0 with an inner target shuffle node. 
+ SmallVector<int, 8> TargetMask0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { - int M = TargetMask[i]; + int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; - } else if (M < 0) { + } else if (isUndefOrZero(M)) { // If the target mask is undef/zero then we must zero the element. InsertPSMask |= (1u << i); Updated = true; Index: llvm/trunk/test/CodeGen/X86/sse41.ll =================================================================== --- llvm/trunk/test/CodeGen/X86/sse41.ll +++ llvm/trunk/test/CodeGen/X86/sse41.ll @@ -846,16 +846,12 @@ ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadf32: ; X64: ## BB#0: -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 @@ -871,16 +867,12 @@ ; X32-LABEL: insertps_from_broadcast_loadv4f32: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movups (%eax), %xmm1 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadv4f32: ; X64: ## BB#0: -; X64-NEXT: movups (%rdi), %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; 
X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 @@ -892,14 +884,12 @@ ret <4 x float> %7 } -;; FIXME: We're emitting an extraneous pshufd/vbroadcast. define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { ; X32-LABEL: insertps_from_broadcast_multiple_use: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] @@ -912,7 +902,6 @@ ; X64-LABEL: insertps_from_broadcast_multiple_use: ; X64: ## BB#0: ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]