Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -24002,6 +24002,7 @@ SDValue Op1 = N.getOperand(1); SDValue Op2 = N.getOperand(2); unsigned InsertPSMask = cast(Op2)->getZExtValue(); + unsigned SrcIdx = (InsertPSMask >> 6) & 0x3; unsigned DstIdx = (InsertPSMask >> 4) & 0x3; unsigned ZeroMask = InsertPSMask & 0xF; @@ -24015,15 +24016,33 @@ return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), DAG.getConstant(InsertPSMask, DL, MVT::i8)); - SmallVector TargetMask; - if (!setTargetShuffleZeroElements(Op0, TargetMask)) + // Attempt to merge insertps Op1 with an inner target shuffle node. + SmallVector TargetMask1; + if (setTargetShuffleZeroElements(Op1, TargetMask1)) { + int M = TargetMask1[SrcIdx]; + if (M < 0) { + // Zero/UNDEF insertion - zero out element and remove dependency. + InsertPSMask |= (1u << DstIdx); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT), + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + // Update insertps mask srcidx and reference the source input directly. + InsertPSMask = (InsertPSMask & 0x3f) | ((M & 0x3) << 6); + Op1 = Op1.getOperand(M < 4 ? 0 : 1); + return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, Op1, + DAG.getConstant(InsertPSMask, DL, MVT::i8)); + } + + // Attempt to merge insertps Op0 with an inner target shuffle node. + SmallVector TargetMask0; + if (!setTargetShuffleZeroElements(Op0, TargetMask0)) return SDValue(); bool Updated = false; bool UseInput00 = false; bool UseInput01 = false; for (int i = 0; i != 4; ++i) { - int M = TargetMask[i]; + int M = TargetMask0[i]; if ((InsertPSMask & (1u << i)) || (i == (int)DstIdx)) { // No change if element is already zero or the inserted element. continue; Index: test/CodeGen/X86/sse41.ll =================================================================== --- test/CodeGen/X86/sse41.ll +++ test/CodeGen/X86/sse41.ll @@ -846,16 +846,12 @@ ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadf32: ; X64: ## BB#0: -; X64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = getelementptr inbounds float, float* %fb, i64 %index %2 = load float, float* %1, align 4 @@ -871,16 +867,12 @@ ; X32-LABEL: insertps_from_broadcast_loadv4f32: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax -; X32-NEXT: movups (%eax), %xmm1 -; X32-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X32-NEXT: retl ; ; X64-LABEL: insertps_from_broadcast_loadv4f32: ; X64: ## BB#0: -; X64-NEXT: movups (%rdi), %xmm1 -; X64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0] -; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0] +; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0] ; X64-NEXT: retq %1 = load <4 x float>, <4 x float>* %b, align 4 %2 = extractelement <4 x float> %1, i32 0 @@ -892,14 +884,12 @@ ret <4 x float> %7 } -;; FIXME: We're emitting an extraneous pshufd/vbroadcast. define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) { ; X32-LABEL: insertps_from_broadcast_multiple_use: ; X32: ## BB#0: ; X32-NEXT: movl {{[0-9]+}}(%esp), %eax ; X32-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X32-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X32-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X32-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0] @@ -912,7 +902,6 @@ ; X64-LABEL: insertps_from_broadcast_multiple_use: ; X64: ## BB#0: ; X64-NEXT: movss {{.*#+}} xmm4 = mem[0],zero,zero,zero -; X64-NEXT: shufps {{.*#+}} xmm4 = xmm4[0,0,0,0] ; X64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0] ; X64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm4[0]