diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -40754,7 +40754,7 @@
     for (unsigned OpIdx : seq(0U, 2U))
       OpsOfShuf[OpIdx] = peekThroughOneUseBitcasts(N.getOperand(OpIdx));
     unsigned SrcOpcode = OpsOfShuf[0].getOpcode();
-    if (TLI.isBinOp(SrcOpcode) && OpsOfShuf[1].getOpcode() == SrcOpcode &&
+    if (isBinOp(SrcOpcode, TLI) && OpsOfShuf[1].getOpcode() == SrcOpcode &&
         IsSafeToMoveShuffle(OpsOfShuf[0], SrcOpcode) &&
         IsSafeToMoveShuffle(OpsOfShuf[1], SrcOpcode)) {
       assert(OpsOfShuf[0].getNumOperands() == 2 &&
@@ -40768,14 +40768,20 @@
       }
       // Ensure the total number of shuffles doesn't increase by folding this
       // shuffle through to the source ops.
-      if (any_of(enumerate(NthOpsOfShufOps),
-                 [&](auto I) {
-                   return AreAllOperandMergeableWithShuffle(
-                       SrcOpcode, I.index(), I.value());
-                 }) ||
+      if ((any_of(enumerate(NthOpsOfShufOps),
+                  [&](auto I) {
+                    return AreAllOperandMergeableWithShuffle(
+                        SrcOpcode, I.index(), I.value());
+                  }) ||
+           all_of(enumerate(NthOpsOfShufOps),
+                  [&](auto I) {
+                    return AreAnyOperandMergeableWithShuffle(
+                        SrcOpcode, I.index(), I.value());
+                  })) &&
           all_of(enumerate(NthOpsOfShufOps), [&](auto I) {
-            return AreAnyOperandMergeableWithShuffle(SrcOpcode, I.index(),
-                                                     I.value());
+            return !shouldIgnoreSpecificOperandForOpcode(SrcOpcode,
+                                                         I.index()) ||
+                   all_equal(I.value());
           })) {
         EVT OpVT = OpsOfShuf[0].getValueType();
         std::array<SDValue, 2> NewOps;
@@ -40783,6 +40789,12 @@
           MutableArrayRef<SDValue> NthOpsOfOpsOfShufOp =
               NthOpsOfShufOps[OpIdx];
           SDValue &NewOp = NewOps[OpIdx];
+          if (shouldIgnoreSpecificOperandForOpcode(SrcOpcode, OpIdx)) {
+            assert(all_equal(NthOpsOfOpsOfShufOp) &&
+                   "Expected to see identical values of ignored operand.");
+            NewOp = NthOpsOfOpsOfShufOp[0];
+            continue;
+          }
           for (SDValue &V : NthOpsOfOpsOfShufOp)
             V = DAG.getBitcast(ShuffleVT, V);
           if (N.getNumOperands() == 3)
diff --git a/llvm/test/CodeGen/X86/blend-of-shift.ll b/llvm/test/CodeGen/X86/blend-of-shift.ll
--- a/llvm/test/CodeGen/X86/blend-of-shift.ll
+++ b/llvm/test/CodeGen/X86/blend-of-shift.ll
@@ -9,17 +9,15 @@
 define <4 x i32> @shuffle_i32_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_shl_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllw $15, %xmm0
-; SSE2-NEXT:    psllw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psllw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_shl_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsllw $15, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpsllw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> %y, i32 15)
@@ -31,17 +29,15 @@
 define <4 x i32> @shuffle_i32_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_lshr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlw $15, %xmm0
-; SSE2-NEXT:    psrlw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_lshr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrlw $15, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpsrlw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> %y, i32 15)
@@ -53,17 +49,15 @@
 define <4 x i32> @shuffle_i32_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_ashr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psraw $15, %xmm0
-; SSE2-NEXT:    psraw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_ashr_i16:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
-; AVX2-NEXT:    vpsraw $15, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpsraw $15, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %x, i32 15)
   %i2 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %y, i32 15)
@@ -76,17 +70,15 @@
 define <4 x i32> @shuffle_i32_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_shl_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm0
-; SSE2-NEXT:    pslld $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_shl_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpslld $31, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> %y, i32 31)
@@ -96,17 +88,15 @@
 define <4 x i32> @shuffle_i32_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_lshr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrld $31, %xmm0
-; SSE2-NEXT:    psrld $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_lshr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrld $31, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpsrld $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> %y, i32 31)
@@ -116,17 +106,15 @@
 define <4 x i32> @shuffle_i32_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i32_of_ashr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[3,2],xmm0[1,0]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i32_of_ashr_i32:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrad $31, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm1[3,2],xmm0[1,0]
+; AVX2-NEXT:    vpsrad $31, %xmm0, %xmm0
 ; AVX2-NEXT:    ret{{[l|q]}}
   %i1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 31)
   %i2 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %y, i32 31)
@@ -255,10 +243,9 @@
 define <2 x i64> @shuffle_i64_of_shl_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_shl_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllw $15, %xmm0
-; SSE2-NEXT:    psllw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psllw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_shl_i16:
@@ -277,10 +264,9 @@
 define <2 x i64> @shuffle_i64_of_lshr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_lshr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlw $15, %xmm0
-; SSE2-NEXT:    psrlw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrlw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_lshr_i16:
@@ -299,10 +285,9 @@
 define <2 x i64> @shuffle_i64_of_ashr_i16(<8 x i16> %x, <8 x i16> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_ashr_i16:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psraw $15, %xmm0
-; SSE2-NEXT:    psraw $15, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psraw $15, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_ashr_i16:
@@ -322,10 +307,9 @@
 define <2 x i64> @shuffle_i64_of_shl_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_shl_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    pslld $31, %xmm0
-; SSE2-NEXT:    pslld $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    pslld $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_shl_i32:
@@ -344,10 +328,9 @@
 define <2 x i64> @shuffle_i64_of_lshr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_lshr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrld $31, %xmm0
-; SSE2-NEXT:    psrld $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrld $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_lshr_i32:
@@ -366,10 +349,9 @@
 define <2 x i64> @shuffle_i64_of_ashr_i32(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_ashr_i32:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrad $31, %xmm0
-; SSE2-NEXT:    psrad $31, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrad $31, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_ashr_i32:
@@ -389,10 +371,9 @@
 define <2 x i64> @shuffle_i64_of_shl_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_shl_i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psllq $63, %xmm0
-; SSE2-NEXT:    psllq $63, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psllq $63, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_shl_i64:
@@ -411,10 +392,9 @@
 define <2 x i64> @shuffle_i64_of_lshr_i64(<2 x i64> %x, <2 x i64> %y) nounwind {
 ; SSE2-LABEL: shuffle_i64_of_lshr_i64:
 ; SSE2:       # %bb.0:
-; SSE2-NEXT:    psrlq $63, %xmm0
-; SSE2-NEXT:    psrlq $63, %xmm1
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[2,3],xmm0[0,1]
-; SSE2-NEXT:    movaps %xmm1, %xmm0
+; SSE2-NEXT:    psrlq $63, %xmm1
+; SSE2-NEXT:    movdqa %xmm1, %xmm0
 ; SSE2-NEXT:    ret{{[l|q]}}
 ;
 ; AVX2-LABEL: shuffle_i64_of_lshr_i64:
diff --git a/llvm/test/CodeGen/X86/concat-cast.ll b/llvm/test/CodeGen/X86/concat-cast.ll
--- a/llvm/test/CodeGen/X86/concat-cast.ll
+++ b/llvm/test/CodeGen/X86/concat-cast.ll
@@ -430,25 +430,22 @@
 define <4 x float> @PR45794(<2 x i64> %x, <2 x i64> %y) {
 ; SSE-LABEL: PR45794:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    psrad $16, %xmm0
-; SSE-NEXT:    psrad $16, %xmm1
 ; SSE-NEXT:    shufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; SSE-NEXT:    psrad $16, %xmm0
 ; SSE-NEXT:    cvtdq2ps %xmm0, %xmm0
 ; SSE-NEXT:    retq
 ;
 ; AVX1-LABEL: PR45794:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
-; AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
 ; AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX1-NEXT:    retq
 ;
 ; AVX2-LABEL: PR45794:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
-; AVX2-NEXT:    vpsrad $16, %xmm1, %xmm1
 ; AVX2-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; AVX2-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; AVX2-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; AVX2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/known-signbits-vector.ll b/llvm/test/CodeGen/X86/known-signbits-vector.ll
--- a/llvm/test/CodeGen/X86/known-signbits-vector.ll
+++ b/llvm/test/CodeGen/X86/known-signbits-vector.ll
@@ -114,9 +114,8 @@
 ; X86-LABEL: signbits_ashr_sitofp_1:
 ; X86:       # %bb.0:
 ; X86-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT:    vpsrad $16, %xmm1, %xmm1
-; X86-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X86-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; X86-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X86-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X86-NEXT:    vzeroupper
 ; X86-NEXT:    retl
@@ -124,9 +123,8 @@
 ; X64-AVX1-LABEL: signbits_ashr_sitofp_1:
 ; X64-AVX1:       # %bb.0:
 ; X64-AVX1-NEXT:    vextractf128 $1, %ymm0, %xmm1
-; X64-AVX1-NEXT:    vpsrad $16, %xmm1, %xmm1
-; X64-AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm1[1,3]
+; X64-AVX1-NEXT:    vpsrad $16, %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vcvtdq2ps %xmm0, %xmm0
 ; X64-AVX1-NEXT:    vzeroupper
 ; X64-AVX1-NEXT:    retq