diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43213,35 +43213,41 @@
   SDValue N1 = N->getOperand(1);
   EVT SrcVT = N0.getValueType();
 
+  SDValue BC0 = peekThroughBitcasts(N0);
+  SDValue BC1 = peekThroughBitcasts(N1);
+
   // Attempt to fold HOP(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
   // to SHUFFLE(HOP(LOSUBVECTOR(X),HISUBVECTOR(X))), this is mainly for
   // truncation trees that help us avoid lane crossing shuffles.
   // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
   // TODO: We don't handle vXf64 shuffles yet.
-  if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
-      N0.getConstantOperandAPInt(1) == 0 &&
-      N1.getConstantOperandAPInt(1) == SrcVT.getVectorNumElements() &&
-      N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
-      N0.getOperand(0).getValueType().is256BitVector() &&
-      SrcVT.getScalarSizeInBits() <= 32) {
-    // TODO - support target/faux shuffles.
-    SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
-    if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+  if (VT.is128BitVector() && SrcVT.getScalarSizeInBits() <= 32 &&
+      BC0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      BC1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+      BC0.getOperand(0) == BC1.getOperand(0) &&
+      BC0.getOperand(0).getValueType().is256BitVector() &&
+      BC0.getConstantOperandAPInt(1) == 0 &&
+      BC1.getConstantOperandAPInt(1) ==
+          BC0.getValueType().getVectorNumElements()) {
+    SmallVector<SDValue> ShuffleOps;
+    SmallVector<int> ShuffleMask, ScaledMask;
+    SDValue Vec = peekThroughBitcasts(BC0.getOperand(0));
+    if (getTargetShuffleInputs(Vec, ShuffleOps, ShuffleMask, DAG)) {
+      resolveTargetShuffleInputsAndMask(ShuffleOps, ShuffleMask);
       // To keep the HOP LHS/RHS coherency, we must be able to scale the unary
-      // shuffle to a vXi64 width - we can probably relax this in the future.
-      SmallVector<int, 4> ShuffleMask;
-      if (SVN->getOperand(1).isUndef() &&
-          scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+      // shuffle to a v4X64 width - we can probably relax this in the future.
+      if (!isAnyZero(ShuffleMask) && ShuffleOps.size() == 1 &&
+          ShuffleOps[0].getValueType().is256BitVector() &&
+          scaleShuffleElements(ShuffleMask, 4, ScaledMask)) {
         SDLoc DL(N);
         SDValue Lo, Hi;
         MVT ShufVT = VT.isFloatingPoint() ? MVT::v4f32 : MVT::v4i32;
-        std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
-        Lo = DAG.getBitcast(N0.getValueType(), Lo);
-        Hi = DAG.getBitcast(N1.getValueType(), Hi);
+        std::tie(Lo, Hi) = DAG.SplitVector(ShuffleOps[0], DL);
+        Lo = DAG.getBitcast(SrcVT, Lo);
+        Hi = DAG.getBitcast(SrcVT, Hi);
         SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
         Res = DAG.getBitcast(ShufVT, Res);
-        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ShuffleMask);
+        Res = DAG.getVectorShuffle(ShufVT, DL, Res, Res, ScaledMask);
         return DAG.getBitcast(VT, Res);
       }
     }
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -846,9 +846,9 @@
 ; AVX2-NEXT:    vpand %ymm4, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm4, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -1333,9 +1333,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm4, %ymm0, %ymm5
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -1137,9 +1137,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm5, %ymm7, %ymm5
 ; AVX2-NEXT:    vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
diff --git a/llvm/test/CodeGen/X86/vector-trunc-packus.ll b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-packus.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-packus.ll
@@ -3871,9 +3871,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4274,9 +4274,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm2
 ; AVX2-NEXT:    vpand %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -3639,9 +3639,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -4060,9 +4060,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm2, %ymm0, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackssdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -2812,9 +2812,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
@@ -3073,9 +3073,9 @@
 ; AVX2-NEXT:    vpcmpgtq %ymm3, %ymm5, %ymm3
 ; AVX2-NEXT:    vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rsi)
 ; AVX2-NEXT:    vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll
--- a/llvm/test/CodeGen/X86/vector-trunc.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc.ll
@@ -273,9 +273,9 @@
 ; AVX2-NEXT:    vpand %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vpand %ymm2, %ymm0, %ymm0
 ; AVX2-NEXT:    vpackusdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
 ; AVX2-NEXT:    vextracti128 $1, %ymm0, %xmm1
 ; AVX2-NEXT:    vpackusdw %xmm1, %xmm0, %xmm0
+; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vmovq %xmm0, (%rax)
 ; AVX2-NEXT:    vzeroupper
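
Reviewer note (not part of the patch): the gating comment in the combine, "we must be able to scale the unary shuffle to a v4X64 width", boils down to asking whether a fine-grained shuffle mask can be rewritten over fewer, wider elements. The standalone sketch below illustrates that idea only; widenShuffleMask is a hypothetical helper, not LLVM's scaleShuffleElements, and it is deliberately stricter (each group must be wholly undef or fully contiguous).

// Hypothetical sketch: try to express Mask (NumSrcElts entries) as a mask
// over NumDstElts wider elements. A group of Scale consecutive source lanes
// must either be entirely undef (-1) or move together, starting on a
// wide-element boundary.
#include <cassert>
#include <vector>

static bool widenShuffleMask(const std::vector<int> &Mask, int NumDstElts,
                             std::vector<int> &WideMask) {
  int NumSrcElts = static_cast<int>(Mask.size());
  assert(NumDstElts > 0 && NumSrcElts % NumDstElts == 0 &&
         "Mask must widen evenly");
  int Scale = NumSrcElts / NumDstElts;
  WideMask.assign(NumDstElts, -1);
  for (int i = 0; i != NumDstElts; ++i) {
    bool AllUndef = true;
    for (int j = 0; j != Scale; ++j)
      AllUndef &= Mask[i * Scale + j] < 0;
    if (AllUndef)
      continue; // Whole group undef -> wide element stays undef (-1).
    int First = Mask[i * Scale];
    if (First < 0 || First % Scale != 0)
      return false; // Group must start on a wide-element boundary.
    for (int j = 1; j != Scale; ++j)
      if (Mask[i * Scale + j] != First + j)
        return false; // Lanes within the group must stay contiguous.
    WideMask[i] = First / Scale;
  }
  return true;
}

// Example: the v8i32 view of vpermq ymm[0,2,1,3] is {0,1,4,5,2,3,6,7},
// which widens to {0,2,1,3}; a mask like {1,0,2,3,4,5,6,7} is rejected
// because the first two lanes are swapped within their group.

That widening property is roughly why the AVX2 test deltas above improve: the cross-lane vpermq ymm0[0,2,1,3] feeding the 128-bit pack scales cleanly to a 4-element mask, so the combine can hoist it past the pack and re-apply it as an in-lane vpshufd xmm0[0,2,1,3] instead.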