Index: lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1472,6 +1472,71 @@
       break;
     }

+    case Intrinsic::x86_sse2_packssdw_128:
+    case Intrinsic::x86_sse2_packsswb_128:
+    case Intrinsic::x86_sse2_packuswb_128:
+    case Intrinsic::x86_sse41_packusdw:
+    case Intrinsic::x86_avx2_packssdw:
+    case Intrinsic::x86_avx2_packsswb:
+    case Intrinsic::x86_avx2_packusdw:
+    case Intrinsic::x86_avx2_packuswb: {
+      // TODO Add support for Intrinsic::x86_avx512_mask_pack*
+      auto *Op0 = II->getArgOperand(0);
+      auto *Op1 = II->getArgOperand(1);
+      auto *Ty0 = Op0->getType();
+      unsigned InnerVWidth = Ty0->getVectorNumElements();
+      assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+      APInt Op0DemandedElts(InnerVWidth, 0);
+      APInt Op1DemandedElts(InnerVWidth, 0);
+      unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+      unsigned VWidthPerLane = VWidth / NumLanes;
+      unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+      // Per lane, pack the elements of the first input and then the second.
+      // e.g.
+      // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+      // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+      for (unsigned Lane = 0; Lane != NumLanes; ++Lane)
+        for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+          if (DemandedElts[(Lane * VWidthPerLane) + Elt])
+            Op0DemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+          if (DemandedElts[(Lane * VWidthPerLane) + Elt + InnerVWidthPerLane])
+            Op1DemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+        }
+
+      UndefElts2 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op0, Op0DemandedElts, UndefElts2,
+                                        Depth + 1);
+      if (TmpV) {
+        II->setArgOperand(0, TmpV);
+        MadeChange = true;
+      }
+
+      UndefElts3 = APInt(InnerVWidth, 0);
+      TmpV = SimplifyDemandedVectorElts(Op1, Op1DemandedElts, UndefElts3,
+                                        Depth + 1);
+      if (TmpV) {
+        II->setArgOperand(1, TmpV);
+        MadeChange = true;
+      }
+
+      // Pack UNDEFs from the 2 inputs, one lane at a time.
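+      // Op0's per-lane undefs land in the low half of each result lane
+      // (offset InnerVWidthPerLane * (2 * Lane + 0)); Op1's land in the
+      // high half (offset InnerVWidthPerLane * (2 * Lane + 1)).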
+      UndefElts = APInt(VWidth, 0);
+      UndefElts2 = UndefElts2.zext(VWidth);
+      UndefElts3 = UndefElts3.zext(VWidth);
+      for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+        APInt LaneElts2 = UndefElts2.lshr(InnerVWidthPerLane * Lane);
+        APInt LaneElts3 = UndefElts3.lshr(InnerVWidthPerLane * Lane);
+        LaneElts2 = LaneElts2.getLoBits(InnerVWidthPerLane);
+        LaneElts3 = LaneElts3.getLoBits(InnerVWidthPerLane);
+        LaneElts2 = LaneElts2.shl(InnerVWidthPerLane * (2 * Lane + 0));
+        LaneElts3 = LaneElts3.shl(InnerVWidthPerLane * (2 * Lane + 1));
+        UndefElts |= (LaneElts2 | LaneElts3);
+      }
+      break;
+    }
+
     // PSHUFB
     case Intrinsic::x86_ssse3_pshuf_b_128:
     case Intrinsic::x86_avx2_pshuf_b:
Index: test/Transforms/InstCombine/x86-pack.ll
===================================================================
--- test/Transforms/InstCombine/x86-pack.ll
+++ test/Transforms/InstCombine/x86-pack.ll
@@ -7,11 +7,9 @@

 define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT: ret <8 x i16> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT: ret <8 x i16> [[TMP2]]
 ;
  %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
  %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
@@ -22,10 +20,8 @@

 define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i32> %a0, i32 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <4 x i32> %a1, i32 0, i32 3
-; CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+; CHECK-NEXT: ret <8 x i16> [[TMP1]]
 ;
  %1 = insertelement <4 x i32> %a0, i32 0, i32 0
  %2 = insertelement <4 x i32> %a1, i32 0, i32 3
@@ -36,11 +32,9 @@

 define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <8 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <8 x i16> %a1, i16 0, i32 0
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT: ret <16 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> )
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT: ret <16 x i8> [[TMP2]]
 ;
  %1 = insertelement <8 x i16> %a0, i16 0, i32 0
  %2 = insertelement <8 x i16> %a1, i16 0, i32 0
@@ -51,9 +45,7 @@

 define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_128(
-; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT: ret <16 x i8> [[TMP2]]
+; CHECK-NEXT: ret <16 x i8> undef
 ;
  %1 = insertelement <8 x i16> undef, i16 0, i32 0
  %2 = insertelement <8 x i16> undef, i16 0, i32 0
@@ -64,10 +56,8 @@

 define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT: ret <16 x i16> [[TMP3]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
+; CHECK-NEXT: ret <16 x i16> [[TMP1]]
 ;
  %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
  %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
@@ -79,7 +69,7 @@
 define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_256(
 ; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
 ; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT: ret <16 x i16> [[TMP3]]
 ;
@@ -92,11 +82,9 @@

 define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT: [[TMP1:%.*]] = insertelement <16 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT: [[TMP2:%.*]] = insertelement <16 x i16> %a1, i16 0, i32 8
-; CHECK-NEXT: [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32>
-; CHECK-NEXT: ret <32 x i8> [[TMP4]]
+; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> )
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32>
+; CHECK-NEXT: ret <32 x i8> [[TMP2]]
 ;
  %1 = insertelement <16 x i16> %a0, i16 0, i32 0
  %2 = insertelement <16 x i16> %a1, i16 0, i32 8
@@ -107,9 +95,7 @@

 define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_256(
-; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> )
-; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> zeroinitializer
-; CHECK-NEXT: ret <32 x i8> [[TMP2]]
+; CHECK-NEXT: ret <32 x i8> undef
 ;
  %1 = insertelement <16 x i16> undef, i16 0, i32 1
  %2 = insertelement <16 x i16> undef, i16 0, i32 0