Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -1472,6 +1472,60 @@
       break;
     }
 
+    case Intrinsic::x86_sse2_packssdw_128:
+    case Intrinsic::x86_sse2_packsswb_128:
+    case Intrinsic::x86_sse2_packuswb_128:
+    case Intrinsic::x86_sse41_packusdw:
+    case Intrinsic::x86_avx2_packssdw:
+    case Intrinsic::x86_avx2_packsswb:
+    case Intrinsic::x86_avx2_packusdw:
+    case Intrinsic::x86_avx2_packuswb: {
+      // TODO Add support for Intrinsic::x86_avx512_mask_pack*
+      auto *Ty0 = II->getArgOperand(0)->getType();
+      unsigned InnerVWidth = Ty0->getVectorNumElements();
+      assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
+
+      unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
+      unsigned VWidthPerLane = VWidth / NumLanes;
+      unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
+
+      // Per lane, pack the elements of the first input and then the second.
+      // e.g.
+      // v8i16 PACK(v4i32 X, v4i32 Y) - (X[0..3],Y[0..3])
+      // v32i8 PACK(v16i16 X, v16i16 Y) - (X[0..7],Y[0..7]),(X[8..15],Y[8..15])
+      for (int OpNum = 0; OpNum != 2; ++OpNum) {
+        APInt OpDemandedElts(InnerVWidth, 0);
+        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          unsigned LaneIdx = Lane * VWidthPerLane;
+          for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
+            unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
+            if (DemandedElts[Idx])
+              OpDemandedElts.setBit((Lane * InnerVWidthPerLane) + Elt);
+          }
+        }
+
+        // Demand elements from the operand.
+        auto *Op = II->getArgOperand(OpNum);
+        APInt OpUndefElts(InnerVWidth, 0);
+        TmpV = SimplifyDemandedVectorElts(Op, OpDemandedElts, OpUndefElts,
+                                          Depth + 1);
+        if (TmpV) {
+          II->setArgOperand(OpNum, TmpV);
+          MadeChange = true;
+        }
+
+        // Pack the operand's UNDEF elements, one lane at a time.
+        OpUndefElts = OpUndefElts.zext(VWidth);
+        for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+          APInt LaneElts = OpUndefElts.lshr(InnerVWidthPerLane * Lane);
+          LaneElts = LaneElts.getLoBits(InnerVWidthPerLane);
+          LaneElts = LaneElts.shl(InnerVWidthPerLane * (2 * Lane + OpNum));
+          UndefElts |= LaneElts;
+        }
+      }
+      break;
+    }
+
     // PSHUFB
     case Intrinsic::x86_ssse3_pshuf_b_128:
     case Intrinsic::x86_avx2_pshuf_b:
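For reference, the demanded-elements gather above can be modeled outside of LLVM. The sketch below is a minimal standalone version of the same index arithmetic, assuming plain uint64_t bitmasks in place of APInt (computeOpDemandedElts is an illustrative name, not part of the patch). It shows why demanding only result elements that come from the first operand, as in the elts_packssdw tests below, leaves the second operand entirely undemanded.

// Standalone model of the OpDemandedElts gather in the patch above.
// Assumes uint64_t masks instead of llvm::APInt; illustrative only.
#include <cassert>
#include <cstdint>

// VWidth = result element count, NumLanes = 128-bit lanes (1 for SSE,
// 2 for AVX2), OpNum = which pack operand (0 or 1). Returns the operand
// elements that must be demanded to cover the demanded result elements.
uint64_t computeOpDemandedElts(uint64_t DemandedElts, unsigned VWidth,
                               unsigned NumLanes, unsigned OpNum) {
  unsigned InnerVWidth = VWidth / 2;
  unsigned VWidthPerLane = VWidth / NumLanes;
  unsigned InnerVWidthPerLane = InnerVWidth / NumLanes;
  uint64_t OpDemandedElts = 0;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    unsigned LaneIdx = Lane * VWidthPerLane;
    for (unsigned Elt = 0; Elt != InnerVWidthPerLane; ++Elt) {
      // Result element Idx is fed by operand OpNum's element
      // (Lane * InnerVWidthPerLane + Elt).
      unsigned Idx = LaneIdx + Elt + InnerVWidthPerLane * OpNum;
      if (DemandedElts & (1ULL << Idx))
        OpDemandedElts |= 1ULL << (Lane * InnerVWidthPerLane + Elt);
    }
  }
  return OpDemandedElts;
}

int main() {
  // v8i16 PACK(v4i32 X, v4i32 Y) = (X[0..3],Y[0..3]):
  // demanding result elements 0..3 demands all of X and none of Y.
  assert(computeOpDemandedElts(0x0F, 8, 1, 0) == 0x0F);
  assert(computeOpDemandedElts(0x0F, 8, 1, 1) == 0x00);
  // v16i16 AVX2 PACK(v8i32 X, v8i32 Y) = (X[0..3],Y[0..3],X[4..7],Y[4..7]):
  // demanding result elements 8..11 (first half of lane 1) demands X[4..7].
  assert(computeOpDemandedElts(0xF00, 16, 2, 0) == 0xF0);
  assert(computeOpDemandedElts(0xF00, 16, 2, 1) == 0x00);
  return 0;
}

The asserts encode the v8i16 and v16i16 cases from the comment in the patch; any C++11 compiler can build and run the file as-is.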
Index: llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
+++ llvm/trunk/test/Transforms/InstCombine/x86-pack.ll
@@ -7,11 +7,9 @@
 
 define <8 x i16> @elts_packssdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <8 x i16> [[TMP3]], <8 x i16> undef, <8 x i32>
-; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %a0, <4 x i32> undef)
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> undef, <8 x i32>
+; CHECK-NEXT:    ret <8 x i16> [[TMP2]]
 ;
   %1 = shufflevector <4 x i32> %a0, <4 x i32> undef, <4 x i32>
   %2 = shufflevector <4 x i32> %a1, <4 x i32> undef, <4 x i32>
@@ -22,10 +20,8 @@
 
 define <8 x i16> @elts_packusdw_128(<4 x i32> %a0, <4 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> %a0, i32 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> %a1, i32 0, i32 3
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> %a0, <4 x i32> %a1)
+; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
 ;
   %1 = insertelement <4 x i32> %a0, i32 0, i32 0
   %2 = insertelement <4 x i32> %a1, i32 0, i32 3
@@ -36,11 +32,9 @@
 
 define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <8 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> %a1, i16 0, i32 0
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <16 x i8> [[TMP3]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> )
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
+; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
 ;
   %1 = insertelement <8 x i16> %a0, i16 0, i32 0
   %2 = insertelement <8 x i16> %a1, i16 0, i32 0
@@ -51,9 +45,7 @@
 
 define <16 x i8> @elts_packuswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <16 x i8> undef
 ;
   %1 = insertelement <8 x i16> undef, i16 0, i32 0
   %2 = insertelement <8 x i16> undef, i16 0, i32 0
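The tests above also exercise the reverse direction: once an operand's elements are known undef, the second loop in the patch scatters those bits into the result positions they pack into, one 128-bit lane at a time, which is what lets elts_packuswb_128 fold all the way to ret <16 x i8> undef. Below is a matching standalone sketch under the same uint64_t-for-APInt assumption (packOpUndefElts is an illustrative name, not part of the patch).

// Standalone model of the OpUndefElts scatter loop in the patch: map an
// operand's known-undef elements to the result positions they pack into.
// Same uint64_t-for-APInt simplification as the sketch above.
#include <cassert>
#include <cstdint>

uint64_t packOpUndefElts(uint64_t OpUndefElts, unsigned VWidth,
                         unsigned NumLanes, unsigned OpNum) {
  unsigned InnerVWidthPerLane = (VWidth / 2) / NumLanes;
  uint64_t UndefElts = 0;
  for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
    // Take this lane's slice of the operand mask...
    uint64_t LaneElts = OpUndefElts >> (InnerVWidthPerLane * Lane);
    LaneElts &= (1ULL << InnerVWidthPerLane) - 1; // APInt::getLoBits
    // ...and shift it to the half-lane of the result it packs into.
    LaneElts <<= InnerVWidthPerLane * (2 * Lane + OpNum);
    UndefElts |= LaneElts;
  }
  return UndefElts;
}

int main() {
  // v8i16 PACK(v4i32 X, v4i32 Y): if all of Y is undef, result
  // elements 4..7 are undef.
  assert(packOpUndefElts(0x0F, 8, 1, 1) == 0xF0);
  // v16i16 AVX2 PACK(v8i32 X, v8i32 Y): Y[4..7] undef lands in result
  // elements 12..15 (second half of lane 1).
  assert(packOpUndefElts(0xF0, 16, 2, 1) == 0xF000);
  return 0;
}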
@@ -64,10 +56,8 @@
 
 define <16 x i16> @elts_packssdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> %a0, <8 x i32> undef)
+; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
 ;
   %1 = shufflevector <8 x i32> %a0, <8 x i32> undef, <8 x i32>
   %2 = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
@@ -79,7 +69,7 @@
 define <16 x i16> @elts_packusdw_256(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: @elts_packusdw_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = shufflevector <8 x i32> %a1, <8 x i32> undef, <8 x i32>
-; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> %a0, <8 x i32> [[TMP1]])
+; CHECK-NEXT:    [[TMP2:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> [[TMP1]])
 ; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> undef, <16 x i32>
 ; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
 ;
@@ -92,11 +82,9 @@
 
 define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i16> %a0, i16 0, i32 0
-; CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i16> %a1, i16 0, i32 8
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; CHECK-NEXT:    [[TMP4:%.*]] = shufflevector <32 x i8> [[TMP3]], <32 x i8> undef, <32 x i32>
-; CHECK-NEXT:    ret <32 x i8> [[TMP4]]
+; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> )
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32>
+; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
 ;
   %1 = insertelement <16 x i16> %a0, i16 0, i32 0
   %2 = insertelement <16 x i16> %a1, i16 0, i32 8
@@ -107,9 +95,7 @@
 
 define <32 x i8> @elts_packuswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packuswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> , <16 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> zeroinitializer
-; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <32 x i8> undef
 ;
   %1 = insertelement <16 x i16> undef, i16 0, i32 1
   %2 = insertelement <16 x i16> undef, i16 0, i32 0
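The 256-bit tests above depend on the two directions composing per 128-bit lane. A hypothetical driver, assuming the computeOpDemandedElts and packOpUndefElts sketches from earlier are in scope (again a model, not the LLVM implementation), walks the v32i8 PACKSSWB case end to end:

#include <cassert>
#include <cstdint>

// Forward declarations of the sketch helpers defined above.
uint64_t computeOpDemandedElts(uint64_t, unsigned, unsigned, unsigned);
uint64_t packOpUndefElts(uint64_t, unsigned, unsigned, unsigned);

// Hypothetical driver for v32i8 PACKSSWB(v16i16 X, v16i16 Y).
int main() {
  const unsigned VWidth = 32, NumLanes = 2;
  // Demand only the low 16 result bytes, i.e. all of lane 0.
  uint64_t Demanded = 0xFFFF;
  // Lane 0 of the result is (X[0..7],Y[0..7]), so each operand only
  // needs its low 8 elements -- elements 8..15 of both inputs are dead.
  assert(computeOpDemandedElts(Demanded, VWidth, NumLanes, 0) == 0xFF);
  assert(computeOpDemandedElts(Demanded, VWidth, NumLanes, 1) == 0xFF);
  // If Y[0..7] are then found to be undef, result bytes 8..15 (the
  // second half of lane 0) become undef in turn.
  assert(packOpUndefElts(0xFF, VWidth, NumLanes, 1) == 0xFF00);
  return 0;
}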