Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -510,6 +510,92 @@
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
+/// Try to fold a call to an x86 PACKSS/PACKUS intrinsic.
+///
+/// Returns undef when both operands are undef, or a constant vector when both
+/// operands are constants whose elements are all ConstantInt or undef;
+/// otherwise returns nullptr. \p IsSigned selects signed (PACKSS) rather than
+/// unsigned (PACKUS) saturation. \p IC and \p Builder are currently unused.
+static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC,
+                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
+  Value *Arg0 = II.getArgOperand(0);
+  Value *Arg1 = II.getArgOperand(1);
+  Type *ResTy = II.getType();
+
+  // Fast all undef handling.
+  if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
+    return UndefValue::get(ResTy);
+
+  Type *ArgTy = Arg0->getType();
+  unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
+  unsigned NumDstElts = ResTy->getVectorNumElements();
+  unsigned NumSrcElts = ArgTy->getVectorNumElements();
+  assert(NumDstElts == (2 * NumSrcElts) && "Unexpected packing types");
+
+  unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
+  unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
+  unsigned DstScalarSizeInBits = ResTy->getScalarSizeInBits();
+  unsigned SrcScalarSizeInBits = ArgTy->getScalarSizeInBits();
+  assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
+         "Unexpected packing types");
+
+  // Constant folding.
+  auto *Cst0 = dyn_cast<Constant>(Arg0);
+  auto *Cst1 = dyn_cast<Constant>(Arg1);
+  if (Cst0 && Cst1) {
+    SmallVector<Constant *, 32> Vals;
+    for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
+      for (unsigned Elt = 0; Elt != NumDstEltsPerLane; ++Elt) {
+        // Within each 128-bit lane, the low half of the result is read from
+        // Arg0 and the high half from Arg1, both taken from that same lane.
+        unsigned SrcIdx = Lane * NumSrcEltsPerLane + Elt % NumSrcEltsPerLane;
+        auto *Cst = (Elt >= NumSrcEltsPerLane) ? Cst1 : Cst0;
+        auto *COp = Cst->getAggregateElement(SrcIdx);
+        if (COp && isa<UndefValue>(COp)) {
+          Vals.push_back(UndefValue::get(ResTy->getScalarType()));
+          continue;
+        }
+
+        auto *CInt = dyn_cast_or_null<ConstantInt>(COp);
+        if (!CInt)
+          return nullptr;
+
+        APInt Val = CInt->getValue();
+        assert(Val.getBitWidth() == SrcScalarSizeInBits &&
+               "Unexpected constant bitwidth");
+
+        if (IsSigned) {
+          // PACKSS: Truncate signed value with signed saturation.
+          // Source values less than dst minint are saturated to minint.
+          // Source values greater than dst maxint are saturated to maxint.
+          if (Val.isSignedIntN(DstScalarSizeInBits))
+            Val = Val.trunc(DstScalarSizeInBits);
+          else if (Val.isNegative())
+            Val = APInt::getSignedMinValue(DstScalarSizeInBits);
+          else
+            Val = APInt::getSignedMaxValue(DstScalarSizeInBits);
+        } else {
+          // PACKUS: Truncate signed value with unsigned saturation.
+          // Source values less than zero are saturated to zero.
+          // Source values greater than dst maxuint are saturated to maxuint.
+          if (Val.isIntN(DstScalarSizeInBits))
+            Val = Val.trunc(DstScalarSizeInBits);
+          else if (Val.isNegative())
+            Val = APInt::getNullValue(DstScalarSizeInBits);
+          else
+            Val = APInt::getAllOnesValue(DstScalarSizeInBits);
+        }
+
+        Vals.push_back(ConstantInt::get(ResTy->getScalarType(), Val));
+      }
+    }
+
+    return ConstantVector::get(Vals);
+  }
+
+  return nullptr;
+}
+
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
@@ -2153,6 +2239,24 @@
     break;
   }
 
+  case Intrinsic::x86_sse2_packssdw_128:
+  case Intrinsic::x86_sse2_packsswb_128:
+  case Intrinsic::x86_avx2_packssdw:
+  case Intrinsic::x86_avx2_packsswb:
+    // TODO Add support for Intrinsic::x86_avx512_mask_packss*
+    if (Value *V = simplifyX86pack(*II, *this, *Builder, true))
+      return replaceInstUsesWith(*II, V);
+    break;
+
+  case Intrinsic::x86_sse2_packuswb_128:
+  case Intrinsic::x86_sse41_packusdw:
+  case Intrinsic::x86_avx2_packusdw:
+  case Intrinsic::x86_avx2_packuswb:
+    // TODO Add support for Intrinsic::x86_avx512_mask_packus*
+    if (Value *V = simplifyX86pack(*II, *this, *Builder, false))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);
Index: test/Transforms/InstCombine/x86-pack.ll
===================================================================
--- test/Transforms/InstCombine/x86-pack.ll
+++ test/Transforms/InstCombine/x86-pack.ll
@@ -7,8 +7,7 @@
 
 define <8 x i16> @undef_packssdw_128() {
 ; CHECK-LABEL: @undef_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i16> undef
 ;
   %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
   ret <8 x i16> %1
@@ -16,8 +15,7 @@
 
 define <8 x i16> @undef_packusdw_128() {
 ; CHECK-LABEL: @undef_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i16> undef
 ;
   %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef)
   ret <8 x i16> %1
@@ -25,8 +23,7 @@
 
 define <16 x i8> @undef_packsswb_128() {
 ; CHECK-LABEL: @undef_packsswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
-; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i8> undef
 ;
   %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef)
   ret <16 x i8> %1
@@ -34,8 +31,7 @@
 
 define <16 x i8> @undef_packuswb_128() {
 ; CHECK-LABEL: @undef_packuswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
-; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i8> undef
 ;
   %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef)
   ret <16 x i8> %1
@@ -43,8 +39,7 @@
 
 define <16 x i16> @undef_packssdw_256() {
 ; CHECK-LABEL: @undef_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i16> undef
 ;
   %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef)
   ret <16 x i16> %1
@@ -52,8 +47,7 @@
 
 define <16 x i16> @undef_packusdw_256() {
 ; CHECK-LABEL: @undef_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i16> undef
 ;
   %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef)
   ret <16 x i16> %1
@@ -61,8 +55,7 @@
 
 define <32 x i8> @undef_packsswb_256() {
 ; CHECK-LABEL: @undef_packsswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
-; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <32 x i8> undef
 ;
   %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef)
   ret <32 x i8> %1
@@ -70,8 +63,7 @@
 
 define <32 x i8> @undef_packuswb_256() {
 ; CHECK-LABEL: @undef_packuswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
-; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <32 x i8> undef
 ;
   %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef)
   ret <32 x i8> %1
@@ -83,8 +75,7 @@
 
 define <8 x i16> @fold_packssdw_128() {
 ; CHECK-LABEL: @fold_packssdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> , <4 x i32> zeroinitializer)
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i16>
 ;
   %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> , <4 x i32> zeroinitializer)
   ret <8 x i16> %1
@@ -92,8 +83,7 @@
 
 define <8 x i16> @fold_packusdw_128() {
 ; CHECK-LABEL: @fold_packusdw_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> )
-; CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <8 x i16>
 ;
   %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> )
   ret <8 x i16> %1
@@ -101,8 +91,7 @@
 
 define <16 x i8> @fold_packsswb_128() {
 ; CHECK-LABEL: @fold_packsswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
-; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i8>
 ;
   %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef)
   ret <16 x i8> %1
@@ -110,8 +99,7 @@
 
 define <16 x i8> @fold_packuswb_128() {
 ; CHECK-LABEL: @fold_packuswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
-; CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i8>
 ;
   %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> , <8 x i16> )
   ret <16 x i8> %1
@@ -119,8 +107,7 @@
 
 define <16 x i16> @fold_packssdw_256() {
 ; CHECK-LABEL: @fold_packssdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> , <8 x i32> undef)
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i16>
 ;
   %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> , <8 x i32> undef)
   ret <16 x i16> %1
@@ -128,8 +115,7 @@
 
 define <16 x i16> @fold_packusdw_256() {
 ; CHECK-LABEL: @fold_packusdw_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> , <8 x i32> )
-; CHECK-NEXT:    ret <16 x i16> [[TMP1]]
+; CHECK-NEXT:    ret <16 x i16>
 ;
   %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> , <8 x i32> )
   ret <16 x i16> %1
@@ -137,8 +123,7 @@
 
 define <32 x i8> @fold_packsswb_256() {
 ; CHECK-LABEL: @fold_packsswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
-; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <32 x i8>
 ;
   %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer)
   ret <32 x i8> %1
@@ -146,8 +131,7 @@
 
 define <32 x i8> @fold_packuswb_256() {
 ; CHECK-LABEL: @fold_packuswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> )
-; CHECK-NEXT:    ret <32 x i8> [[TMP1]]
+; CHECK-NEXT:    ret <32 x i8>
 ;
   %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> )
   ret <32 x i8> %1
@@ -184,9 +168,7 @@
 
 define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_128(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> , <8 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32>
-; CHECK-NEXT:    ret <16 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <16 x i8> zeroinitializer
 ;
   %1 = insertelement <8 x i16> %a0, i16 0, i32 0
   %2 = insertelement <8 x i16> %a1, i16 0, i32 0
@@ -234,9 +216,7 @@
 
 define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) {
 ; CHECK-LABEL: @elts_packsswb_256(
-; CHECK-NEXT:    [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> , <16 x i16> )
-; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32>
-; CHECK-NEXT:    ret <32 x i8> [[TMP2]]
+; CHECK-NEXT:    ret <32 x i8> zeroinitializer
 ;
   %1 = insertelement <16 x i16> %a0, i16 0, i32 0
   %2 = insertelement <16 x i16> %a1, i16 0, i32 8