Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -559,6 +559,94 @@
   return Builder.CreateMul(LHS, RHS);
 }
 
+/// Try to constant fold an x86 PACKSS/PACKUS intrinsic call.
+///
+/// Yields undef when both operands are undef, and a constant vector when both
+/// operands are constants made up solely of integer or undef elements. In
+/// every other case nullptr is returned. \p IsSigned chooses signed (PACKSS)
+/// rather than unsigned (PACKUS) saturation. \p IC and \p Builder are unused.
+static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC,
+                              InstCombiner::BuilderTy &Builder, bool IsSigned) {
+  Type *DstVecTy = II.getType();
+  Value *Lo = II.getArgOperand(0);
+  Value *Hi = II.getArgOperand(1);
+
+  // Packing two undef vectors simply gives undef.
+  if (isa<UndefValue>(Lo) && isa<UndefValue>(Hi))
+    return UndefValue::get(DstVecTy);
+
+  Type *SrcVecTy = Lo->getType();
+  unsigned SrcEltCount = SrcVecTy->getVectorNumElements();
+  unsigned DstEltCount = DstVecTy->getVectorNumElements();
+  assert(DstEltCount == (2 * SrcEltCount) && "Unexpected packing types");
+
+  // The pack is done lane by lane, with one lane per 128 bits of the result.
+  unsigned LaneCount = DstVecTy->getPrimitiveSizeInBits() / 128;
+  unsigned SrcEltsPerLane = SrcEltCount / LaneCount;
+  unsigned DstEltsPerLane = DstEltCount / LaneCount;
+
+  Type *DstEltTy = DstVecTy->getScalarType();
+  unsigned DstEltBits = DstVecTy->getScalarSizeInBits();
+  assert(SrcVecTy->getScalarSizeInBits() == (2 * DstEltBits) &&
+         "Unexpected packing types");
+
+  // Folding is only possible when both operands are constants.
+  auto *LoCst = dyn_cast<Constant>(Lo);
+  auto *HiCst = dyn_cast<Constant>(Hi);
+  if (!(LoCst && HiCst))
+    return nullptr;
+
+  // Narrow one source element to the destination width with saturation.
+  auto Saturate = [IsSigned, DstEltBits](const APInt &Src) -> APInt {
+    if (IsSigned) {
+      // PACKSS: Truncate signed value with signed saturation.
+      // Values below dst minint become minint, values above dst maxint
+      // become maxint.
+      if (Src.isSignedIntN(DstEltBits))
+        return Src.trunc(DstEltBits);
+      return Src.isNegative() ? APInt::getSignedMinValue(DstEltBits)
+                              : APInt::getSignedMaxValue(DstEltBits);
+    }
+
+    // PACKUS: Truncate signed value with unsigned saturation.
+    // Values below zero become zero, values above dst maxuint become maxuint.
+    if (Src.isIntN(DstEltBits))
+      return Src.trunc(DstEltBits);
+    return Src.isNegative() ? APInt::getNullValue(DstEltBits)
+                            : APInt::getAllOnesValue(DstEltBits);
+  };
+
+  SmallVector<Constant *, 32> Packed;
+  for (unsigned Lane = 0; Lane != LaneCount; ++Lane) {
+    unsigned LaneBase = Lane * SrcEltsPerLane;
+    for (unsigned I = 0; I != DstEltsPerLane; ++I) {
+      // The low half of each result lane comes from operand 0, the high half
+      // from operand 1.
+      Constant *SrcCst = (I < SrcEltsPerLane) ? LoCst : HiCst;
+      Constant *SrcElt =
+          SrcCst->getAggregateElement(LaneBase + I % SrcEltsPerLane);
+
+      // Undef source elements stay undef in the result.
+      if (SrcElt && isa<UndefValue>(SrcElt)) {
+        Packed.push_back(UndefValue::get(DstEltTy));
+        continue;
+      }
+
+      // Anything else that is not a plain integer cannot be folded.
+      auto *SrcInt = dyn_cast_or_null<ConstantInt>(SrcElt);
+      if (!SrcInt)
+        return nullptr;
+
+      const APInt &SrcVal = SrcInt->getValue();
+      assert(SrcVal.getBitWidth() == SrcVecTy->getScalarSizeInBits() &&
+             "Unexpected constant bitwidth");
+      Packed.push_back(ConstantInt::get(DstEltTy, Saturate(SrcVal)));
+    }
+  }
+
+  return ConstantVector::get(Packed);
+}
+
 static Value *simplifyX86movmsk(const IntrinsicInst &II,
                                 InstCombiner::BuilderTy &Builder) {
   Value *Arg = II.getArgOperand(0);
@@ -2205,6 +2293,24 @@
     break;
   }
 
+  case Intrinsic::x86_sse2_packssdw_128:
+  case Intrinsic::x86_sse2_packsswb_128:
+  case Intrinsic::x86_avx2_packssdw:
+  case Intrinsic::x86_avx2_packsswb:
+    // TODO Add support for Intrinsic::x86_avx512_mask_packss*
+    if (Value *V = simplifyX86pack(*II, *this, *Builder, true))
+      return replaceInstUsesWith(*II, V);
+    break;
+
+  case Intrinsic::x86_sse2_packuswb_128:
+  case Intrinsic::x86_sse41_packusdw:
+  case Intrinsic::x86_avx2_packusdw:
+  case Intrinsic::x86_avx2_packuswb:
+    // TODO Add support for Intrinsic::x86_avx512_mask_packus*
+    if (Value *V = simplifyX86pack(*II, *this, *Builder, false))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_sse41_insertps:
     if (Value *V = simplifyX86insertps(*II, *Builder))
       return replaceInstUsesWith(*II, V);
Index: test/Transforms/InstCombine/x86-pack.ll
===================================================================
--- test/Transforms/InstCombine/x86-pack.ll
+++ test/Transforms/InstCombine/x86-pack.ll
@@ -7,8 +7,7 @@
 
 define <8 x i16> @undef_packssdw_128() {
 ; CHECK-LABEL: @undef_packssdw_128(
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
-; CHECK-NEXT: ret <8 x i16> [[TMP1]]
+; CHECK-NEXT: ret <8 x i16> undef
 ;
   %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> undef, <4 x i32> undef)
   ret <8 x i16> %1
@@ -16,8 +15,7 @@
 
 define <8 x i16> @undef_packusdw_128() {
 ; CHECK-LABEL: @undef_packusdw_128(
-; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> 
@llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: ret <8 x i16> undef ; %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> undef) ret <8 x i16> %1 @@ -25,8 +23,7 @@ define <16 x i8> @undef_packsswb_128() { ; CHECK-LABEL: @undef_packsswb_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; CHECK-NEXT: ret <16 x i8> undef ; %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> undef, <8 x i16> undef) ret <16 x i8> %1 @@ -34,8 +31,7 @@ define <16 x i8> @undef_packuswb_128() { ; CHECK-LABEL: @undef_packuswb_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef) -; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; CHECK-NEXT: ret <16 x i8> undef ; %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> undef, <8 x i16> undef) ret <16 x i8> %1 @@ -43,8 +39,7 @@ define <16 x i16> @undef_packssdw_256() { ; CHECK-LABEL: @undef_packssdw_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: ret <16 x i16> undef ; %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> undef, <8 x i32> undef) ret <16 x i16> %1 @@ -52,8 +47,7 @@ define <16 x i16> @undef_packusdw_256() { ; CHECK-LABEL: @undef_packusdw_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: ret <16 x i16> undef ; %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> undef, <8 x i32> undef) ret <16 x i16> %1 @@ -61,8 +55,7 @@ define <32 x i8> @undef_packsswb_256() { ; CHECK-LABEL: @undef_packsswb_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; CHECK-NEXT: ret <32 x i8> 
undef ; %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> undef) ret <32 x i8> %1 @@ -70,8 +63,7 @@ define <32 x i8> @undef_packuswb_256() { ; CHECK-LABEL: @undef_packuswb_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef) -; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; CHECK-NEXT: ret <32 x i8> undef ; %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> undef, <16 x i16> undef) ret <32 x i8> %1 @@ -83,8 +75,7 @@ define <8 x i16> @fold_packssdw_128() { ; CHECK-LABEL: @fold_packssdw_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: ret <8 x i16> <i16 0, i16 -1, i16 32767, i16 -32768, i16 0, i16 0, i16 0, i16 0> ; %1 = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> <i32 0, i32 -1, i32 65536, i32 -131072>, <4 x i32> zeroinitializer) ret <8 x i16> %1 @@ -92,8 +83,7 @@ define <8 x i16> @fold_packusdw_128() { ; CHECK-LABEL: @fold_packusdw_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>) -; CHECK-NEXT: ret <8 x i16> [[TMP1]] +; CHECK-NEXT: ret <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 0, i16 -32768, i16 -1> ; %1 = call <8 x i16> @llvm.x86.sse41.packusdw(<4 x i32> undef, <4 x i32> <i32 0, i32 -1, i32 32768, i32 65537>) ret <8 x i16> %1 @@ -101,8 +91,7 @@ define <16 x i8> @fold_packsswb_128() { ; CHECK-LABEL: @fold_packsswb_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> undef) -; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; CHECK-NEXT: ret <16 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef> ; %1 = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> zeroinitializer, <8 x i16> 
undef) ret <16 x i8> %1 @@ -110,8 +99,7 @@ define <16 x i8> @fold_packuswb_128() { ; CHECK-LABEL: @fold_packuswb_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 -1, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 -32768, i16 1, i16 -255, i16 1, i16 -1, i16 0>) -; CHECK-NEXT: ret <16 x i8> [[TMP1]] +; CHECK-NEXT: ret <16 x i8> <i8 0, i8 1, i8 0, i8 -1, i8 0, i8 0, i8 0, i8 15, i8 0, i8 127, i8 0, i8 1, i8 0, i8 1, i8 0, i8 0> ; %1 = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> <i16 0, i16 1, i16 -1, i16 255, i16 65535, i16 -32768, i16 -127, i16 15>, <8 x i16> <i16 -15, i16 127, i16 32768, i16 -65535, i16 -255, i16 1, i16 -1, i16 0>) ret <16 x i8> %1 @@ -119,8 +107,7 @@ define <16 x i16> @fold_packssdw_256() { ; CHECK-LABEL: @fold_packssdw_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: ret <16 x i16> <i16 0, i16 256, i16 32767, i16 -32768, i16 undef, i16 undef, i16 undef, i16 undef, i16 -127, i16 -32768, i16 -32767, i16 32767, i16 undef, i16 undef, i16 undef, i16 undef> ; %1 = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>, <8 x i32> undef) ret <16 x i16> %1 @@ -128,8 +115,7 @@ define <16 x i16> @fold_packusdw_256() { ; CHECK-LABEL: @fold_packusdw_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>) -; CHECK-NEXT: ret <16 x i16> [[TMP1]] +; CHECK-NEXT: ret <16 x i16> <i16 0, i16 0, i16 0, i16 -1, i16 0, i16 256, i16 -1, i16 0, i16 127, i16 -32768, i16 32767, i16 0, i16 0, i16 0, i16 
0, i16 32767> ; %1 = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> <i32 0, i32 -256, i32 -65535, i32 65536, i32 127, i32 32768, i32 32767, i32 -32767>, <8 x i32> <i32 0, i32 256, i32 65535, i32 -65536, i32 -127, i32 -32768, i32 -32767, i32 32767>) ret <16 x i16> %1 @@ -137,8 +123,7 @@ define <32 x i8> @fold_packsswb_256() { ; CHECK-LABEL: @fold_packsswb_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer) -; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; CHECK-NEXT: ret <32 x i8> <i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 undef, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0> ; %1 = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> undef, <16 x i16> zeroinitializer) ret <32 x i8> %1 @@ -146,8 +131,7 @@ define <32 x i8> @fold_packuswb_256() { ; CHECK-LABEL: @fold_packuswb_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 0, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>) -; CHECK-NEXT: ret <32 x i8> [[TMP1]] +; CHECK-NEXT: ret <32 x i8> <i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 -1, i8 -1, i8 -1, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 0, i8 1, i8 2, i8 4, i8 8, i8 16, i8 32, i8 64> ; %1 = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> zeroinitializer, <16 x i16> <i16 0, i16 -127, i16 -128, i16 -32768, i16 65536, i16 255, i16 256, i16 512, i16 -1, i16 1, i16 2, i16 4, i16 8, i16 16, i16 32, i16 64>) ret <32 x i8> %1 @@ -184,9 +168,7 @@ define <16 x i8> @elts_packsswb_128(<8 x i16> %a0, <8 x i16> %a1) { ; CHECK-LABEL: @elts_packsswb_128( -; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> <i16 0, i16 undef, i16 undef, i16 
undef, i16 undef, i16 undef, i16 undef, i16 undef>, <8 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> undef, <16 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8, i32 8> -; CHECK-NEXT: ret <16 x i8> [[TMP2]] +; CHECK-NEXT: ret <16 x i8> zeroinitializer ; %1 = insertelement <8 x i16> %a0, i16 0, i32 0 %2 = insertelement <8 x i16> %a1, i16 0, i32 0 @@ -234,9 +216,7 @@ define <32 x i8> @elts_packsswb_256(<16 x i16> %a0, <16 x i16> %a1) { ; CHECK-LABEL: @elts_packsswb_256( -; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> <i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>, <16 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 0, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>) -; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <32 x i8> [[TMP1]], <32 x i8> undef, <32 x i32> <i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 0, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24, i32 24> -; CHECK-NEXT: ret <32 x i8> [[TMP2]] +; CHECK-NEXT: ret <32 x i8> zeroinitializer ; %1 = insertelement <16 x i16> %a0, i16 0, i32 0 %2 = insertelement <16 x i16> %a1, i16 0, i32 8