Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -541,7 +541,7 @@
   return Builder.CreateAShr(Vec, ShiftVec);
 }
 
-static Value *simplifyX86pack(IntrinsicInst &II,
+static Value *simplifyX86pack(IntrinsicInst &II, InstCombiner &IC,
                               InstCombiner::BuilderTy &Builder, bool IsSigned) {
   Value *Arg0 = II.getArgOperand(0);
   Value *Arg1 = II.getArgOperand(1);
@@ -563,8 +563,19 @@
   assert(SrcScalarSizeInBits == (2 * DstScalarSizeInBits) &&
          "Unexpected packing types");
 
-  // Constant folding.
-  if (!isa<Constant>(Arg0) || !isa<Constant>(Arg1))
+  // We should only expand packs if each argument is a constant or can be
+  // truncated without saturation - anything else is too complex and unlikely
+  // to lower as well as the original intrinsic.
+  APInt ZeroMask =
+      APInt::getHighBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
+
+  if (!isa<Constant>(Arg0) &&
+      ((IsSigned && IC.ComputeNumSignBits(Arg0) <= DstScalarSizeInBits) ||
+       (!IsSigned && !IC.MaskedValueIsZero(Arg0, ZeroMask))))
+    return nullptr;
+  if (!isa<Constant>(Arg1) &&
+      ((IsSigned && IC.ComputeNumSignBits(Arg1) <= DstScalarSizeInBits) ||
+       (!IsSigned && !IC.MaskedValueIsZero(Arg1, ZeroMask))))
     return nullptr;
 
   // Clamp Values - signed/unsigned both use signed clamp values, but they
@@ -586,12 +597,17 @@
     MaxValue = APInt::getLowBitsSet(SrcScalarSizeInBits, DstScalarSizeInBits);
   }
 
+  // Constant args still need clamping so constant folding saturates them.
   auto *MinC = Constant::getIntegerValue(ArgTy, MinValue);
   auto *MaxC = Constant::getIntegerValue(ArgTy, MaxValue);
-  Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
-  Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
-  Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
-  Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
+  if (isa<Constant>(Arg0)) {
+    Arg0 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg0, MinC), MinC, Arg0);
+    Arg0 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg0, MaxC), MaxC, Arg0);
+  }
+  if (isa<Constant>(Arg1)) {
+    Arg1 = Builder.CreateSelect(Builder.CreateICmpSLT(Arg1, MinC), MinC, Arg1);
+    Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
+  }
 
   // Shuffle clamped args together at the lane level.
   SmallVector<unsigned, 32> PackMask;
@@ -2937,7 +2953,7 @@
   case Intrinsic::x86_avx2_packsswb:
   case Intrinsic::x86_avx512_packssdw_512:
   case Intrinsic::x86_avx512_packsswb_512:
-    if (Value *V = simplifyX86pack(*II, Builder, true))
+    if (Value *V = simplifyX86pack(*II, *this, Builder, true))
       return replaceInstUsesWith(*II, V);
     break;
 
@@ -2947,7 +2963,7 @@
   case Intrinsic::x86_sse2_packuswb_128:
   case Intrinsic::x86_avx2_packusdw:
   case Intrinsic::x86_avx2_packuswb:
   case Intrinsic::x86_avx512_packusdw_512:
   case Intrinsic::x86_avx512_packuswb_512:
-    if (Value *V = simplifyX86pack(*II, Builder, false))
+    if (Value *V = simplifyX86pack(*II, *this, Builder, false))
       return replaceInstUsesWith(*II, V);
     break;
 
Index: test/Transforms/InstCombine/X86/x86-pack.ll
===================================================================
--- test/Transforms/InstCombine/X86/x86-pack.ll
+++ test/Transforms/InstCombine/X86/x86-pack.ll
@@ -358,8 +358,9 @@
 ; CHECK-LABEL: @trunc_packssdw_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <4 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
 ;
   %1 = ashr <4 x i32> %a0,
   %2 = and <4 x i32> %a1,
@@ -371,8 +372,9 @@
 ; CHECK-LABEL: @trunc_packusdw_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <4 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <4 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> [[TMP1]], <4 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <8 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP2]], <8 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <8 x i32> [[TMP3]] to <8 x i16>
+; CHECK-NEXT:    ret <8 x i16> [[TMP4]]
 ;
   %1 = lshr <4 x i32> %a0,
   %2 = and <4 x i32> %a1,
@@ -384,8 +386,9 @@
 ; CHECK-LABEL: @trunc_packsswb_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packsswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i16> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
 ;
   %1 = ashr <8 x i16> %a0,
   %2 = and <8 x i16> %a1,
@@ -397,8 +400,9 @@
 ; CHECK-LABEL: @trunc_packuswb_128(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i8> @llvm.x86.sse2.packuswb.128(<8 x i16> [[TMP1]], <8 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP2]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i16> [[TMP3]] to <16 x i8>
+; CHECK-NEXT:    ret <16 x i8> [[TMP4]]
 ;
   %1 = lshr <8 x i16> %a0,
   %2 = and <8 x i16> %a1,
@@ -410,8 +414,9 @@
 ; CHECK-LABEL: @trunc_packssdw_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <8 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr <8 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    ret <16 x i16> [[TMP4]]
 ;
   %1 = ashr <8 x i32> %a0,
   %2 = ashr <8 x i32> %a1,
@@ -423,8 +428,9 @@
 ; CHECK-LABEL: @trunc_packusdw_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <8 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <8 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <16 x i16> @llvm.x86.avx2.packssdw(<8 x i32> [[TMP1]], <8 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <16 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i32> [[TMP1]], <8 x i32> [[TMP2]], <16 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <16 x i32> [[TMP3]] to <16 x i16>
+; CHECK-NEXT:    ret <16 x i16> [[TMP4]]
 ;
   %1 = lshr <8 x i32> %a0,
   %2 = and <8 x i32> %a1,
@@ -436,8 +442,9 @@
 ; CHECK-LABEL: @trunc_packsswb_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packsswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <32 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    ret <32 x i8> [[TMP4]]
 ;
   %1 = ashr <16 x i16> %a0,
   %2 = and <16 x i16> %a1,
@@ -449,8 +456,9 @@
 ; CHECK-LABEL: @trunc_packuswb_256(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <16 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16> [[TMP1]], <16 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <32 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[TMP2]], <32 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <32 x i16> [[TMP3]] to <32 x i8>
+; CHECK-NEXT:    ret <32 x i8> [[TMP4]]
 ;
   %1 = lshr <16 x i16> %a0,
   %2 = and <16 x i16> %a1,
@@ -462,8 +470,9 @@
 ; CHECK-LABEL: @trunc_packssdw_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <16 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = ashr <16 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <32 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <32 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    ret <32 x i16> [[TMP4]]
 ;
   %1 = ashr <16 x i32> %a0,
   %2 = ashr <16 x i32> %a1,
@@ -475,8 +484,9 @@
 ; CHECK-LABEL: @trunc_packusdw_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <16 x i32> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <16 x i32> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <32 x i16> @llvm.x86.avx512.packssdw.512(<16 x i32> [[TMP1]], <16 x i32> [[TMP2]])
-; CHECK-NEXT:    ret <32 x i16> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> [[TMP2]], <32 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <32 x i32> [[TMP3]] to <32 x i16>
+; CHECK-NEXT:    ret <32 x i16> [[TMP4]]
 ;
   %1 = lshr <16 x i32> %a0,
   %2 = and <16 x i32> %a1,
@@ -488,8 +498,9 @@
 ; CHECK-LABEL: @trunc_packsswb_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = ashr <32 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.packsswb.512(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <64 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <64 x i16> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    ret <64 x i8> [[TMP4]]
 ;
   %1 = ashr <32 x i16> %a0,
   %2 = and <32 x i16> %a1,
@@ -501,8 +512,9 @@
 ; CHECK-LABEL: @trunc_packuswb_512(
 ; CHECK-NEXT:    [[TMP1:%.*]] = lshr <32 x i16> [[A0:%.*]],
 ; CHECK-NEXT:    [[TMP2:%.*]] = and <32 x i16> [[A1:%.*]],
-; CHECK-NEXT:    [[TMP3:%.*]] = call <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16> [[TMP1]], <32 x i16> [[TMP2]])
-; CHECK-NEXT:    ret <64 x i8> [[TMP3]]
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <32 x i16> [[TMP1]], <32 x i16> [[TMP2]], <64 x i32>
+; CHECK-NEXT:    [[TMP4:%.*]] = trunc <64 x i16> [[TMP3]] to <64 x i8>
+; CHECK-NEXT:    ret <64 x i8> [[TMP4]]
 ;
   %1 = lshr <32 x i16> %a0,
   %2 = and <32 x i16> %a1,
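
Note (illustrative sketch, not part of the patch): the trunc_packssdw_128 test above is representative of the pattern this change targets. The shift amount below (17) is an arbitrary example chosen so ComputeNumSignBits proves each element has more than 16 sign bits, i.e. PACKSS cannot saturate:

  %x = ashr <4 x i32> %a0, <i32 17, i32 17, i32 17, i32 17>   ; >= 18 sign bits per element
  %y = ashr <4 x i32> %a1, <i32 17, i32 17, i32 17, i32 17>
  %p = call <8 x i16> @llvm.x86.sse2.packssdw.128(<4 x i32> %x, <4 x i32> %y)

With the new ComputeNumSignBits / MaskedValueIsZero checks, simplifyX86pack can expand the intrinsic to the generic IR the CHECK lines expect, roughly:

  %s = shufflevector <4 x i32> %x, <4 x i32> %y, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
  %p = trunc <8 x i32> %s to <8 x i16>

For this 128-bit (single lane) case the lane-interleaving PackMask loop produces the identity-concatenation mask 0..7; the wider tests use per-128-bit-lane interleaved masks instead.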