Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -364,18 +364,6 @@
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_sse2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                         llvm_v16i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_sse2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb128">,
-              Intrinsic<[llvm_v16i8_ty], [llvm_v16i8_ty,
-                         llvm_v16i8_ty], [IntrNoMem]>;
-  def int_x86_sse2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw128">,
-              Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
-                         llvm_v8i16_ty], [IntrNoMem]>;
   def int_x86_sse2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw128">,
               Intrinsic<[llvm_v8i16_ty], [llvm_v8i16_ty,
                          llvm_v8i16_ty], [IntrNoMem, Commutative]>;
@@ -1346,18 +1334,6 @@
 // Integer arithmetic ops.
 let TargetPrefix = "x86" in {  // All intrinsics start with "llvm.x86.".
-  def int_x86_avx2_padds_b : GCCBuiltin<"__builtin_ia32_paddsb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                         llvm_v32i8_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_padds_w : GCCBuiltin<"__builtin_ia32_paddsw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem, Commutative]>;
-  def int_x86_avx2_psubs_b : GCCBuiltin<"__builtin_ia32_psubsb256">,
-              Intrinsic<[llvm_v32i8_ty], [llvm_v32i8_ty,
-                         llvm_v32i8_ty], [IntrNoMem]>;
-  def int_x86_avx2_psubs_w : GCCBuiltin<"__builtin_ia32_psubsw256">,
-              Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
-                         llvm_v16i16_ty], [IntrNoMem]>;
   def int_x86_avx2_pmulhu_w : GCCBuiltin<"__builtin_ia32_pmulhuw256">,
               Intrinsic<[llvm_v16i16_ty], [llvm_v16i16_ty,
                          llvm_v16i16_ty], [IntrNoMem, Commutative]>;
@@ -3677,18 +3653,6 @@
 }
 // Integer arithmetic ops
 let TargetPrefix = "x86" in {
-  def int_x86_avx512_padds_b_512 : GCCBuiltin<"__builtin_ia32_paddsb512">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
-          [IntrNoMem]>;
-  def int_x86_avx512_padds_w_512 : GCCBuiltin<"__builtin_ia32_paddsw512">,
-          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
-          [IntrNoMem]>;
-  def int_x86_avx512_psubs_b_512 : GCCBuiltin<"__builtin_ia32_psubsb512">,
-          Intrinsic<[llvm_v64i8_ty], [llvm_v64i8_ty, llvm_v64i8_ty],
-          [IntrNoMem]>;
-  def int_x86_avx512_psubs_w_512 : GCCBuiltin<"__builtin_ia32_psubsw512">,
-          Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
-          [IntrNoMem]>;
   def int_x86_avx512_pmulhu_w_512 : GCCBuiltin<"__builtin_ia32_pmulhuw512">,
           Intrinsic<[llvm_v32i16_ty], [llvm_v32i16_ty, llvm_v32i16_ty],
           [IntrNoMem, Commutative]>;
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -77,10 +77,18 @@
       Name == "addcarry.u64" || // Added in 8.0
       Name == "subborrow.u32" || // Added in 8.0
       Name == "subborrow.u64" || // Added in 8.0
+      Name.startswith("sse2.padds.") || // Added in 8.0
+      Name.startswith("sse2.psubs.") || // Added in 8.0
       Name.startswith("sse2.paddus.") || // Added in 8.0
       Name.startswith("sse2.psubus.") || // Added in 8.0
+      Name.startswith("avx2.padds.") || // Added in 8.0
+      Name.startswith("avx2.psubs.") || // Added in 8.0
       Name.startswith("avx2.paddus.") || // Added in 8.0
       Name.startswith("avx2.psubus.") || // Added in 8.0
+      Name.startswith("avx512.padds.") || // Added in 8.0
+      Name.startswith("avx512.psubs.") || // Added in 8.0
+      Name.startswith("avx512.mask.padds.") || // Added in 8.0
+      Name.startswith("avx512.mask.psubs.") || // Added in 8.0
       Name.startswith("avx512.mask.paddus.") || // Added in 8.0
       Name.startswith("avx512.mask.psubus.") || // Added in 8.0
       Name == "ssse3.pabs.b.128" || // Added in 6.0
@@ -288,8 +296,6 @@
       Name.startswith("avx512.mask.pror.") || // Added in 7.0
       Name.startswith("avx512.mask.prolv.") || // Added in 7.0
       Name.startswith("avx512.mask.prol.") || // Added in 7.0
-      Name.startswith("avx512.mask.padds.") || // Added in 8.0
-      Name.startswith("avx512.mask.psubs.") || // Added in 8.0
       Name == "sse.cvtsi2ss" || // Added in 7.0
       Name == "sse.cvtsi642ss" || // Added in 7.0
       Name == "sse2.cvtsi2sd" || // Added in 7.0
@@ -925,12 +931,14 @@
 }
 
 static Value *UpgradeX86AddSubSatIntrinsics(IRBuilder<> &Builder, CallInst &CI,
-                                            bool IsAddition) {
+                                            bool IsSigned, bool IsAddition) {
   Type *Ty = CI.getType();
   Value *Op0 = CI.getOperand(0);
   Value *Op1 = CI.getOperand(1);
 
-  Intrinsic::ID IID = IsAddition ? Intrinsic::uadd_sat : Intrinsic::usub_sat;
+  Intrinsic::ID IID =
+      IsSigned ? (IsAddition ? Intrinsic::sadd_sat : Intrinsic::ssub_sat)
+               : (IsAddition ? Intrinsic::uadd_sat : Intrinsic::usub_sat);
   Function *Intrin = Intrinsic::getDeclaration(CI.getModule(), IID, Ty);
   Value *Res = Builder.CreateCall(Intrin, {Op0, Op1});
 
@@ -1410,36 +1418,6 @@
       IID = Intrinsic::x86_avx512_prol_q_512;
     else
       llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("padds.")) {
-    if (VecWidth == 128 && EltWidth == 8)
-      IID = Intrinsic::x86_sse2_padds_b;
-    else if (VecWidth == 256 && EltWidth == 8)
-      IID = Intrinsic::x86_avx2_padds_b;
-    else if (VecWidth == 512 && EltWidth == 8)
-      IID = Intrinsic::x86_avx512_padds_b_512;
-    else if (VecWidth == 128 && EltWidth == 16)
-      IID = Intrinsic::x86_sse2_padds_w;
-    else if (VecWidth == 256 && EltWidth == 16)
-      IID = Intrinsic::x86_avx2_padds_w;
-    else if (VecWidth == 512 && EltWidth == 16)
-      IID = Intrinsic::x86_avx512_padds_w_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
-  } else if (Name.startswith("psubs.")) {
-    if (VecWidth == 128 && EltWidth == 8)
-      IID = Intrinsic::x86_sse2_psubs_b;
-    else if (VecWidth == 256 && EltWidth == 8)
-      IID = Intrinsic::x86_avx2_psubs_b;
-    else if (VecWidth == 512 && EltWidth == 8)
-      IID = Intrinsic::x86_avx512_psubs_b_512;
-    else if (VecWidth == 128 && EltWidth == 16)
-      IID = Intrinsic::x86_sse2_psubs_w;
-    else if (VecWidth == 256 && EltWidth == 16)
-      IID = Intrinsic::x86_avx2_psubs_w;
-    else if (VecWidth == 512 && EltWidth == 16)
-      IID = Intrinsic::x86_avx512_psubs_w_512;
-    else
-      llvm_unreachable("Unexpected intrinsic");
   } else
     return false;
 
@@ -2116,6 +2094,16 @@
       if (CI->getNumArgOperands() == 3)
         Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
                             CI->getArgOperand(1));
+    } else if (IsX86 && (Name.startswith("sse2.padds.") ||
+                         Name.startswith("sse2.psubs.") ||
+                         Name.startswith("avx2.padds.") ||
+                         Name.startswith("avx2.psubs.") ||
+                         Name.startswith("avx512.padds.") ||
+                         Name.startswith("avx512.psubs.") ||
+                         Name.startswith("avx512.mask.padds.") ||
+                         Name.startswith("avx512.mask.psubs."))) {
+      bool IsAdd = Name.contains(".padds");
+      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, true, IsAdd);
     } else if (IsX86 && (Name.startswith("sse2.paddus.") ||
                          Name.startswith("sse2.psubus.") ||
                          Name.startswith("avx2.paddus.") ||
@@ -2123,7 +2111,7 @@
                          Name.startswith("avx512.mask.paddus.") ||
                          Name.startswith("avx512.mask.psubus."))) {
       bool IsAdd = Name.contains(".paddus");
-      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, IsAdd);
+      Rep = UpgradeX86AddSubSatIntrinsics(Builder, *CI, false, IsAdd);
     } else if (IsX86 && Name.startswith("avx512.mask.palignr.")) {
       Rep = UpgradeX86ALIGNIntrinsics(Builder, CI->getArgOperand(0),
                                       CI->getArgOperand(1),
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -319,8 +319,6 @@
   X86_INTRINSIC_DATA(avx2_packsswb, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx2_packusdw, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx2_packuswb, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx2_padds_b, INTR_TYPE_2OP, ISD::SADDSAT, 0),
-  X86_INTRINSIC_DATA(avx2_padds_w, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx2_permd, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_permps, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx2_phadd_d, INTR_TYPE_2OP, X86ISD::HADD, 0),
@@ -361,8 +359,6 @@
   X86_INTRINSIC_DATA(avx2_psrlv_d_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx2_psrlv_q_256, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_b, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
-  X86_INTRINSIC_DATA(avx2_psubs_w, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_add_pd_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_add_ps_512, INTR_TYPE_2OP, ISD::FADD, X86ISD::FADD_RND),
   X86_INTRINSIC_DATA(avx512_cmp_pd_128, CMP_MASK_CC, X86ISD::CMPM, 0),
@@ -920,8 +916,6 @@
   X86_INTRINSIC_DATA(avx512_packsswb_512, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(avx512_packusdw_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
   X86_INTRINSIC_DATA(avx512_packuswb_512, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(avx512_padds_b_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
-  X86_INTRINSIC_DATA(avx512_padds_w_512, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_256, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_df_512, VPERM_2OP, X86ISD::VPERMV, 0),
   X86_INTRINSIC_DATA(avx512_permvar_di_256, VPERM_2OP, X86ISD::VPERMV, 0),
@@ -1004,8 +998,6 @@
   X86_INTRINSIC_DATA(avx512_psrlv_w_128, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_256, INTR_TYPE_2OP, ISD::SRL, 0),
   X86_INTRINSIC_DATA(avx512_psrlv_w_512, INTR_TYPE_2OP, ISD::SRL, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_b_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
-  X86_INTRINSIC_DATA(avx512_psubs_w_512, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_128, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_256, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
   X86_INTRINSIC_DATA(avx512_pternlog_d_512, INTR_TYPE_4OP, X86ISD::VPTERNLOG, 0),
@@ -1168,8 +1160,6 @@
   X86_INTRINSIC_DATA(sse2_packssdw_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packsswb_128, INTR_TYPE_2OP, X86ISD::PACKSS, 0),
   X86_INTRINSIC_DATA(sse2_packuswb_128, INTR_TYPE_2OP, X86ISD::PACKUS, 0),
-  X86_INTRINSIC_DATA(sse2_padds_b, INTR_TYPE_2OP, ISD::SADDSAT, 0),
-  X86_INTRINSIC_DATA(sse2_padds_w, INTR_TYPE_2OP, ISD::SADDSAT, 0),
   X86_INTRINSIC_DATA(sse2_pmadd_wd, INTR_TYPE_2OP, X86ISD::VPMADDWD, 0),
   X86_INTRINSIC_DATA(sse2_pmovmskb_128, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse2_pmulh_w, INTR_TYPE_2OP, ISD::MULHS, 0),
@@ -1191,8 +1181,6 @@
   X86_INTRINSIC_DATA(sse2_psrli_d, VSHIFT, X86ISD::VSRLI, 0),
  X86_INTRINSIC_DATA(sse2_psrli_q, VSHIFT, X86ISD::VSRLI, 0),
   X86_INTRINSIC_DATA(sse2_psrli_w, VSHIFT, X86ISD::VSRLI, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_b, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
-  X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, ISD::SSUBSAT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
Index: lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -243,67 +243,6 @@
   return nullptr;
 }
 
-static Value *simplifyX86AddsSubs(const IntrinsicInst &II,
-                                  InstCombiner::BuilderTy &Builder) {
-  bool IsAddition;
-
-  switch (II.getIntrinsicID()) {
-  default: llvm_unreachable("Unexpected intrinsic!");
-  case Intrinsic::x86_sse2_padds_b:
-  case Intrinsic::x86_sse2_padds_w:
-  case Intrinsic::x86_avx2_padds_b:
-  case Intrinsic::x86_avx2_padds_w:
-  case Intrinsic::x86_avx512_padds_b_512:
-  case Intrinsic::x86_avx512_padds_w_512:
-    IsAddition = true;
-    break;
-  case Intrinsic::x86_sse2_psubs_b:
-  case Intrinsic::x86_sse2_psubs_w:
-  case Intrinsic::x86_avx2_psubs_b:
-  case Intrinsic::x86_avx2_psubs_w:
-  case Intrinsic::x86_avx512_psubs_b_512:
-  case Intrinsic::x86_avx512_psubs_w_512:
-    IsAddition = false;
-    break;
-  }
-
-  auto *Arg0 = dyn_cast<Constant>(II.getOperand(0));
-  auto *Arg1 = dyn_cast<Constant>(II.getOperand(1));
-  auto VT = cast<VectorType>(II.getType());
-  auto SVT = VT->getElementType();
-  unsigned NumElems = VT->getNumElements();
-
-  if (!Arg0 || !Arg1)
-    return nullptr;
-
-  SmallVector<Constant *, 64> Result;
-
-  APInt MaxValue = APInt::getSignedMaxValue(SVT->getIntegerBitWidth());
-  APInt MinValue = APInt::getSignedMinValue(SVT->getIntegerBitWidth());
-  for (unsigned i = 0; i < NumElems; ++i) {
-    auto *Elt0 = Arg0->getAggregateElement(i);
-    auto *Elt1 = Arg1->getAggregateElement(i);
-    if (isa<UndefValue>(Elt0) || isa<UndefValue>(Elt1)) {
-      Result.push_back(UndefValue::get(SVT));
-      continue;
-    }
-
-    if (!isa<ConstantInt>(Elt0) || !isa<ConstantInt>(Elt1))
-      return nullptr;
-
-    const APInt &Val0 = cast<ConstantInt>(Elt0)->getValue();
-    const APInt &Val1 = cast<ConstantInt>(Elt1)->getValue();
-    bool Overflow = false;
-    APInt ResultElem = IsAddition ? Val0.sadd_ov(Val1, Overflow)
-                                  : Val0.ssub_ov(Val1, Overflow);
-    if (Overflow)
-      ResultElem = Val0.isNegative() ? MinValue : MaxValue;
-    Result.push_back(Constant::getIntegerValue(SVT, ResultElem));
-  }
-
-  return ConstantVector::get(Result);
-}
-
 static Value *simplifyX86immShift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   bool LogicalShift = false;
@@ -2784,23 +2723,6 @@
     break;
   }
 
-  // Constant fold add/sub with saturation intrinsics.
-  case Intrinsic::x86_sse2_padds_b:
-  case Intrinsic::x86_sse2_padds_w:
-  case Intrinsic::x86_sse2_psubs_b:
-  case Intrinsic::x86_sse2_psubs_w:
-  case Intrinsic::x86_avx2_padds_b:
-  case Intrinsic::x86_avx2_padds_w:
-  case Intrinsic::x86_avx2_psubs_b:
-  case Intrinsic::x86_avx2_psubs_w:
-  case Intrinsic::x86_avx512_padds_b_512:
-  case Intrinsic::x86_avx512_padds_w_512:
-  case Intrinsic::x86_avx512_psubs_b_512:
-  case Intrinsic::x86_avx512_psubs_w_512:
-    if (Value *V = simplifyX86AddsSubs(*II, Builder))
-      return replaceInstUsesWith(*II, V);
-    break;
-
   // Constant fold ashr( <A x Bi>, Ci ).
   // Constant fold lshr( <A x Bi>, Ci ).
   // Constant fold shl( <A x Bi>, Ci ).
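For reference, a minimal sketch of the rewrite the AutoUpgrade path above performs (illustrative IR only, not part of the diff; the value names %a, %b, %mask, and %passthru are hypothetical, while the intrinsic names come from the patch itself):

  ; Before: target-specific signed saturating add (intrinsic removed by this patch)
  %r = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b)

  ; After auto-upgrade: the generic signed saturating intrinsic
  %r = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)

  ; The masked AVX512 forms keep their semantics by pairing the generic
  ; intrinsic with a select on the mask operand, as the tests below check:
  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
  %2 = bitcast i32 %mask to <32 x i1>
  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passthru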
Index: test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx2-intrinsics-fast-isel.ll
@@ -98,11 +98,11 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
   %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_adds_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epi16:
@@ -111,11 +111,11 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
   %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_adds_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_adds_epu8:
@@ -2527,11 +2527,11 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <32 x i8>
   %arg1 = bitcast <4 x i64> %a1 to <32 x i8>
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %arg0, <32 x i8> %arg1)
+  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %arg0, <32 x i8> %arg1)
   %bc = bitcast <32 x i8> %res to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
 
 define <4 x i64> @test_mm256_subs_epi16(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epi16:
@@ -2540,11 +2540,11 @@
 ; CHECK-NEXT:    ret{{[l|q]}}
   %arg0 = bitcast <4 x i64> %a0 to <16 x i16>
   %arg1 = bitcast <4 x i64> %a1 to <16 x i16>
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %arg0, <16 x i16> %arg1)
+  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %arg0, <16 x i16> %arg1)
   %bc = bitcast <16 x i16> %res to <4 x i64>
   ret <4 x i64> %bc
 }
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
 
 define <4 x i64> @test_mm256_subs_epu8(<4 x i64> %a0, <4 x i64> %a1) {
 ; CHECK-LABEL: test_mm256_subs_epu8:
Index: test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86-upgrade.ll
@@ -614,6 +614,28 @@
 declare <4 x i64> @llvm.x86.avx2.pmul.dq(<8 x i32>, <8 x i32>) nounwind readnone
 
 
+define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: test_x86_avx2_padds_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %res = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test_x86_avx2_padds_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %res = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
+
+
 define <32 x i8> @test_x86_avx2_paddus_b(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_x86_avx2_paddus_b:
 ; CHECK:       ## %bb.0:
@@ -636,6 +658,28 @@
 declare <16 x i16> @llvm.x86.avx2.paddus.w(<16 x i16>, <16 x i16>) nounwind readnone
 
 
+define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
+; CHECK-LABEL: test_x86_avx2_psubs_b:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %res = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
+  ret <32 x i8> %res
+}
+declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) nounwind readnone
+
+
+define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
+; CHECK-LABEL: test_x86_avx2_psubs_w:
+; CHECK:       ## %bb.0:
+; CHECK-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0
+; CHECK-NEXT:    ret{{[l|q]}}
+  %res = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
+  ret <16 x i16> %res
+}
+declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) nounwind readnone
+
+
 define <32 x i8> @test_x86_avx2_psubus_b(<32 x i8> %a0, <32 x i8> %a1) {
 ; CHECK-LABEL: test_x86_avx2_psubus_b:
 ; CHECK:       ## %bb.0:
Index: test/CodeGen/X86/avx2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -151,38 +151,6 @@
 }
 
 
-define <32 x i8> @test_x86_avx2_padds_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_padds_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xec,0xc1]
-; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_padds_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_padds_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_padds_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xed,0xc1]
-; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_padds_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
-
-
 define <8 x i32> @test_x86_avx2_pmadd_wd(<16 x i16> %a0, <16 x i16> %a1) {
 ; AVX2-LABEL: test_x86_avx2_pmadd_wd:
 ; AVX2:       ## %bb.0:
@@ -607,37 +575,6 @@
 declare <16 x i16> @llvm.x86.avx2.psrli.w(<16 x i16>, i32) nounwind readnone
 
 
-define <32 x i8> @test_x86_avx2_psubs_b(<32 x i8> %a0, <32 x i8> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubs_b:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe8,0xc1]
-; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_psubs_b:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a0, <32 x i8> %a1) ; <<32 x i8>> [#uses=1]
-  ret <32 x i8> %res
-}
-declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
-
-
-define <16 x i16> @test_x86_avx2_psubs_w(<16 x i16> %a0, <16 x i16> %a1) {
-; AVX2-LABEL: test_x86_avx2_psubs_w:
-; AVX2:       ## %bb.0:
-; AVX2-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## encoding: [0xc5,0xfd,0xe9,0xc1]
-; AVX2-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx2_psubs_w:
-; AVX512VL:       ## %bb.0:
-; AVX512VL-NEXT:    vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1]
-; AVX512VL-NEXT:    ret{{[l|q]}} ## encoding: [0xc3]
-  %res = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a0, <16 x i16> %a1) ; <<16 x i16>> [#uses=1]
-  ret <16 x i16> %res
-}
-declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
-
 define <8 x i32> @test_x86_avx2_phadd_d(<8 x i32> %a0, <8 x i32> %a1) {
 ; CHECK-LABEL: test_x86_avx2_phadd_d:
 ; CHECK:       ## %bb.0:
@@ -848,28 +785,28 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI47_0, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovaps LCPI51_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
+; X86-AVX512VL-NEXT:    vmovaps LCPI47_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI47_0, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovaps {{.*#+}} ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_packusdw_fold:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovaps {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [0,0,0,0,255,32767,65535,0,0,0,0,0,0,0,0,0]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfc,0x28,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI51_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI47_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <16 x i16> @llvm.x86.avx2.packusdw(<8 x i32> zeroinitializer, <8 x i32> )
   ret <16 x i16> %res
@@ -1376,36 +1313,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI79_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI79_1, %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI79_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI83_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI79_0, %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI83_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI79_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI79_1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI79_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} xmm0 = [2,9,4294967284,23]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI79_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI79_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %xmm0 ## EVEX TO VEX Compression xmm0 = [2,9,4294967284,23]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xf9,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI83_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI79_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI83_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI79_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <4 x i32> @llvm.x86.avx2.psrav.d(<4 x i32> , <4 x i32> )
   ret <4 x i32> %res
@@ -1431,36 +1368,36 @@
 ; X86-AVX:       ## %bb.0:
 ; X86-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; X86-AVX-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; X86-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI81_0, kind: FK_Data_4
+; X86-AVX-NEXT:    vpsravd LCPI81_1, %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI81_1, kind: FK_Data_4
 ; X86-AVX-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X86-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X86-AVX512VL:       ## %bb.0:
-; X86-AVX512VL-NEXT:    vmovdqa LCPI85_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
+; X86-AVX512VL-NEXT:    vmovdqa LCPI81_0, %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X86-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0, kind: FK_Data_4
-; X86-AVX512VL-NEXT:    vpsravd LCPI85_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI81_0, kind: FK_Data_4
+; X86-AVX512VL-NEXT:    vpsravd LCPI81_1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
+; X86-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI81_1, kind: FK_Data_4
 ; X86-AVX512VL-NEXT:    retl ## encoding: [0xc3]
 ;
 ; X64-AVX-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX:       ## %bb.0:
 ; X64-AVX-NEXT:    vmovdqa {{.*#+}} ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 4, value: LCPI81_0-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
+; X64-AVX-NEXT:    ## fixup A - offset: 5, value: LCPI81_1-4, kind: reloc_riprel_4byte
 ; X64-AVX-NEXT:    retq ## encoding: [0xc3]
 ;
 ; X64-AVX512VL-LABEL: test_x86_avx2_psrav_d_256_const:
 ; X64-AVX512VL:       ## %bb.0:
 ; X64-AVX512VL-NEXT:    vmovdqa {{.*}}(%rip), %ymm0 ## EVEX TO VEX Compression ymm0 = [2,9,4294967284,23,4294967270,37,4294967256,51]
 ; X64-AVX512VL-NEXT:    ## encoding: [0xc5,0xfd,0x6f,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI85_0-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 4, value: LCPI81_0-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    vpsravd {{.*}}(%rip), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x7d,0x46,0x05,A,A,A,A]
-; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI85_1-4, kind: reloc_riprel_4byte
+; X64-AVX512VL-NEXT:    ## fixup A - offset: 5, value: LCPI81_1-4, kind: reloc_riprel_4byte
 ; X64-AVX512VL-NEXT:    retq ## encoding: [0xc3]
   %res = call <8 x i32> @llvm.x86.avx2.psrav.d.256(<8 x i32> , <8 x i32> )
   ret <8 x i32> %res
Index: test/CodeGen/X86/avx512bw-intrinsics-canonical.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-canonical.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-canonical.ll
@@ -2,8 +2,292 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512BW
 ; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512bw | FileCheck %s --check-prefix=ALL --check-prefix=AVX512F-32
 
-; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512bw-builtins.c
+;
+; Signed Saturation
+;
+
+define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %res = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %res
+}
+declare <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rr_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rr_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %sub
+}
+declare <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16>, <32 x i16>)
+
+define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rrk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rrk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm2, %zmm0
+; AVX512F-32-NEXT:    retl
+  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rrkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %edi, %k1
+; AVX512BW-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rrkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rm_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rm_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %sub
+}
+
+define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rmk_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1}
+; AVX512BW-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rmk_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1}
+; AVX512F-32-NEXT:    vmovdqa64 %zmm1, %zmm0
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> %passThru
+  ret <32 x i16> %res
+}
+
+define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rmkz_512:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    kmovd %esi, %k1
+; AVX512BW-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z}
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rmkz_512:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; AVX512F-32-NEXT:    kmovd {{[0-9]+}}(%esp), %k1
+; AVX512F-32-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z}
+; AVX512F-32-NEXT:    retl
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %sub = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> %a, <32 x i16> %b)
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i16> %sub, <32 x i16> zeroinitializer
+  ret <32 x i16> %res
+}
+
+
+define <64 x i16> @test_mask_adds_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_adds_epi16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpaddsw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_adds_epi16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpaddsw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpaddsw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
+  %1 = call <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
+  ret <64 x i16> %1
+}
+declare <64 x i16> @llvm.sadd.sat.v64i16(<64 x i16>, <64 x i16>)
+
+define <64 x i16> @test_mask_subs_epi16_rr_1024(<64 x i16> %a, <64 x i16> %b) {
+; AVX512BW-LABEL: test_mask_subs_epi16_rr_1024:
+; AVX512BW:       ## %bb.0:
+; AVX512BW-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
+; AVX512BW-NEXT:    vpsubsw %zmm3, %zmm1, %zmm1
+; AVX512BW-NEXT:    retq
+;
+; AVX512F-32-LABEL: test_mask_subs_epi16_rr_1024:
+; AVX512F-32:       # %bb.0:
+; AVX512F-32-NEXT:    pushl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_offset 8
+; AVX512F-32-NEXT:    .cfi_offset %ebp, -8
+; AVX512F-32-NEXT:    movl %esp, %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa_register %ebp
+; AVX512F-32-NEXT:    andl $-64, %esp
+; AVX512F-32-NEXT:    subl $64, %esp
+; AVX512F-32-NEXT:    vpsubsw %zmm2, %zmm0, %zmm0
+; AVX512F-32-NEXT:    vpsubsw 8(%ebp), %zmm1, %zmm1
+; AVX512F-32-NEXT:    movl %ebp, %esp
+; AVX512F-32-NEXT:    popl %ebp
+; AVX512F-32-NEXT:    .cfi_def_cfa %esp, 4
+; AVX512F-32-NEXT:    retl
  %sub = call <64 x i16> @llvm.ssub.sat.v64i16(<64 x i16> %a, <64 x i16> %b)
+  ret <64 x i16> %sub
+}
+declare <64 x i16> @llvm.ssub.sat.v64i16(<64 x i16>, <64 x i16>);
+
+;
+; Unsigned Saturation
+;
 
 define <32 x i16> @test_mask_adds_epu16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; AVX512BW-LABEL: test_mask_adds_epu16_rr_512:
Index: test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -3285,6 +3285,113 @@
 declare <64 x i8> @llvm.x86.avx512.mask.psubus.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64)
 
+define <32 x i16> @test_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_adds_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; X86-LABEL: test_adds_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_adds_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; X86-LABEL: test_adds_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_adds_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; X86-LABEL: test_adds_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_adds_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; X86-LABEL: test_adds_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_adds_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; X86-LABEL: test_adds_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_adds_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16>, <32 x i16>)
+
 define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-LABEL: test_mask_adds_epi16_rr_512:
 ; CHECK:       # %bb.0:
@@ -3384,6 +3491,113 @@
 declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32)
 
+define <32 x i16> @test_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
+; CHECK-LABEL: test_subs_epi16_rr_512:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1]
+; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
+; X86-LABEL: test_subs_epi16_rrk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
+; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_subs_epi16_rrk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
+; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
+; X86-LABEL: test_subs_epi16_rrkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
+; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_subs_epi16_rrkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
+; X86-LABEL: test_subs_epi16_rm_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_subs_epi16_rm_512:
+; X64:       # %bb.0:
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @test_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
+; X86-LABEL: test_subs_epi16_rmk_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x08]
+; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_subs_epi16_rmk_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x0f]
+; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
+  ret <32 x i16> %3
+}
+
+define <32 x i16> @test_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
+; X86-LABEL: test_subs_epi16_rmkz_512:
+; X86:       # %bb.0:
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
+; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x00]
+; X86-NEXT:    retl # encoding: [0xc3]
+;
+; X64-LABEL: test_subs_epi16_rmkz_512:
+; X64:       # %bb.0:
+; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x07]
+; X64-NEXT:    retq # encoding: [0xc3]
+  %b = load <32 x i16>, <32 x i16>* %ptr_b
+  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
+  %2 = bitcast i32 %mask to <32 x i1>
+  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
+  ret <32 x i16> %3
+}
+
+declare <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16>, <32 x i16>)
+
 define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
 ; CHECK-LABEL: test_mask_subs_epi16_rr_512:
 ; CHECK:       # %bb.0:
Index: test/CodeGen/X86/avx512bw-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512bw-intrinsics.ll
+++ test/CodeGen/X86/avx512bw-intrinsics.ll
@@ -683,221 +683,6 @@
 declare <64 x i8> @llvm.x86.avx512.packuswb.512(<32 x i16>, <32 x i16>)
 
-define <32 x i16> @test_mask_adds_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; CHECK-LABEL: test_mask_adds_epi16_rr_512:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0xc1]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_adds_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; X86-LABEL: test_mask_adds_epi16_rrk_512:
-; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
-; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_adds_epi16_rrk_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0xd1]
-; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_adds_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; X86-LABEL: test_mask_adds_epi16_rrkz_512:
-; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_adds_epi16_rrkz_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpaddsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0xc1]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_adds_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; X86-LABEL: test_mask_adds_epi16_rm_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x00]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_adds_epi16_rm_512:
-; X64:       # %bb.0:
-; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xed,0x07]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_adds_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; X86-LABEL: test_mask_adds_epi16_rmk_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x08]
-; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_adds_epi16_rmk_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xed,0x0f]
-; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_adds_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; X86-LABEL: test_mask_adds_epi16_rmkz_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpaddsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x00]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_adds_epi16_rmkz_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpaddsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xed,0x07]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-declare <32 x i16> @llvm.x86.avx512.padds.w.512(<32 x i16>, <32 x i16>)
-
-define <32 x i16> @test_mask_subs_epi16_rr_512(<32 x i16> %a, <32 x i16> %b) {
-; CHECK-LABEL: test_mask_subs_epi16_rr_512:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0xc1]
-; CHECK-NEXT:    ret{{[l|q]}} # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_subs_epi16_rrk_512(<32 x i16> %a, <32 x i16> %b, <32 x i16> %passThru, i32 %mask) {
-; X86-LABEL: test_mask_subs_epi16_rrk_512:
-; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
-; X86-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_subs_epi16_rrk_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0xd1]
-; X64-NEXT:    vmovdqa64 %zmm2, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc2]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_subs_epi16_rrkz_512(<32 x i16> %a, <32 x i16> %b, i32 %mask) {
-; X86-LABEL: test_mask_subs_epi16_rrkz_512:
-; X86:       # %bb.0:
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04]
-; X86-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_subs_epi16_rrkz_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
-; X64-NEXT:    vpsubsw %zmm1, %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0xc1]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_subs_epi16_rm_512(<32 x i16> %a, <32 x i16>* %ptr_b) {
-; X86-LABEL: test_mask_subs_epi16_rm_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x00]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_subs_epi16_rm_512:
-; X64:       # %bb.0:
-; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 # encoding: [0x62,0xf1,0x7d,0x48,0xe9,0x07]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  ret <32 x i16> %1
-}
-
-define <32 x i16> @test_mask_subs_epi16_rmk_512(<32 x i16> %a, <32 x i16>* %ptr_b, <32 x i16> %passThru, i32 %mask) {
-; X86-LABEL: test_mask_subs_epi16_rmk_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x08]
-; X86-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_subs_epi16_rmk_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0xe9,0x0f]
-; X64-NEXT:    vmovdqa64 %zmm1, %zmm0 # encoding: [0x62,0xf1,0xfd,0x48,0x6f,0xc1]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> %passThru
-  ret <32 x i16> %3
-}
-
-define <32 x i16> @test_mask_subs_epi16_rmkz_512(<32 x i16> %a, <32 x i16>* %ptr_b, i32 %mask) {
-; X86-LABEL: test_mask_subs_epi16_rmkz_512:
-; X86:       # %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
-; X86-NEXT:    kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08]
-; X86-NEXT:    vpsubsw (%eax), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x00]
-; X86-NEXT:    retl # encoding: [0xc3]
-;
-; X64-LABEL: test_mask_subs_epi16_rmkz_512:
-; X64:       # %bb.0:
-; X64-NEXT:    kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
-; X64-NEXT:    vpsubsw (%rdi), %zmm0, %zmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xc9,0xe9,0x07]
-; X64-NEXT:    retq # encoding: [0xc3]
-  %b = load <32 x i16>, <32 x i16>* %ptr_b
-  %1 = call <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16> %a, <32 x i16> %b)
-  %2 = bitcast
i32 %mask to <32 x i1>
-  %3 = select <32 x i1> %2, <32 x i16> %1, <32 x i16> zeroinitializer
-  ret <32 x i16> %3
-}
-
-declare <32 x i16> @llvm.x86.avx512.psubs.w.512(<32 x i16>, <32 x i16>)
-
-
 define <32 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_512(<32 x i16> %x0, <32 x i16> %x1, <32 x i16> %x2, i32 %x3) {
 ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_512:
 ; X86: # %bb.0:
Index: test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics-canonical.ll
@@ -1,9 +1,593 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl -mattr=+avx512bw -mattr=+avx512vl --show-mc-encoding| FileCheck %s
-; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/sse2-builtins.c
+; NOTE: This should use IR equivalent to what is generated by clang/test/CodeGen/avx512vlbw-builtins.c
+;
+; Signed Saturation
+;
+
+define <8 x i16> @test_mask_adds_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_mask_adds_epi16_rr_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)
+
+define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1]
+; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rrkz_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf]
+; CHECK-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_mask_adds_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) {
+; CHECK-LABEL: test_mask_adds_epi16_rm_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0x07]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %b = load <8 x i16>, <8 x i16>* %ptr_b
+  %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; CHECK-LABEL: test_mask_adds_epi16_rmk_128:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce]
+; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f]
+; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX 
Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_adds_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %1 +} +declare <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16>, <16 x i16>) + +define <16 x i16> @test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %1 +} + +define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load 
<16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %sub +} +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) + +define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rm_128(<8 x i16> %a, <8 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + ret <8 x i16> %sub +} + +define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> 
%b) + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> %passThru + ret <8 x i16> %res +} + +define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %sub = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a, <8 x i16> %b) + %bc = bitcast i8 %mask to <8 x i1> + %res = select <8 x i1> %bc, <8 x i16> %sub, <8 x i16> zeroinitializer + ret <8 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rr_256(<16 x i16> %a, <16 x i16> %b) { +; CHECK-LABEL: test_mask_subs_epi16_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %sub +} +declare <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16>, <16 x i16>) + +define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rm_256(<16 x i16> %a, <16 x i16>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi16_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + ret <16 x i16> %sub +} + +define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %bc = bitcast i16 %mask to <16 x i1> 
+ %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> %passThru + ret <16 x i16> %res +} + +define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi16_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %sub = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> %a, <16 x i16> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i16> %sub, <16 x i16> zeroinitializer + ret <16 x i16> %res +} + +define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %1 +} +declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> 
@test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_adds_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %1 +} +declare <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8>, <32 x i8>) + +define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_adds_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %1 +} + +define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_adds_epi8_rmkz_256: +; CHECK: ## %bb.0: +; 
CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <16 x i8> @test_mask_subs_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %sub +} +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) + +define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %xmm2, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %sub +} + +define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %xmm1, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> %passThru + ret <16 x i8> %res +} + +define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_128: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 
{%k1} {z} ## encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %sub = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a, <16 x i8> %b) + %bc = bitcast i16 %mask to <16 x i1> + %res = select <16 x i1> %bc, <16 x i8> %sub, <16 x i8> zeroinitializer + ret <16 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rr_256(<32 x i8> %a, <32 x i8> %b) { +; CHECK-LABEL: test_mask_subs_epi8_rr_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %sub +} +declare <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8>, <32 x i8>) + +define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; CHECK-NEXT: vmovdqa %ymm2, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rrkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %edi, %k1 ## encoding: [0xc5,0xfb,0x92,0xcf] +; CHECK-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rm_256(<32 x i8> %a, <32 x i8>* %ptr_b) { +; CHECK-LABEL: test_mask_subs_epi8_rm_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + ret <32 x i8> %sub +} + +define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmk_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} ## encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] +; CHECK-NEXT: vmovdqa %ymm1, %ymm0 ## EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b) + %bc = bitcast i32 %mask to <32 x i1> + %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> %passThru + ret <32 x i8> %res +} + +define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; CHECK-LABEL: test_mask_subs_epi8_rmkz_256: +; CHECK: ## %bb.0: +; CHECK-NEXT: kmovd %esi, %k1 ## encoding: [0xc5,0xfb,0x92,0xce] +; CHECK-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] +; CHECK-NEXT: retq ## encoding: [0xc3] + %b = 
load <32 x i8>, <32 x i8>* %ptr_b
+  %sub = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> %a, <32 x i8> %b)
+  %bc = bitcast i32 %mask to <32 x i1>
+  %res = select <32 x i1> %bc, <32 x i8> %sub, <32 x i8> zeroinitializer
+  ret <32 x i8> %res
+}
+
+;
+; Unsigned Saturation
+;
+
 define <8 x i16> @test_mask_adds_epu16_rr_128(<8 x i16> %a, <8 x i16> %b) {
 ; CHECK-LABEL: test_mask_adds_epu16_rr_128:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512bwvl-intrinsics-upgrade.ll
@@ -7348,6 +7348,338 @@
 declare <16 x i16> @llvm.x86.avx512.mask.padds.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16)
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) #0
+
+define <8 x i16> @test_test_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) {
+; X86-LABEL: test_test_subs_epi16_rrk_128:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
+; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_test_subs_epi16_rrk_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1]
+; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2]
+; X64-NEXT: retq # encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_test_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) {
+; X86-LABEL: test_test_subs_epi16_rrkz_128:
+; X86: # %bb.0:
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04]
+; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8]
+; X86-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_test_subs_epi16_rrkz_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf]
+; X64-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1]
+; X64-NEXT: retq # encoding: [0xc3]
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b)
+  %2 = bitcast i8 %mask to <8 x i1>
+  %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer
+  ret <8 x i16> %3
+}
+
+define <8 x i16> @test_test_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) {
+; X86-LABEL: test_test_subs_epi16_rmk_128:
+; X86: # %bb.0:
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04]
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08]
+; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9]
+; X86-NEXT: vpsubsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x08]
+; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1]
+; X86-NEXT: retl # encoding: [0xc3]
+;
+; X64-LABEL: test_test_subs_epi16_rmk_128:
+; X64: # %bb.0:
+; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce]
+; X64-NEXT: 
vpsubsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_test_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; X86-LABEL: test_test_subs_epi16_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpsubsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi16_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) #0 + +define <16 x i16> @test_test_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_test_subs_epi16_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi16_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_test_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; X86-LABEL: test_test_subs_epi16_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi16_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_test_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: 
test_test_subs_epi16_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi16_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_test_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; X86-LABEL: test_test_subs_epi16_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi16_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) #0 + +define <16 x i8> @test_test_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_test_subs_epi8_rrk_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_test_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; X86-LABEL: test_test_subs_epi8_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: 
[0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_test_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_test_subs_epi8_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_test_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; X86-LABEL: test_test_subs_epi8_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) #0 + +define <32 x i8> @test_test_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_test_subs_epi8_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x 
i8> @test_test_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; X86-LABEL: test_test_subs_epi8_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_test_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_test_subs_epi8_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_test_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; X86-LABEL: test_test_subs_epi8_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpsubsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_test_subs_epi8_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + define <8 x i16> @test_mask_subs_epi16_rr_128(<8 x i16> %a, <8 x i16> %b) { ; CHECK-LABEL: test_mask_subs_epi16_rr_128: ; CHECK: # %bb.0: @@ -7550,6 +7882,354 @@ declare <16 x i16> @llvm.x86.avx512.mask.psubs.w.256(<16 x i16>, <16 x i16>, <16 x i16>, i16) +declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) #0 + +define <8 x i16> @test_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_adds_epi16_rrk_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; 
X86-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { +; X86-LABEL: test_adds_epi16_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] +; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] +; X86-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +define <8 x i16> @test_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { +; X86-LABEL: test_adds_epi16_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpaddsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru + ret <8 x i16> %3 +} + +define <8 x i16> @test_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { +; X86-LABEL: test_adds_epi16_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] +; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] +; X86-NEXT: vpaddsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] +; X64-NEXT: retq # 
encoding: [0xc3] + %b = load <8 x i16>, <8 x i16>* %ptr_b + %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) + %2 = bitcast i8 %mask to <8 x i1> + %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer + ret <8 x i16> %3 +} + +declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) #0 + +define <16 x i16> @test_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_adds_epi16_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { +; X86-LABEL: test_adds_epi16_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +define <16 x i16> @test_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { +; X86-LABEL: test_adds_epi16_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru + ret <16 x i16> %3 +} + +define <16 x i16> @test_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { +; X86-LABEL: test_adds_epi16_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw 
{{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi16_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i16>, <16 x i16>* %ptr_b + %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer + ret <16 x i16> %3 +} + +declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) #0 + +define <16 x i8> @test_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_adds_epi8_rrk_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rrk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] +; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { +; X86-LABEL: test_adds_epi8_rrkz_128: +; X86: # %bb.0: +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rrkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +define <16 x i8> @test_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { +; X86-LABEL: test_adds_epi8_rm_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rm_128: +; X64: # %bb.0: +; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) + ret <16 x i8> %1 +} + +define <16 x i8> @test_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { +; X86-LABEL: test_adds_epi8_rmk_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # 
encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x08] +; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rmk_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] +; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru + ret <16 x i8> %3 +} + +define <16 x i8> @test_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { +; X86-LABEL: test_adds_epi8_rmkz_128: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rmkz_128: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <16 x i8>, <16 x i8>* %ptr_b + %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) + %2 = bitcast i16 %mask to <16 x i1> + %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer + ret <16 x i8> %3 +} + +declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) #0 + +define <32 x i8> @test_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_adds_epi8_rrk_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rrk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] +; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { +; X86-LABEL: test_adds_epi8_rrkz_256: +; X86: # %bb.0: +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] +; X86-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rrkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] +; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) + %2 = 
bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + +define <32 x i8> @test_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { +; X86-LABEL: test_adds_epi8_rmk_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x08] +; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rmk_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] +; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru + ret <32 x i8> %3 +} + +define <32 x i8> @test_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { +; X86-LABEL: test_adds_epi8_rmkz_256: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] +; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] +; X86-NEXT: vpaddsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x00] +; X86-NEXT: retl # encoding: [0xc3] +; +; X64-LABEL: test_adds_epi8_rmkz_256: +; X64: # %bb.0: +; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] +; X64-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] +; X64-NEXT: retq # encoding: [0xc3] + %b = load <32 x i8>, <32 x i8>* %ptr_b + %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) + %2 = bitcast i32 %mask to <32 x i1> + %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer + ret <32 x i8> %3 +} + define <16 x i8> @test_mask_adds_epi8_rr_128(<16 x i8> %a, <16 x i8> %b) { ; CHECK-LABEL: test_mask_adds_epi8_rr_128: ; CHECK: # %bb.0: @@ -7945,4 +8625,3 @@ } declare <32 x i8> @llvm.x86.avx512.mask.psubs.b.256(<32 x i8>, <32 x i8>, <32 x i8>, i32) - Index: test/CodeGen/X86/avx512bwvl-intrinsics.ll =================================================================== --- test/CodeGen/X86/avx512bwvl-intrinsics.ll +++ test/CodeGen/X86/avx512bwvl-intrinsics.ll @@ -1127,686 +1127,6 @@ declare <32 x i8> @llvm.x86.avx2.packuswb(<16 x i16>, <16 x i16>) -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) #0 - -define <8 x i16> @test_mask_adds_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_adds_epi16_rrk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: 
[0x62,0xf1,0x7d,0x09,0xed,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_adds_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; X86-LABEL: test_mask_adds_epi16_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_adds_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_adds_epi16_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] -; X86-NEXT: vpaddsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xed,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_adds_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; X86-LABEL: test_mask_adds_epi16_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] -; X86-NEXT: vpaddsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xed,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) #0 - -define <16 x i16> 
@test_mask_adds_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epi16_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_adds_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; X86-LABEL: test_mask_adds_epi16_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_adds_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epi16_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xed,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_adds_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_adds_epi16_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi16_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 
# encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xed,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) #0 - -define <8 x i16> @test_mask_subs_epi16_rrk_128(<8 x i16> %a, <8 x i16> %b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_subs_epi16_rrk_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsw %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_subs_epi16_rrkz_128(<8 x i16> %a, <8 x i16> %b, i8 %mask) { -; X86-LABEL: test_mask_subs_epi16_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax # encoding: [0x0f,0xb6,0x44,0x24,0x04] -; X86-NEXT: kmovd %eax, %k1 # encoding: [0xc5,0xfb,0x92,0xc8] -; X86-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_subs_epi16_rmk_128(<8 x i16> %a, <8 x i16>* %ptr_b, <8 x i16> %passThru, i8 %mask) { -; X86-LABEL: test_mask_subs_epi16_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] -; X86-NEXT: vpsubsw (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsw (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe9,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask 
to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> %passThru - ret <8 x i16> %3 -} - -define <8 x i16> @test_mask_subs_epi16_rmkz_128(<8 x i16> %a, <8 x i16>* %ptr_b, i8 %mask) { -; X86-LABEL: test_mask_subs_epi16_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx # encoding: [0x0f,0xb6,0x4c,0x24,0x08] -; X86-NEXT: kmovd %ecx, %k1 # encoding: [0xc5,0xfb,0x92,0xc9] -; X86-NEXT: vpsubsw (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsw (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <8 x i16>, <8 x i16>* %ptr_b - %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a, <8 x i16> %b) - %2 = bitcast i8 %mask to <8 x i1> - %3 = select <8 x i1> %2, <8 x i16> %1, <8 x i16> zeroinitializer - ret <8 x i16> %3 -} - -declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) #0 - -define <16 x i16> @test_mask_subs_epi16_rrk_256(<16 x i16> %a, <16 x i16> %b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epi16_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_subs_epi16_rrkz_256(<16 x i16> %a, <16 x i16> %b, i16 %mask) { -; X86-LABEL: test_mask_subs_epi16_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsw %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_subs_epi16_rmk_256(<16 x i16> %a, <16 x i16>* %ptr_b, <16 x i16> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epi16_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsw (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: 
[0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsw (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe9,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> %passThru - ret <16 x i16> %3 -} - -define <16 x i16> @test_mask_subs_epi16_rmkz_256(<16 x i16> %a, <16 x i16>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_subs_epi16_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsw (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi16_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsw (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe9,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i16>, <16 x i16>* %ptr_b - %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> %a, <16 x i16> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i16> %1, <16 x i16> zeroinitializer - ret <16 x i16> %3 -} - -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) #0 - -define <16 x i8> @test_mask_adds_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epi8_rrk_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_adds_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; X86-LABEL: test_mask_adds_epi8_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -define <16 x i8> 
@test_mask_adds_epi8_rm_128(<16 x i8> %a, <16 x i8>* %ptr_b) { -; X86-LABEL: test_mask_adds_epi8_rm_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rm_128: -; X64: # %bb.0: -; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) - ret <16 x i8> %1 -} - -define <16 x i8> @test_mask_adds_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_adds_epi8_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xec,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_adds_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_adds_epi8_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xec,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) #0 - -define <32 x i8> @test_mask_adds_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epi8_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0xd1] -; X64-NEXT: vmovdqa 
%ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_adds_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; X86-LABEL: test_mask_adds_epi8_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpaddsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_adds_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_adds_epi8_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xec,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_adds_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_adds_epi8_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpaddsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_adds_epi8_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpaddsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xec,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) #0 - -define <16 x i8> @test_mask_subs_epi8_rrk_128(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epi8_rrk_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] -; X86-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rrk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsb %xmm1, %xmm0, %xmm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0xd1] -; X64-NEXT: vmovdqa %xmm2, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_subs_epi8_rrkz_128(<16 x i8> %a, <16 x i8> %b, i16 %mask) { -; X86-LABEL: test_mask_subs_epi8_rrkz_128: -; X86: # %bb.0: -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rrkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_subs_epi8_rmk_128(<16 x i8> %a, <16 x i8>* %ptr_b, <16 x i8> %passThru, i16 %mask) { -; X86-LABEL: test_mask_subs_epi8_rmk_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsb (%eax), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x08] -; X86-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rmk_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsb (%rdi), %xmm0, %xmm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x09,0xe8,0x0f] -; X64-NEXT: vmovdqa %xmm1, %xmm0 # EVEX TO VEX Compression encoding: [0xc5,0xf9,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> %passThru - ret <16 x i8> %3 -} - -define <16 x i8> @test_mask_subs_epi8_rmkz_128(<16 x i8> %a, <16 x i8>* %ptr_b, i16 %mask) { -; X86-LABEL: test_mask_subs_epi8_rmkz_128: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovw {{[0-9]+}}(%esp), %k1 # encoding: [0xc5,0xf8,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsb (%eax), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rmkz_128: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsb (%rdi), %xmm0, %xmm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0x89,0xe8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <16 x i8>, <16 x i8>* %ptr_b - %1 = call <16 x i8> 
@llvm.x86.sse2.psubs.b(<16 x i8> %a, <16 x i8> %b) - %2 = bitcast i16 %mask to <16 x i1> - %3 = select <16 x i1> %2, <16 x i8> %1, <16 x i8> zeroinitializer - ret <16 x i8> %3 -} - -declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) #0 - -define <32 x i8> @test_mask_subs_epi8_rrk_256(<32 x i8> %a, <32 x i8> %b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epi8_rrk_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] -; X86-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rrk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm2 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0xd1] -; X64-NEXT: vmovdqa %ymm2, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc2] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_subs_epi8_rrkz_256(<32 x i8> %a, <32 x i8> %b, i32 %mask) { -; X86-LABEL: test_mask_subs_epi8_rrkz_256: -; X86: # %bb.0: -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x04] -; X86-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rrkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %edi, %k1 # encoding: [0xc5,0xfb,0x92,0xcf] -; X64-NEXT: vpsubsb %ymm1, %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_subs_epi8_rmk_256(<32 x i8> %a, <32 x i8>* %ptr_b, <32 x i8> %passThru, i32 %mask) { -; X86-LABEL: test_mask_subs_epi8_rmk_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: [0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsb (%eax), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x08] -; X86-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rmk_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsb (%rdi), %ymm0, %ymm1 {%k1} # encoding: [0x62,0xf1,0x7d,0x29,0xe8,0x0f] -; X64-NEXT: vmovdqa %ymm1, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x6f,0xc1] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> %passThru - ret <32 x i8> %3 -} - -define <32 x i8> @test_mask_subs_epi8_rmkz_256(<32 x i8> %a, <32 x i8>* %ptr_b, i32 %mask) { -; X86-LABEL: test_mask_subs_epi8_rmkz_256: -; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %eax # encoding: [0x8b,0x44,0x24,0x04] -; X86-NEXT: kmovd {{[0-9]+}}(%esp), %k1 # encoding: 
[0xc4,0xe1,0xf9,0x90,0x4c,0x24,0x08] -; X86-NEXT: vpsubsb (%eax), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x00] -; X86-NEXT: retl # encoding: [0xc3] -; -; X64-LABEL: test_mask_subs_epi8_rmkz_256: -; X64: # %bb.0: -; X64-NEXT: kmovd %esi, %k1 # encoding: [0xc5,0xfb,0x92,0xce] -; X64-NEXT: vpsubsb (%rdi), %ymm0, %ymm0 {%k1} {z} # encoding: [0x62,0xf1,0x7d,0xa9,0xe8,0x07] -; X64-NEXT: retq # encoding: [0xc3] - %b = load <32 x i8>, <32 x i8>* %ptr_b - %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> %a, <32 x i8> %b) - %2 = bitcast i32 %mask to <32 x i1> - %3 = select <32 x i1> %2, <32 x i8> %1, <32 x i8> zeroinitializer - ret <32 x i8> %3 -} - define <8 x i16>@test_int_x86_avx512_mask_vpermt2var_hi_128(<8 x i16> %x0, <8 x i16> %x1, <8 x i16> %x2, i8 %x3) { ; X86-LABEL: test_int_x86_avx512_mask_vpermt2var_hi_128: ; X86: # %bb.0: Index: test/CodeGen/X86/pic-load-remat.ll =================================================================== --- test/CodeGen/X86/pic-load-remat.ll +++ test/CodeGen/X86/pic-load-remat.ll @@ -5,30 +5,30 @@ br label %bb bb: ; preds = %bb, %entry - %tmp4403 = tail call <8 x i16> @llvm.x86.sse2.psubs.w( <8 x i16> zeroinitializer, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=2] - %tmp4443 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> zeroinitializer, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4403 = tail call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> zeroinitializer, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=2] + %tmp4443 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> zeroinitializer, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4609 = tail call <8 x i16> @llvm.x86.sse2.psll.w( <8 x i16> zeroinitializer, <8 x i16> bitcast (<4 x i32> < i32 3, i32 5, i32 6, i32 9 > to <8 x i16>) ) ; <<8 x i16>> [#uses=1] %tmp4651 = add <8 x i16> %tmp4609, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1 > ; <<8 x i16>> [#uses=1] %tmp4658 = tail call <8 x i16> @llvm.x86.sse2.psll.w( <8 x i16> %tmp4651, <8 x i16> bitcast (<4 x i32> < i32 4, i32 1, i32 2, i32 3 > to <8 x i16>) ) ; <<8 x i16>> [#uses=1] %tmp4669 = tail call <8 x i16> @llvm.x86.sse2.pavg.w( <8 x i16> < i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170 >, <8 x i16> %tmp4443 ) nounwind readnone ; <<8 x i16>> [#uses=2] - %tmp4679 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> %tmp4669, <8 x i16> %tmp4669 ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4679 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4669, <8 x i16> %tmp4669 ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4689 = add <8 x i16> %tmp4679, %tmp4658 ; <<8 x i16>> [#uses=1] - %tmp4700 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> %tmp4689, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4700 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4689, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4708 = bitcast <8 x i16> %tmp4700 to <2 x i64> ; <<2 x i64>> [#uses=1] %tmp4772 = add <8 x i16> zeroinitializer, < i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1 > ; <<8 x i16>> [#uses=1] %tmp4779 = tail call <8 x i16> @llvm.x86.sse2.psll.w( <8 x i16> %tmp4772, <8 x i16> bitcast (<4 x i32> < i32 3, i32 5, i32 undef, i32 7 > to <8 x i16>) ) ; <<8 x i16>> [#uses=1] %tmp4810 = add <8 x i16> zeroinitializer, %tmp4779 ; <<8 x i16>> [#uses=1] - %tmp4821 = tail call <8 x i16> 
@llvm.x86.sse2.padds.w( <8 x i16> %tmp4810, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4821 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4810, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4829 = bitcast <8 x i16> %tmp4821 to <2 x i64> ; <<2 x i64>> [#uses=1] %tmp4900 = tail call <8 x i16> @llvm.x86.sse2.psll.w( <8 x i16> zeroinitializer, <8 x i16> bitcast (<4 x i32> < i32 1, i32 1, i32 2, i32 2 > to <8 x i16>) ) ; <<8 x i16>> [#uses=1] %tmp4911 = tail call <8 x i16> @llvm.x86.sse2.pavg.w( <8 x i16> < i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170, i16 -23170 >, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=2] - %tmp4921 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> %tmp4911, <8 x i16> %tmp4911 ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4921 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4911, <8 x i16> %tmp4911 ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4931 = add <8 x i16> %tmp4921, %tmp4900 ; <<8 x i16>> [#uses=1] - %tmp4942 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> %tmp4931, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4942 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4931, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4950 = bitcast <8 x i16> %tmp4942 to <2 x i64> ; <<2 x i64>> [#uses=1] - %tmp4957 = tail call <8 x i16> @llvm.x86.sse2.padds.w( <8 x i16> %tmp4403, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4957 = tail call <8 x i16> @llvm.sadd.sat.v8i16( <8 x i16> %tmp4403, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4958 = bitcast <8 x i16> %tmp4957 to <2 x i64> ; <<2 x i64>> [#uses=1] - %tmp4967 = tail call <8 x i16> @llvm.x86.sse2.psubs.w( <8 x i16> %tmp4403, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] + %tmp4967 = tail call <8 x i16> @llvm.ssub.sat.v8i16( <8 x i16> %tmp4403, <8 x i16> zeroinitializer ) nounwind readnone ; <<8 x i16>> [#uses=1] %tmp4968 = bitcast <8 x i16> %tmp4967 to <2 x i64> ; <<2 x i64>> [#uses=1] store <2 x i64> %tmp4829, <2 x i64>* null, align 16 store <2 x i64> %tmp4958, <2 x i64>* null, align 16 @@ -42,6 +42,6 @@ declare <8 x i16> @llvm.x86.sse2.pavg.w(<8 x i16>, <8 x i16>) nounwind readnone -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -151,11 +151,11 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %arg0, <16 x i8> %arg1) + %res = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %res to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone +declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_adds_epi16(<2 x i64> %a0, <2 x i64> %a1) 
nounwind { ; SSE-LABEL: test_mm_adds_epi16: @@ -174,11 +174,11 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %arg0, <8 x i16> %arg1) + %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_adds_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_adds_epu8: @@ -6174,11 +6174,11 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <16 x i8> %arg1 = bitcast <2 x i64> %a1 to <16 x i8> - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %arg0, <16 x i8> %arg1) + %res = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %arg0, <16 x i8> %arg1) %bc = bitcast <16 x i8> %res to <2 x i64> ret <2 x i64> %bc } -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone define <2 x i64> @test_mm_subs_epi16(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_subs_epi16: @@ -6197,11 +6197,11 @@ ; AVX512-NEXT: ret{{[l|q]}} # encoding: [0xc3] %arg0 = bitcast <2 x i64> %a0 to <8 x i16> %arg1 = bitcast <2 x i64> %a1 to <8 x i16> - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %arg0, <8 x i16> %arg1) + %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %arg0, <8 x i16> %arg1) %bc = bitcast <8 x i16> %res to <2 x i64> ret <2 x i64> %bc } -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone define <2 x i64> @test_mm_subs_epu8(<2 x i64> %a0, <2 x i64> %a1) nounwind { ; SSE-LABEL: test_mm_subs_epu8: Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll +++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll @@ -884,6 +884,48 @@ declare <4 x float> @llvm.x86.sse2.cvtdq2ps(<4 x i32>) nounwind readnone +define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_padds_b: +; SSE: ## %bb.0: +; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_padds_b: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_padds_b: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_padds_w: +; SSE: ## %bb.0: +; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_padds_w: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] +; 
AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_padds_w: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + + define <16 x i8> @test_x86_sse2_paddus_b(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: test_x86_sse2_paddus_b: ; SSE: ## %bb.0: @@ -926,6 +968,48 @@ declare <8 x i16> @llvm.x86.sse2.paddus.w(<8 x i16>, <8 x i16>) nounwind readnone +define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_b: +; SSE: ## %bb.0: +; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_psubs_b: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_psubs_b: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] + ret <16 x i8> %res +} +declare <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8>, <16 x i8>) nounwind readnone + + +define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { +; SSE-LABEL: test_x86_sse2_psubs_w: +; SSE: ## %bb.0: +; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX1-LABEL: test_x86_sse2_psubs_w: +; AVX1: ## %bb.0: +; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] +; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; +; AVX512-LABEL: test_x86_sse2_psubs_w: +; AVX512: ## %bb.0: +; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] + %res = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] + ret <8 x i16> %res +} +declare <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16>, <8 x i16>) nounwind readnone + + define <16 x i8> @test_x86_sse2_psubus_b(<16 x i8> %a0, <16 x i8> %a1) { ; SSE-LABEL: test_x86_sse2_psubus_b: ; SSE: ## %bb.0: Index: test/CodeGen/X86/sse2-intrinsics-x86.ll =================================================================== --- test/CodeGen/X86/sse2-intrinsics-x86.ll +++ test/CodeGen/X86/sse2-intrinsics-x86.ll @@ -919,48 +919,6 @@ } -define <16 x i8> @test_x86_sse2_padds_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_padds_b: -; SSE: ## %bb.0: -; SSE-NEXT: paddsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xec,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_padds_b: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xec,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_padds_b: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpaddsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xec,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> 
@llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_padds_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_padds_w: -; SSE: ## %bb.0: -; SSE-NEXT: paddsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xed,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_padds_w: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xed,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_padds_w: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpaddsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xed,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone - - define <4 x i32> @test_x86_sse2_pmadd_wd(<8 x i16> %a0, <8 x i16> %a1) { ; SSE-LABEL: test_x86_sse2_pmadd_wd: ; SSE: ## %bb.0: @@ -1520,48 +1478,6 @@ declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32) nounwind readnone -define <16 x i8> @test_x86_sse2_psubs_b(<16 x i8> %a0, <16 x i8> %a1) { -; SSE-LABEL: test_x86_sse2_psubs_b: -; SSE: ## %bb.0: -; SSE-NEXT: psubsb %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe8,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_psubs_b: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe8,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_psubs_b: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpsubsb %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe8,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> %a0, <16 x i8> %a1) ; <<16 x i8>> [#uses=1] - ret <16 x i8> %res -} -declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone - - -define <8 x i16> @test_x86_sse2_psubs_w(<8 x i16> %a0, <8 x i16> %a1) { -; SSE-LABEL: test_x86_sse2_psubs_w: -; SSE: ## %bb.0: -; SSE-NEXT: psubsw %xmm1, %xmm0 ## encoding: [0x66,0x0f,0xe9,0xc1] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX1-LABEL: test_x86_sse2_psubs_w: -; AVX1: ## %bb.0: -; AVX1-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0xe9,0xc1] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] -; -; AVX512-LABEL: test_x86_sse2_psubs_w: -; AVX512: ## %bb.0: -; AVX512-NEXT: vpsubsw %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0xe9,0xc1] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] - %res = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> %a0, <8 x i16> %a1) ; <<8 x i16>> [#uses=1] - ret <8 x i16> %res -} -declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone - - define i32 @test_x86_sse2_ucomieq_sd(<2 x double> %a0, <2 x double> %a1) { ; SSE-LABEL: test_x86_sse2_ucomieq_sd: ; SSE: ## %bb.0: Index: test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll =================================================================== --- test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll +++ test/Instrumentation/MemorySanitizer/msan_x86intrinsics.ll @@ -46,14 +46,14 @@ ; Check that shadow is OR'ed, and origin is Select'ed ; And no shadow checks! 
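; NOTE (reviewer sketch, not part of the checked-in test): padds.w can no
; longer stand in for an arbitrary opaque x86 intrinsic here, because
; AutoUpgrade now rewrites it to the generic @llvm.sadd.sat.v8i16 before
; MemorySanitizer ever sees the module; the test below is therefore switched
; to pmulhu.w, which remains a target-specific intrinsic. Assuming a plain
; SSE2 module, the upgrade turns
;   %r = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b)
; into
;   %r = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> %a, <8 x i16> %b)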
-define <8 x i16> @Paddsw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory { - %call = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) +define <8 x i16> @Pmulhuw128(<8 x i16> %a, <8 x i16> %b) nounwind uwtable sanitize_memory { + %call = call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b) ret <8 x i16> %call } -declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> %a, <8 x i16> %b) nounwind +declare <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %a, <8 x i16> %b) nounwind -; CHECK-LABEL: @Paddsw128 +; CHECK-LABEL: @Pmulhuw128 ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls ; CHECK-ORIGINS: load i32, i32* {{.*}} @__msan_param_origin_tls ; CHECK-NEXT: load <8 x i16>, <8 x i16>* {{.*}} @__msan_param_tls @@ -62,7 +62,7 @@ ; CHECK-ORIGINS: = bitcast <8 x i16> {{.*}} to i128 ; CHECK-ORIGINS-NEXT: = icmp ne i128 {{.*}}, 0 ; CHECK-ORIGINS-NEXT: = select i1 {{.*}}, i32 {{.*}}, i32 -; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.padds.w +; CHECK-NEXT: call <8 x i16> @llvm.x86.sse2.pmulhu.w ; CHECK-NEXT: store <8 x i16> {{.*}} @__msan_retval_tls ; CHECK-ORIGINS: store i32 {{.*}} @__msan_retval_origin_tls ; CHECK-NEXT: ret <8 x i16> Index: test/Transforms/InstCombine/X86/x86-adds-subs.ll =================================================================== --- test/Transforms/InstCombine/X86/x86-adds-subs.ll +++ test/Transforms/InstCombine/X86/x86-adds-subs.ll @@ -27,7 +27,8 @@ define <16 x i8> @sse2_adds_b_constant_undefs() { ; CHECK-LABEL: @sse2_adds_b_constant_undefs( -; CHECK-NEXT: ret <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> , <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 @@ -59,7 +60,8 @@ define <32 x i8> @avx2_adds_b_constant_undefs() { ; CHECK-LABEL: @avx2_adds_b_constant_undefs( -; CHECK-NEXT: ret <32 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.sadd.sat.v32i8(<32 x i8> , <32 x i8> ) +; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 @@ -91,7 +93,8 @@ define <64 x i8> @avx512_mask_adds_b_constant_undefs() { ; CHECK-LABEL: @avx512_mask_adds_b_constant_undefs( -; CHECK-NEXT: ret <64 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <64 x i8> @llvm.sadd.sat.v64i8(<64 x i8> , <64 x i8> ) +; CHECK-NEXT: ret <64 x i8> [[TMP1]] ; %1 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %1 @@ -123,7 +126,8 @@ define <8 x i16> @sse2_adds_w_constant_undefs() { ; CHECK-LABEL: @sse2_adds_w_constant_undefs( -; CHECK-NEXT: ret <8 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> , <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TMP1]] ; %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> , <8 x i16> ) ret <8 x i16> %1 @@ -155,7 +159,8 @@ define <16 x i16> @avx2_adds_w_constant_undefs() { ; CHECK-LABEL: @avx2_adds_w_constant_undefs( -; CHECK-NEXT: ret <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.sadd.sat.v16i16(<16 x i16> , <16 x i16> ) +; CHECK-NEXT: ret <16 x i16> [[TMP1]] ; %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> , <16 x i16> ) ret <16 x i16> %1 @@ -187,7 +192,8 @@ define <32 x i16> @avx512_mask_adds_w_constant_undefs() { ; CHECK-LABEL: @avx512_mask_adds_w_constant_undefs( -; CHECK-NEXT: ret <32 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i16> @llvm.sadd.sat.v32i16(<32 x i16> , <32 x i16> ) +; CHECK-NEXT: ret 
<32 x i16> [[TMP1]] ; %1 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %1 @@ -219,7 +225,8 @@ define <16 x i8> @sse2_subs_b_constant_undefs() { ; CHECK-LABEL: @sse2_subs_b_constant_undefs( -; CHECK-NEXT: ret <16 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i8> @llvm.ssub.sat.v16i8(<16 x i8> , <16 x i8> ) +; CHECK-NEXT: ret <16 x i8> [[TMP1]] ; %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> , <16 x i8> ) ret <16 x i8> %1 @@ -251,7 +258,8 @@ define <32 x i8> @avx2_subs_b_constant_undefs() { ; CHECK-LABEL: @avx2_subs_b_constant_undefs( -; CHECK-NEXT: ret <32 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i8> @llvm.ssub.sat.v32i8(<32 x i8> , <32 x i8> ) +; CHECK-NEXT: ret <32 x i8> [[TMP1]] ; %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> , <32 x i8> ) ret <32 x i8> %1 @@ -283,7 +291,8 @@ define <64 x i8> @avx512_mask_subs_b_constant_undefs() { ; CHECK-LABEL: @avx512_mask_subs_b_constant_undefs( -; CHECK-NEXT: ret <64 x i8> +; CHECK-NEXT: [[TMP1:%.*]] = call <64 x i8> @llvm.ssub.sat.v64i8(<64 x i8> , <64 x i8> ) +; CHECK-NEXT: ret <64 x i8> [[TMP1]] ; %1 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1) ret <64 x i8> %1 @@ -315,7 +324,8 @@ define <8 x i16> @sse2_subs_w_constant_undefs() { ; CHECK-LABEL: @sse2_subs_w_constant_undefs( -; CHECK-NEXT: ret <8 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <8 x i16> @llvm.ssub.sat.v8i16(<8 x i16> , <8 x i16> ) +; CHECK-NEXT: ret <8 x i16> [[TMP1]] ; %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> , <8 x i16> ) ret <8 x i16> %1 @@ -347,7 +357,8 @@ define <16 x i16> @avx2_subs_w_constant_undefs() { ; CHECK-LABEL: @avx2_subs_w_constant_undefs( -; CHECK-NEXT: ret <16 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <16 x i16> @llvm.ssub.sat.v16i16(<16 x i16> , <16 x i16> ) +; CHECK-NEXT: ret <16 x i16> [[TMP1]] ; %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> , <16 x i16> ) ret <16 x i16> %1 @@ -379,7 +390,8 @@ define <32 x i16> @avx512_mask_subs_w_constant_undefs() { ; CHECK-LABEL: @avx512_mask_subs_w_constant_undefs( -; CHECK-NEXT: ret <32 x i16> +; CHECK-NEXT: [[TMP1:%.*]] = call <32 x i16> @llvm.ssub.sat.v32i16(<32 x i16> , <32 x i16> ) +; CHECK-NEXT: ret <32 x i16> [[TMP1]] ; %1 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1) ret <32 x i16> %1
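
The x86-adds-subs.ll updates above encode a behavioral note: the constant/undef
folding that InstCombine performed on the x86 saturating intrinsics does not
currently fire on the generic llvm.sadd.sat/llvm.ssub.sat calls they upgrade to,
so each _constant_undefs test now expects the call to survive instead of folding
to a constant. A hypothetical reduced case (concrete constants chosen purely for
illustration; the test's own operands are elided above):

define <8 x i16> @sadd_sat_constant_undef_sketch() {
  ; Previously the equivalent @llvm.x86.sse2.padds.w call folded to a constant;
  ; the generic form is left intact by InstCombine for now.
  %r = call <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16> <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>, <8 x i16> <i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef, i16 undef>)
  ret <8 x i16> %r
}
declare <8 x i16> @llvm.sadd.sat.v8i16(<8 x i16>, <8 x i16>)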
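
Taken together, the rewritten AVX-512BW/VL tests all reduce to one idiom: the
old masked padds/psubs intrinsics decompose into a generic saturating operation
followed by an explicit select on the bitcast mask. A minimal sketch of that
idiom (hypothetical function name; the pattern itself is the one used verbatim
throughout the updated tests):

define <16 x i8> @masked_adds_sketch(<16 x i8> %a, <16 x i8> %b, <16 x i8> %passThru, i16 %mask) {
  ; stands in for the old @llvm.x86.avx512.mask.padds.b.128(%a, %b, %passThru, %mask)
  %sat = call <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8> %a, <16 x i8> %b)
  %m = bitcast i16 %mask to <16 x i1>
  %res = select <16 x i1> %m, <16 x i8> %sat, <16 x i8> %passThru
  ret <16 x i8> %res
}
declare <16 x i8> @llvm.sadd.sat.v16i8(<16 x i8>, <16 x i8>)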