Index: llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/trunk/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -243,6 +243,86 @@
   return nullptr;
 }
 
+static Value *simplifyX86AddsSubs(const IntrinsicInst &II,
+                                  InstCombiner::BuilderTy &Builder) {
+  bool IsAddition = false;
+  bool IsMasked = false;
+
+  switch (II.getIntrinsicID()) {
+  default: llvm_unreachable("Unexpected intrinsic!");
+  case Intrinsic::x86_sse2_padds_b:
+  case Intrinsic::x86_sse2_padds_w:
+  case Intrinsic::x86_avx2_padds_b:
+  case Intrinsic::x86_avx2_padds_w:
+    IsAddition = true; IsMasked = false;
+    break;
+  case Intrinsic::x86_sse2_psubs_b:
+  case Intrinsic::x86_sse2_psubs_w:
+  case Intrinsic::x86_avx2_psubs_b:
+  case Intrinsic::x86_avx2_psubs_w:
+    IsAddition = false; IsMasked = false;
+    break;
+  case Intrinsic::x86_avx512_mask_padds_b_512:
+  case Intrinsic::x86_avx512_mask_padds_w_512:
+    IsAddition = true; IsMasked = true;
+    break;
+  case Intrinsic::x86_avx512_mask_psubs_b_512:
+  case Intrinsic::x86_avx512_mask_psubs_w_512:
+    IsAddition = false; IsMasked = true;
+    break;
+  }
+
+  auto *Arg0 = dyn_cast<Constant>(II.getOperand(0));
+  auto *Arg1 = dyn_cast<Constant>(II.getOperand(1));
+  auto VT = cast<VectorType>(II.getType());
+  auto SVT = VT->getElementType();
+  unsigned NumElems = VT->getNumElements();
+
+  if (!Arg0 || !Arg1 || (IsMasked && !isa<Constant>(II.getOperand(2))))
+    return nullptr;
+
+  SmallVector<Constant *, 64> Result;
+
+  APInt MaxValue = APInt::getSignedMaxValue(SVT->getIntegerBitWidth());
+  APInt MinValue = APInt::getSignedMinValue(SVT->getIntegerBitWidth());
+  for (unsigned i = 0; i < NumElems; ++i) {
+    auto *Elt0 = Arg0->getAggregateElement(i);
+    auto *Elt1 = Arg1->getAggregateElement(i);
+    if (isa<UndefValue>(Elt0) || isa<UndefValue>(Elt1)) {
+      Result.push_back(UndefValue::get(SVT));
+      continue;
+    }
+
+    if (!isa<ConstantInt>(Elt0) || !isa<ConstantInt>(Elt1))
+      return nullptr;
+
+    const APInt &Val0 = cast<ConstantInt>(Elt0)->getValue();
+    const APInt &Val1 = cast<ConstantInt>(Elt1)->getValue();
+    bool Overflow = false;
+    APInt ResultElem = IsAddition ? Val0.sadd_ov(Val1, Overflow)
+                                  : Val0.ssub_ov(Val1, Overflow);
+    if (Overflow)
+      ResultElem = Val0.isNegative() ? MinValue : MaxValue;
+    Result.push_back(Constant::getIntegerValue(SVT, ResultElem));
+  }
+
+  Value *ResultVec = ConstantVector::get(Result);
+
+  if (II.getNumArgOperands() == 4) { // For masked intrinsics.
+    Value *Src = II.getOperand(2);
+    auto Mask = II.getOperand(3);
+    if (auto *C = dyn_cast<Constant>(Mask))
+      if (C->isAllOnesValue())
+        return ResultVec;
+    auto *MaskTy = VectorType::get(
+        Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth());
+    Mask = Builder.CreateBitCast(Mask, MaskTy);
+    ResultVec = Builder.CreateSelect(Mask, ResultVec, Src);
+  }
+
+  return ResultVec;
+}
+
 static Value *simplifyX86immShift(const IntrinsicInst &II,
                                   InstCombiner::BuilderTy &Builder) {
   bool LogicalShift = false;
@@ -2525,6 +2605,24 @@
     break;
   }
 
+  // Constant fold add/sub with saturation intrinsics.
+  case Intrinsic::x86_sse2_padds_b:
+  case Intrinsic::x86_sse2_padds_w:
+  case Intrinsic::x86_sse2_psubs_b:
+  case Intrinsic::x86_sse2_psubs_w:
+  case Intrinsic::x86_avx2_padds_b:
+  case Intrinsic::x86_avx2_padds_w:
+  case Intrinsic::x86_avx2_psubs_b:
+  case Intrinsic::x86_avx2_psubs_w:
+  case Intrinsic::x86_avx512_mask_padds_b_512:
+  case Intrinsic::x86_avx512_mask_padds_w_512:
+  case Intrinsic::x86_avx512_mask_psubs_b_512:
+  case Intrinsic::x86_avx512_mask_psubs_w_512:
+    if (Value *V = simplifyX86AddsSubs(*II, Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   // Constant fold ashr( , Ci ).
   // Constant fold lshr( , Ci ).
   // Constant fold shl( , Ci ).
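For reference, the element-wise semantics the new fold reproduces are ordinary signed saturation: when the true sum or difference does not fit in the element type, the result clamps to the signed minimum or maximum, which is what the `Val0.isNegative() ? MinValue : MaxValue` branch above selects. A minimal standalone sketch of that behaviour for i8 lanes (illustrative only, not part of the patch; the helper names are made up):

// Illustrative sketch only (hypothetical helpers, not part of the patch).
// Mirrors the sadd_ov/ssub_ov + clamp logic used by simplifyX86AddsSubs.
#include <cstdint>
#include <limits>

static int8_t saturating_add_i8(int8_t a, int8_t b) {
  int Wide = int(a) + int(b);                     // widen first: cannot overflow in int
  if (Wide > std::numeric_limits<int8_t>::max())  // clamp to 127
    return std::numeric_limits<int8_t>::max();
  if (Wide < std::numeric_limits<int8_t>::min())  // clamp to -128
    return std::numeric_limits<int8_t>::min();
  return static_cast<int8_t>(Wide);
}

static int8_t saturating_sub_i8(int8_t a, int8_t b) {
  int Wide = int(a) - int(b);
  if (Wide > std::numeric_limits<int8_t>::max())
    return std::numeric_limits<int8_t>::max();
  if (Wide < std::numeric_limits<int8_t>::min())
    return std::numeric_limits<int8_t>::min();
  return static_cast<int8_t>(Wide);
}

Undef lanes are passed through as undef, and any non-constant element in either operand aborts the fold, as the early returns in the loop above show.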
Index: llvm/trunk/test/Transforms/InstCombine/X86/x86-adds-subs.ll
===================================================================
--- llvm/trunk/test/Transforms/InstCombine/X86/x86-adds-subs.ll
+++ llvm/trunk/test/Transforms/InstCombine/X86/x86-adds-subs.ll
@@ -0,0 +1,351 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define <16 x i8> @sse2_adds_b_constant() {
+; CHECK-LABEL: @sse2_adds_b_constant(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_adds_b_constant_underflow() {
+; CHECK-LABEL: @sse2_adds_b_constant_underflow(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_adds_b_constant_overflow() {
+; CHECK-LABEL: @sse2_adds_b_constant_overflow(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_adds_b_constant_undefs() {
+; CHECK-LABEL: @sse2_adds_b_constant_undefs(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @avx2_adds_b_constant() {
+; CHECK-LABEL: @avx2_adds_b_constant(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_adds_b_constant_underflow() {
+; CHECK-LABEL: @avx2_adds_b_constant_underflow(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_adds_b_constant_overflow() {
+; CHECK-LABEL: @avx2_adds_b_constant_overflow(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_adds_b_constant_undefs() {
+; CHECK-LABEL: @avx2_adds_b_constant_undefs(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_adds_b_constant() {
+; CHECK-LABEL: @avx512_mask_adds_b_constant(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -3)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_adds_b_constant_underflow() {
+; CHECK-LABEL: @avx512_mask_adds_b_constant_underflow(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_adds_b_constant_overflow() {
+; CHECK-LABEL: @avx512_mask_adds_b_constant_overflow(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_adds_b_constant_undefs() {
+; CHECK-LABEL: @avx512_mask_adds_b_constant_undefs(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <8 x i16> @sse2_adds_w_constant() {
+; CHECK-LABEL: @sse2_adds_w_constant(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_adds_w_constant_underflow() {
+; CHECK-LABEL: @sse2_adds_w_constant_underflow(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_adds_w_constant_overflow() {
+; CHECK-LABEL: @sse2_adds_w_constant_overflow(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_adds_w_constant_undefs() {
+; CHECK-LABEL: @sse2_adds_w_constant_undefs(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <16 x i16> @avx2_adds_w_constant() {
+; CHECK-LABEL: @avx2_adds_w_constant(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_adds_w_constant_underflow() {
+; CHECK-LABEL: @avx2_adds_w_constant_underflow(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_adds_w_constant_overflow() {
+; CHECK-LABEL: @avx2_adds_w_constant_overflow(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_adds_w_constant_undefs() {
+; CHECK-LABEL: @avx2_adds_w_constant_undefs(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_adds_w_constant() {
+; CHECK-LABEL: @avx512_mask_adds_w_constant(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -3)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_adds_w_constant_underflow() {
+; CHECK-LABEL: @avx512_mask_adds_w_constant_underflow(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_adds_w_constant_overflow() {
+; CHECK-LABEL: @avx512_mask_adds_w_constant_overflow(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_adds_w_constant_undefs() {
+; CHECK-LABEL: @avx512_mask_adds_w_constant_undefs(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <16 x i8> @sse2_subs_b_constant() {
+; CHECK-LABEL: @sse2_subs_b_constant(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_subs_b_constant_underflow() {
+; CHECK-LABEL: @sse2_subs_b_constant_underflow(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_subs_b_constant_overflow() {
+; CHECK-LABEL: @sse2_subs_b_constant_overflow(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <16 x i8> @sse2_subs_b_constant_undefs() {
+; CHECK-LABEL: @sse2_subs_b_constant_undefs(
+; CHECK-NEXT: ret <16 x i8>
+  %1 = call <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8> , <16 x i8> )
+  ret <16 x i8> %1
+}
+
+define <32 x i8> @avx2_subs_b_constant() {
+; CHECK-LABEL: @avx2_subs_b_constant(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_subs_b_constant_underflow() {
+; CHECK-LABEL: @avx2_subs_b_constant_underflow(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_subs_b_constant_overflow() {
+; CHECK-LABEL: @avx2_subs_b_constant_overflow(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <32 x i8> @avx2_subs_b_constant_undefs() {
+; CHECK-LABEL: @avx2_subs_b_constant_undefs(
+; CHECK-NEXT: ret <32 x i8>
+  %1 = call <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8> , <32 x i8> )
+  ret <32 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_subs_b_constant() {
+; CHECK-LABEL: @avx512_mask_subs_b_constant(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -3)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_subs_b_constant_underflow() {
+; CHECK-LABEL: @avx512_mask_subs_b_constant_underflow(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_subs_b_constant_overflow() {
+; CHECK-LABEL: @avx512_mask_subs_b_constant_overflow(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <64 x i8> @avx512_mask_subs_b_constant_undefs() {
+; CHECK-LABEL: @avx512_mask_subs_b_constant_undefs(
+; CHECK-NEXT: ret <64 x i8>
+  %1 = call <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8> , <64 x i8> , <64 x i8> zeroinitializer, i64 -1)
+  ret <64 x i8> %1
+}
+
+define <8 x i16> @sse2_subs_w_constant() {
+; CHECK-LABEL: @sse2_subs_w_constant(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_subs_w_constant_underflow() {
+; CHECK-LABEL: @sse2_subs_w_constant_underflow(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_subs_w_constant_overflow() {
+; CHECK-LABEL: @sse2_subs_w_constant_overflow(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <8 x i16> @sse2_subs_w_constant_undefs() {
+; CHECK-LABEL: @sse2_subs_w_constant_undefs(
+; CHECK-NEXT: ret <8 x i16>
+  %1 = call <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16> , <8 x i16> )
+  ret <8 x i16> %1
+}
+
+define <16 x i16> @avx2_subs_w_constant() {
+; CHECK-LABEL: @avx2_subs_w_constant(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_subs_w_constant_underflow() {
+; CHECK-LABEL: @avx2_subs_w_constant_underflow(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_subs_w_constant_overflow() {
+; CHECK-LABEL: @avx2_subs_w_constant_overflow(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <16 x i16> @avx2_subs_w_constant_undefs() {
+; CHECK-LABEL: @avx2_subs_w_constant_undefs(
+; CHECK-NEXT: ret <16 x i16>
+  %1 = call <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16> , <16 x i16> )
+  ret <16 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_subs_w_constant() {
+; CHECK-LABEL: @avx512_mask_subs_w_constant(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -3)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_subs_w_constant_underflow() {
+; CHECK-LABEL: @avx512_mask_subs_w_constant_underflow(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_subs_w_constant_overflow() {
+; CHECK-LABEL: @avx512_mask_subs_w_constant_overflow(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+define <32 x i16> @avx512_mask_subs_w_constant_undefs() {
+; CHECK-LABEL: @avx512_mask_subs_w_constant_undefs(
+; CHECK-NEXT: ret <32 x i16>
+  %1 = call <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16> , <32 x i16> , <32 x i16> zeroinitializer, i32 -1)
+  ret <32 x i16> %1
+}
+
+declare <16 x i8> @llvm.x86.sse2.padds.b(<16 x i8>, <16 x i8>) nounwind readnone
+declare <16 x i8> @llvm.x86.sse2.psubs.b(<16 x i8>, <16 x i8>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.padds.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <8 x i16> @llvm.x86.sse2.psubs.w(<8 x i16>, <8 x i16>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.padds.b(<32 x i8>, <32 x i8>) nounwind readnone
+declare <32 x i8> @llvm.x86.avx2.psubs.b(<32 x i8>, <32 x i8>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.padds.w(<16 x i16>, <16 x i16>) nounwind readnone
+declare <16 x i16> @llvm.x86.avx2.psubs.w(<16 x i16>, <16 x i16>) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.mask.padds.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
+declare <64 x i8> @llvm.x86.avx512.mask.psubs.b.512(<64 x i8>, <64 x i8>, <64 x i8>, i64) nounwind readnone
+declare <32 x i16> @llvm.x86.avx512.mask.padds.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone
+declare <32 x i16> @llvm.x86.avx512.mask.psubs.w.512(<32 x i16>, <32 x i16>, <32 x i16>, i32) nounwind readnone
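As an end-to-end illustration (not part of the patch), the fold is also visible from C++ source that feeds constant vectors to the corresponding SSE2 intrinsic, assuming a clang of this vintage that lowers _mm_adds_epi8 to @llvm.x86.sse2.padds.b rather than to a generic saturating-add intrinsic:

// Hypothetical demo, not part of the patch. With constant operands,
// InstCombine can now fold the saturating add to a constant <16 x i8>,
// so the optimized IR reduces to returning a constant vector.
#include <emmintrin.h>

__m128i adds_const_demo() {
  const __m128i A = _mm_set1_epi8(100);
  const __m128i B = _mm_set1_epi8(100);
  return _mm_adds_epi8(A, B);   // 100 + 100 saturates to 127 in every lane
}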