Index: llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
===================================================================
--- llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -576,6 +576,95 @@
   return ConstantVector::get(Vals);
 }
 
+// Replace X86-specific intrinsics with generic floor-ceil where applicable.
+static Value *simplifyX86round(IntrinsicInst &II,
+                               InstCombiner::BuilderTy &Builder) {
+  unsigned RoundControl;
+  Intrinsic::ID IntrinsicID = II.getIntrinsicID();
+  if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+      IntrinsicID == Intrinsic::x86_sse41_round_sd)
+    RoundControl = cast<ConstantInt>(II.getArgOperand(2))->getZExtValue();
+  else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+           IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+    RoundControl = cast<ConstantInt>(II.getArgOperand(4))->getZExtValue();
+  else
+    RoundControl = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
+
+  int SAE;
+  if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+      IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512)
+    SAE = cast<ConstantInt>(II.getArgOperand(4))->getSExtValue();
+  else if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+           IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd)
+    SAE = cast<ConstantInt>(II.getArgOperand(5))->getSExtValue();
+  else
+    SAE = 4; // No SAE operand; treat as CUR_DIRECTION.
+
+  if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/))
+    return nullptr;
+
+  Value *Src, *Dst, *Mask;
+  bool IsScalar = false;
+  if (IntrinsicID == Intrinsic::x86_sse41_round_ss ||
+      IntrinsicID == Intrinsic::x86_sse41_round_sd ||
+      IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+      IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+    IsScalar = true;
+    if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+      Mask = II.getArgOperand(3);
+      Value *Zero = Constant::getNullValue(Mask->getType());
+      Mask = Builder.CreateAnd(Mask, 1);
+      Mask = Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+      Dst = II.getArgOperand(2);
+    } else
+      Dst = II.getArgOperand(0); // Pass-through lanes come from operand 0.
+    Src = Builder.CreateExtractElement(II.getArgOperand(1), (uint64_t)0);
+  } else {
+    Src = II.getArgOperand(0);
+    if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_128 ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_256 ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ps_512 ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_128 ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_256 ||
+        IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_pd_512) {
+      Dst = II.getArgOperand(2);
+      Mask = II.getArgOperand(3);
+    } else {
+      Dst = Src;
+      Mask = ConstantInt::getAllOnesValue(
+          Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+    }
+  }
+
+  Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+  Value *Res = Builder.CreateIntrinsic(ID, {Src}, &II);
+  if (!IsScalar) {
+    if (auto *C = dyn_cast<Constant>(Mask))
+      if (C->isAllOnesValue())
+        return Res;
+    auto *MaskTy = VectorType::get(
+        Builder.getInt1Ty(),
+        cast<IntegerType>(Mask->getType())->getBitWidth());
+    Mask = Builder.CreateBitCast(Mask, MaskTy);
+    unsigned Width = Src->getType()->getVectorNumElements();
+    if (MaskTy->getVectorNumElements() > Width) { // Keep only the low bits.
+      uint32_t Indices[4];
+      for (unsigned i = 0; i != Width; ++i)
+        Indices[i] = i;
+      Mask = Builder.CreateShuffleVector(Mask, Mask,
+                                         makeArrayRef(Indices, Width));
+    }
+    return Builder.CreateSelect(Mask, Res, Dst);
+  }
+  if (IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_ss ||
+      IntrinsicID == Intrinsic::x86_avx512_mask_rndscale_sd) {
+    Dst = Builder.CreateExtractElement(Dst, (uint64_t)0);
+    Res = Builder.CreateSelect(Mask, Res, Dst);
+    Dst = II.getArgOperand(0);
+  }
+  return Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
 static Value *simplifyX86movmsk(const IntrinsicInst &II) {
   Value *Arg = II.getArgOperand(0);
   Type *ResTy = II.getType();
@@ -2222,6 +2311,22 @@
     break;
   }
 
+  case Intrinsic::x86_sse41_round_ps:
+  case Intrinsic::x86_sse41_round_pd:
+  case Intrinsic::x86_avx_round_ps_256:
+  case Intrinsic::x86_avx_round_pd_256:
+  case Intrinsic::x86_avx512_mask_rndscale_ps_128:
+  case Intrinsic::x86_avx512_mask_rndscale_ps_256:
+  case Intrinsic::x86_avx512_mask_rndscale_ps_512:
+  case Intrinsic::x86_avx512_mask_rndscale_pd_128:
+  case Intrinsic::x86_avx512_mask_rndscale_pd_256:
+  case Intrinsic::x86_avx512_mask_rndscale_pd_512:
+  case Intrinsic::x86_avx512_mask_rndscale_ss:
+  case Intrinsic::x86_avx512_mask_rndscale_sd:
+    if (Value *V = simplifyX86round(*II, Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+
   case Intrinsic::x86_mmx_pmovmskb:
   case Intrinsic::x86_sse_movmsk_ps:
   case Intrinsic::x86_sse2_movmsk_pd:
@@ -2438,8 +2543,6 @@
   case Intrinsic::x86_sse2_cmp_sd:
   case Intrinsic::x86_sse2_min_sd:
  case Intrinsic::x86_sse2_max_sd:
-  case Intrinsic::x86_sse41_round_ss:
-  case Intrinsic::x86_sse41_round_sd:
   case Intrinsic::x86_xop_vfrcz_ss:
   case Intrinsic::x86_xop_vfrcz_sd: {
     unsigned VWidth = II->getType()->getVectorNumElements();
@@ -2452,6 +2555,19 @@
     }
     break;
   }
+  case Intrinsic::x86_sse41_round_ss:
+  case Intrinsic::x86_sse41_round_sd: {
+    unsigned VWidth = II->getType()->getVectorNumElements();
+    APInt UndefElts(VWidth, 0);
+    APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+    if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
+      if (V != II)
+        return replaceInstUsesWith(*II, V);
+      return II;
+    } else if (Value *V = simplifyX86round(*II, Builder))
+      return replaceInstUsesWith(*II, V);
+    break;
+  }
 
   // Constant fold ashr( <Ci ...>, Ci ).
   // Constant fold lshr( <Ci ...>, Ci ).
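For reference, the immediates matched above follow the SSE4.1 rounding-control encoding. A minimal sketch, assuming Clang's <smmintrin.h> definitions (these names come from the header, not from this patch; the wrapper floor_ps is hypothetical):

// Sketch only: why RoundControl must be exactly 1 or 2 and SAE must be 4.
#include <smmintrin.h>

// _MM_FROUND_RAISE_EXC is 0, so the floor/ceil immediates are exactly 1 and 2,
// the only RoundControl values simplifyX86round() accepts.
static_assert(_MM_FROUND_FLOOR == 1, "round toward -inf");
static_assert(_MM_FROUND_CEIL == 2, "round toward +inf");
// The AVX-512 SAE operand must be _MM_FROUND_CUR_DIRECTION, i.e. 4.
static_assert(_MM_FROUND_CUR_DIRECTION == 4, "no SAE override");

// In Clang, _mm_floor_ps(a) expands to _mm_round_ps(a, _MM_FROUND_FLOOR),
// which lowers to @llvm.x86.sse41.round.ps(a, i32 1) -- the exact form the
// combine rewrites to @llvm.floor.v4f32(a).
__m128 floor_ps(__m128 a) { return _mm_floor_ps(a); }

An immediate with extra bits set is left untouched: the existing test_round_sd test below uses i32 10, which is _MM_FROUND_CEIL | _MM_FROUND_NO_EXC and therefore not a plain ceil.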
Index: llvm/test/Transforms/InstCombine/X86/x86-avx.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InstCombine/X86/x86-avx.ll
@@ -0,0 +1,41 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+declare <8 x float> @llvm.x86.avx.round.ps.256(<8 x float>, i32)
+declare <4 x double> @llvm.x86.avx.round.pd.256(<4 x double>, i32)
+
+define <8 x float> @test_round_ps_floor(<8 x float> %a) {
+; CHECK-LABEL: @test_round_ps_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[A:%.*]])
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 1)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_round_ps_ceil(<8 x float> %a) {
+; CHECK-LABEL: @test_round_ps_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[A:%.*]])
+; CHECK-NEXT:    ret <8 x float> [[TMP1]]
+;
+  %1 = call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %a, i32 2)
+  ret <8 x float> %1
+}
+
+define <4 x double> @test_round_pd_floor(<4 x double> %a) {
+; CHECK-LABEL: @test_round_pd_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 1)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_round_pd_ceil(<4 x double> %a) {
+; CHECK-LABEL: @test_round_pd_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[A:%.*]])
+; CHECK-NEXT:    ret <4 x double> [[TMP1]]
+;
+  %1 = call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %a, i32 2)
+  ret <4 x double> %1
+}
Index: llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
+++ llvm/test/Transforms/InstCombine/X86/x86-avx512.ll
@@ -916,6 +916,213 @@
 declare i32 @llvm.x86.avx512.cvttsd2usi(<2 x double>, i32)
 declare i64 @llvm.x86.avx512.cvttsd2usi64(<2 x double>, i32)
 
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32, i32)
+declare <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float>, i32, <4 x float>, i8)
+declare <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float>, i32, <8 x float>, i8)
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
+declare <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double>, i32, <2 x double>, i8)
+declare <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double>, i32, <4 x double>, i8)
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <4 x float> @test_rndscale_ss_floor(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.floor.f32(float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP7]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 1, i32 4)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_rndscale_ss_ceil(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ss_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <4 x float> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call float @llvm.ceil.f32(float [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <4 x float> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], float [[TMP5]], float [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <4 x float> [[SRC0:%.*]], float [[TMP6]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP7]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ss(<4 x float> %src0, <4 x float> %src1, <4 x float> %dst, i8 %k, i32 2, i32 4)
+  ret <4 x float> %1
+}
+
+define <2 x double> @test_rndscale_sd_floor(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.floor.f64(double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP7]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 1, i32 4)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_rndscale_sd_ceil(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_sd_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = and i8 [[K:%.*]], 1
+; CHECK-NEXT:    [[TMP2:%.*]] = icmp eq i8 [[TMP1]], 0
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[SRC1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = call double @llvm.ceil.f64(double [[TMP3]])
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <2 x double> [[DST:%.*]], i64 0
+; CHECK-NEXT:    [[TMP6:%.*]] = select i1 [[TMP2]], double [[TMP5]], double [[TMP4]]
+; CHECK-NEXT:    [[TMP7:%.*]] = insertelement <2 x double> [[SRC0:%.*]], double [[TMP6]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP7]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.sd(<2 x double> %src0, <2 x double> %src1, <2 x double> %dst, i8 %k, i32 2, i32 4)
+  ret <2 x double> %1
+}
+
+define <4 x float> @test_rndscale_ps_128_floor(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.floor.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 1, <4 x float> %dst, i8 %k)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_rndscale_ps_128_ceil(<4 x float> %src, <4 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_128_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x float> @llvm.ceil.v4f32(<4 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x float> [[TMP1]], <4 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x float> [[TMP4]]
+;
+  %1 = call <4 x float> @llvm.x86.avx512.mask.rndscale.ps.128(<4 x float> %src, i32 2, <4 x float> %dst, i8 %k)
+  ret <4 x float> %1
+}
+
+define <8 x float> @test_rndscale_ps_256_floor(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.floor.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 1, <8 x float> %dst, i8 %k)
+  ret <8 x float> %1
+}
+
+define <8 x float> @test_rndscale_ps_256_ceil(<8 x float> %src, <8 x float> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_ps_256_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x float> @llvm.ceil.v8f32(<8 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x float> [[TMP1]], <8 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x float> [[TMP3]]
+;
+  %1 = call <8 x float> @llvm.x86.avx512.mask.rndscale.ps.256(<8 x float> %src, i32 2, <8 x float> %dst, i8 %k)
+  ret <8 x float> %1
+}
+
+define <16 x float> @test_rndscale_ps_512_floor(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.floor.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 1, <16 x float> %dst, i16 %k, i32 4)
+  ret <16 x float> %1
+}
+
+define <16 x float> @test_rndscale_ps_512_ceil(<16 x float> %src, <16 x float> %dst, i16 %k) {
+; CHECK-LABEL: @test_rndscale_ps_512_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <16 x float> @llvm.ceil.v16f32(<16 x float> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16 [[K:%.*]] to <16 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <16 x i1> [[TMP2]], <16 x float> [[TMP1]], <16 x float> [[DST:%.*]]
+; CHECK-NEXT:    ret <16 x float> [[TMP3]]
+;
+  %1 = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %src, i32 2, <16 x float> %dst, i16 %k, i32 4)
+  ret <16 x float> %1
+}
+
+define <2 x double> @test_rndscale_pd_128_floor(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.floor.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 1, <2 x double> %dst, i8 %k)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_rndscale_pd_128_ceil(<2 x double> %src, <2 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_128_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <2 x double> @llvm.ceil.v2f64(<2 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <2 x i1> [[TMP3]], <2 x double> [[TMP1]], <2 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <2 x double> [[TMP4]]
+;
+  %1 = call <2 x double> @llvm.x86.avx512.mask.rndscale.pd.128(<2 x double> %src, i32 2, <2 x double> %dst, i8 %k)
+  ret <2 x double> %1
+}
+
+define <4 x double> @test_rndscale_pd_256_floor(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.floor.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 1, <4 x double> %dst, i8 %k)
+  ret <4 x double> %1
+}
+
+define <4 x double> @test_rndscale_pd_256_ceil(<4 x double> %src, <4 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_256_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <4 x double> @llvm.ceil.v4f64(<4 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = shufflevector <8 x i1> [[TMP2]], <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; CHECK-NEXT:    [[TMP4:%.*]] = select <4 x i1> [[TMP3]], <4 x double> [[TMP1]], <4 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <4 x double> [[TMP4]]
+;
+  %1 = call <4 x double> @llvm.x86.avx512.mask.rndscale.pd.256(<4 x double> %src, i32 2, <4 x double> %dst, i8 %k)
+  ret <4 x double> %1
+}
+
+define <8 x double> @test_rndscale_pd_512_floor(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.floor.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 1, <8 x double> %dst, i8 %k, i32 4)
+  ret <8 x double> %1
+}
+
+define <8 x double> @test_rndscale_pd_512_ceil(<8 x double> %src, <8 x double> %dst, i8 %k) {
+; CHECK-LABEL: @test_rndscale_pd_512_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = call <8 x double> @llvm.ceil.v8f64(<8 x double> [[SRC:%.*]])
+; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i8 [[K:%.*]] to <8 x i1>
+; CHECK-NEXT:    [[TMP3:%.*]] = select <8 x i1> [[TMP2]], <8 x double> [[TMP1]], <8 x double> [[DST:%.*]]
+; CHECK-NEXT:    ret <8 x double> [[TMP3]]
+;
+  %1 = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %src, i32 2, <8 x double> %dst, i8 %k, i32 4)
+  ret <8 x double> %1
+}
+
 declare <4 x float> @llvm.x86.avx512.mask.vfmadd.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32)
 
 define <4 x float> @test_mask_vfmadd_ss(<4 x float> %a, <4 x float> %b, <4 x float> %c, i8 %mask) {
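The masked tests above exercise the select-based lowering: lanes with a clear mask bit keep the passthrough operand, and for the 128-bit forms the i8 mask is bitcast to <8 x i1> and narrowed with a shufflevector. A usage sketch of how such IR typically arises, assuming Clang's AVX-512VL intrinsics in <immintrin.h> (the wrapper name mask_floor_ps is hypothetical):

// Sketch only: with imm8 == 1 (floor) and no SAE, this rndscale call should
// now fold to @llvm.floor.v4f32 plus the bitcast/shufflevector/select
// sequence checked in test_rndscale_ps_128_floor.
#include <immintrin.h>

__m128 mask_floor_ps(__m128 passthru, __mmask8 k, __m128 a) {
  // Lanes whose bit in k is clear keep the passthru value.
  return _mm_mask_roundscale_ps(passthru, k, a, 1);
}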
Index: llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
===================================================================
--- llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
+++ llvm/test/Transforms/InstCombine/X86/x86-sse41.ll
@@ -4,7 +4,7 @@
 
 define <2 x double> @test_round_sd(<2 x double> %a, <2 x double> %b) {
 ; CHECK-LABEL: @test_round_sd(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 10)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> [[A:%.*]], <2 x double> [[B:%.*]], i32 10)
 ; CHECK-NEXT:    ret <2 x double> [[TMP1]]
 ;
   %1 = insertelement <2 x double> %a, double 1.000000e+00, i32 0
@@ -13,9 +13,31 @@
   ret <2 x double> %3
 }
 
+define <2 x double> @test_round_sd_floor(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_round_sd_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.floor.f64(double [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP2]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 1)
+  ret <2 x double> %1
+}
+
+define <2 x double> @test_round_sd_ceil(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @test_round_sd_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <2 x double> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call double @llvm.ceil.f64(double [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <2 x double> [[A:%.*]], double [[TMP2]], i64 0
+; CHECK-NEXT:    ret <2 x double> [[TMP3]]
+;
+  %1 = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %a, <2 x double> %b, i32 2)
+  ret <2 x double> %1
+}
+
 define double @test_round_sd_0(double %a, double %b) {
 ; CHECK-LABEL: @test_round_sd_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double %b, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <2 x double> undef, double [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> undef, <2 x double> [[TMP1]], i32 10)
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <2 x double> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret double [[TMP3]]
@@ -44,7 +66,7 @@
 
 define <4 x float> @test_round_ss(<4 x float> %a, <4 x float> %b) {
 ; CHECK-LABEL: @test_round_ss(
-; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> %b, i32 10)
+; CHECK-NEXT:    [[TMP1:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> <float undef, float 1.000000e+00, float 2.000000e+00, float 3.000000e+00>, <4 x float> [[B:%.*]], i32 10)
 ; CHECK-NEXT:    ret <4 x float> [[TMP1]]
 ;
   %1 = insertelement <4 x float> %a, float 1.000000e+00, i32 1
@@ -57,9 +79,31 @@
   ret <4 x float> %7
 }
 
+define <4 x float> @test_round_ss_floor(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_round_ss_floor(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.floor.f32(float [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[A:%.*]], float [[TMP2]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a, <4 x float> %b, i32 1)
+  ret <4 x float> %1
+}
+
+define <4 x float> @test_round_ss_ceil(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @test_round_ss_ceil(
+; CHECK-NEXT:    [[TMP1:%.*]] = extractelement <4 x float> [[B:%.*]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = call float @llvm.ceil.f32(float [[TMP1]])
+; CHECK-NEXT:    [[TMP3:%.*]] = insertelement <4 x float> [[A:%.*]], float [[TMP2]], i64 0
+; CHECK-NEXT:    ret <4 x float> [[TMP3]]
+;
+  %1 = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %a, <4 x float> %b, i32 2)
+  ret <4 x float> %1
+}
+
 define float @test_round_ss_0(float %a, float %b) {
 ; CHECK-LABEL: @test_round_ss_0(
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float %b, i32 0
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x float> undef, float [[B:%.*]], i32 0
 ; CHECK-NEXT:    [[TMP2:%.*]] = tail call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> undef, <4 x float> [[TMP1]], i32 10)
 ; CHECK-NEXT:    [[R:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; CHECK-NEXT:    ret float [[R]]