Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -8408,6 +8408,130 @@
   return EmitX86MaskedCompare(CGF, 1, true, { In, Zero });
 }
 
+static Value *EmitX86Round(CodeGenFunction &CGF, ArrayRef<Value *> Ops,
+                           unsigned BuiltinID) {
+  int RoundControl;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_roundss ||
+      BuiltinID == clang::X86::BI__builtin_ia32_roundsd)
+    RoundControl = cast<llvm::ConstantInt>(Ops[2])->getSExtValue();
+  else if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+           BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask)
+    RoundControl = cast<llvm::ConstantInt>(Ops[4])->getSExtValue();
+  else
+    RoundControl = cast<llvm::ConstantInt>(Ops[1])->getSExtValue();
+
+  int SAE;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_mask)
+    SAE = cast<llvm::ConstantInt>(Ops[4])->getSExtValue();
+  else if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+           BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask)
+    SAE = cast<llvm::ConstantInt>(Ops[5])->getSExtValue();
+  else
+    SAE = 4;
+
+  if (SAE != 4 || (RoundControl != 2 /*ceil*/ && RoundControl != 1 /*floor*/)) {
+    Intrinsic::ID ID;
+    switch (BuiltinID) {
+    default: llvm_unreachable("Unsupported intrinsic!");
+    case clang::X86::BI__builtin_ia32_roundps:
+      ID = Intrinsic::x86_sse41_round_ps;
+      break;
+    case clang::X86::BI__builtin_ia32_roundss:
+      ID = Intrinsic::x86_sse41_round_ss;
+      break;
+    case clang::X86::BI__builtin_ia32_roundsd:
+      ID = Intrinsic::x86_sse41_round_sd;
+      break;
+    case clang::X86::BI__builtin_ia32_roundpd:
+      ID = Intrinsic::x86_sse41_round_pd;
+      break;
+    case clang::X86::BI__builtin_ia32_roundpd256:
+      ID = Intrinsic::x86_avx_round_pd_256;
+      break;
+    case clang::X86::BI__builtin_ia32_roundps256:
+      ID = Intrinsic::x86_avx_round_ps_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_512;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_512;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_128_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_128;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalepd_256_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_pd_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_128_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_128;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaleps_256_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ps_256;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscalesd_round_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_sd;
+      break;
+    case clang::X86::BI__builtin_ia32_rndscaless_round_mask:
+      ID = Intrinsic::x86_avx512_mask_rndscale_ss;
+      break;
+    }
+    llvm::Function *F = CGF.CGM.getIntrinsic(ID);
+    return CGF.Builder.CreateCall(F, Ops);
+  }
+
+  Value *Src, *Dst, *Mask;
+  bool IsScalar = false;
+  if (BuiltinID == clang::X86::BI__builtin_ia32_roundss ||
+      BuiltinID == clang::X86::BI__builtin_ia32_roundsd ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+    IsScalar = true;
+    if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+      llvm::Type *MaskTy = Ops[3]->getType();
+      llvm::Type *I32Ty = CGF.Builder.getInt32Ty();
+      Value *One = llvm::ConstantInt::get(I32Ty, 1);
+      Value *Zero = llvm::Constant::getNullValue(I32Ty);
+      Mask = (MaskTy == I32Ty) ? Ops[3] : CGF.Builder.CreateZExt(Ops[3], I32Ty);
+      Mask = CGF.Builder.CreateAnd(Mask, One);
+      Mask = CGF.Builder.CreateICmp(ICmpInst::ICMP_NE, Mask, Zero);
+      Dst = Ops[2];
+    }
+    else
+      Dst = Ops[0];
+    Src = CGF.Builder.CreateExtractElement(Ops[1], (uint64_t)0);
+  } else {
+    Src = Ops[0];
+    if (BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_128_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscalepd_256_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_128_mask ||
+        BuiltinID == clang::X86::BI__builtin_ia32_rndscaleps_256_mask) {
+      Dst = Ops[2];
+      Mask = Ops[3];
+    } else {
+      Dst = Src;
+      Mask = llvm::ConstantInt::getAllOnesValue(
+          CGF.Builder.getIntNTy(Src->getType()->getVectorNumElements()));
+    }
+  }
+
+  Intrinsic::ID ID = (RoundControl == 2) ? Intrinsic::ceil : Intrinsic::floor;
+  Value *F = CGF.CGM.getIntrinsic(ID, Src->getType());
+  Value *Res = CGF.Builder.CreateCall(F, {Src});
+  if (!IsScalar)
+    return EmitX86Select(CGF, Mask, Res, Dst);
+  if (BuiltinID == clang::X86::BI__builtin_ia32_rndscalesd_round_mask ||
+      BuiltinID == clang::X86::BI__builtin_ia32_rndscaless_round_mask) {
+    Dst = CGF.Builder.CreateExtractElement(Dst, (uint64_t)0);
+    Res = CGF.Builder.CreateSelect(Mask, Res, Dst);
+    Dst = Ops[0];
+  }
+  return CGF.Builder.CreateInsertElement(Dst, Res, (uint64_t)0);
+}
+
 static Value *EmitX86Abs(CodeGenFunction &CGF, ArrayRef<Value *> Ops) {
   llvm::Type *Ty = Ops[0]->getType();
@@ -9103,6 +9227,22 @@
     return Builder.CreateBitCast(Res, Ops[0]->getType());
   }
 
+  case X86::BI__builtin_ia32_roundps:
+  case X86::BI__builtin_ia32_roundss:
+  case X86::BI__builtin_ia32_roundsd:
+  case X86::BI__builtin_ia32_roundpd:
+  case X86::BI__builtin_ia32_roundpd256:
+  case X86::BI__builtin_ia32_roundps256:
+  case X86::BI__builtin_ia32_rndscaleps_mask:
+  case X86::BI__builtin_ia32_rndscalepd_mask:
+  case X86::BI__builtin_ia32_rndscalepd_128_mask:
+  case X86::BI__builtin_ia32_rndscalepd_256_mask:
+  case X86::BI__builtin_ia32_rndscaleps_128_mask:
+  case X86::BI__builtin_ia32_rndscaleps_256_mask:
+  case X86::BI__builtin_ia32_rndscalesd_round_mask:
+  case X86::BI__builtin_ia32_rndscaless_round_mask:
+    return EmitX86Round(*this, Ops, BuiltinID);
+
   case X86::BI__builtin_ia32_vplzcntd_128:
   case X86::BI__builtin_ia32_vplzcntd_256:
   case X86::BI__builtin_ia32_vplzcntd_512:
Index: clang/test/CodeGen/avx-builtins.c
===================================================================
--- clang/test/CodeGen/avx-builtins.c
+++ clang/test/CodeGen/avx-builtins.c
@@ -202,13 +202,15 @@
 
 __m256d test_mm256_ceil_pd(__m256d x) {
   // CHECK-LABEL: test_mm256_ceil_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f64
+  // CHECK-NOT: select
   return _mm256_ceil_pd(x);
 }
 
 __m256 test_mm_ceil_ps(__m256 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v8f32
+  // CHECK-NOT: select
   return _mm256_ceil_ps(x);
 }
 
@@ -364,13 +366,15 @@
 
 __m256d test_mm256_floor_pd(__m256d x) {
   // CHECK-LABEL: test_mm256_floor_pd
-  // CHECK: call <4 x double> @llvm.x86.avx.round.pd.256(<4 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f64
+  // CHECK-NOT: select
   return _mm256_floor_pd(x);
 }
 
 __m256 test_mm_floor_ps(__m256 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <8 x float> @llvm.x86.avx.round.ps.256(<8 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v8f32
+  // CHECK-NOT: select
   return _mm256_floor_ps(x);
 }
Index: clang/test/CodeGen/avx512f-builtins.c
===================================================================
--- clang/test/CodeGen/avx512f-builtins.c
+++ clang/test/CodeGen/avx512f-builtins.c
@@ -7565,31 +7565,67 @@
   return _mm512_min_round_ps(__A,__B,_MM_FROUND_CUR_DIRECTION);
 }
 
+__m512 test_mm512_floor_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK-NOT: select
+  return _mm512_floor_ps(__A);
+}
+
+__m512d test_mm512_floor_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK-NOT: select
+  return _mm512_floor_pd(__A);
+}
+
 __m512 test_mm512_mask_floor_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_floor_ps (__W,__U,__A);
 }
 
 __m512d test_mm512_mask_floor_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_floor_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  // CHECK-LABEL: @test_mm512_mask_floor_pd
+  // CHECK: @llvm.floor.v8f64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_floor_pd (__W,__U,__A);
 }
 
+__m512 test_mm512_ceil_ps(__m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK-NOT: select
+  return _mm512_ceil_ps(__A);
+}
+
+__m512d test_mm512_ceil_pd(__m512d __A)
+{
+  // CHECK-LABEL: @test_mm512_ceil_pd
+  // CHECK: @llvm.ceil.v8f64
+  // CHECK-NOT: select
+  return _mm512_ceil_pd(__A);
+}
+
 __m512 test_mm512_mask_ceil_ps (__m512 __W, __mmask16 __U, __m512 __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_ps
-  // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  // CHECK-LABEL: @test_mm512_mask_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_ceil_ps (__W,__U,__A);
 }
 
 __m512d test_mm512_mask_ceil_pd (__m512d __W, __mmask8 __U, __m512d __A)
 {
-  // CHECK-LABEL: @test_mm512_mask_ceil_pd
-  // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
+  // CHECK-LABEL: @test_mm512_mask_ceil_pd
+  // CHECK: @llvm.ceil.v8f64
+  // CHECK: select <8 x i1> %{{.*}}, <8 x double> %{{.*}}, <8 x double> %{{.*}}
   return _mm512_mask_ceil_pd (__W,__U,__A);
 }
 
@@ -7597,14 +7633,30 @@
 {
   // CHECK-LABEL: @test_mm512_mask_roundscale_ps
   // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
+  return _mm512_mask_roundscale_ps(__W,__U,__A, 3);
+}
+
+__m512 test_mm512_mask_roundscale_floor_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_floor_ps
+  // CHECK: @llvm.floor.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
   return _mm512_mask_roundscale_ps(__W,__U,__A, 1);
 }
 
+__m512 test_mm512_mask_roundscale_ceil_ps(__m512 __W, __mmask16 __U, __m512 __A)
+{
+  // CHECK-LABEL: @test_mm512_mask_roundscale_ceil_ps
+  // CHECK: @llvm.ceil.v16f32
+  // CHECK: select <16 x i1> %{{.*}}, <16 x float> %{{.*}}, <16 x float> %{{.*}}
+  return _mm512_mask_roundscale_ps(__W,__U,__A, 2);
+}
+
 __m512 test_mm512_maskz_roundscale_ps(__mmask16 __U, __m512 __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_roundscale_ps
   // CHECK: @llvm.x86.avx512.mask.rndscale.ps.512
-  return _mm512_maskz_roundscale_ps(__U,__A, 1);
+  return _mm512_maskz_roundscale_ps(__U,__A, 3);
 }
 
 __m512 test_mm512_mask_roundscale_round_ps(__m512 __A,__mmask16 __U,__m512 __C)
@@ -7632,14 +7684,14 @@
 {
   // CHECK-LABEL: @test_mm512_mask_roundscale_pd
   // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
-  return _mm512_mask_roundscale_pd(__W,__U,__A, 1);
+  return _mm512_mask_roundscale_pd(__W,__U,__A, 3);
 }
 
 __m512d test_mm512_maskz_roundscale_pd(__mmask8 __U, __m512d __A)
 {
   // CHECK-LABEL: @test_mm512_maskz_roundscale_pd
   // CHECK: @llvm.x86.avx512.mask.rndscale.pd.512
-  return _mm512_maskz_roundscale_pd(__U,__A, 1);
+  return _mm512_maskz_roundscale_pd(__U,__A, 3);
 }
 
 __m512d test_mm512_mask_roundscale_round_pd(__m512d __A,__mmask8 __U,__m512d __C)
Index: clang/test/CodeGen/sse41-builtins.c
===================================================================
--- clang/test/CodeGen/sse41-builtins.c
+++ clang/test/CodeGen/sse41-builtins.c
@@ -44,25 +44,31 @@
 
 __m128d test_mm_ceil_pd(__m128d x) {
   // CHECK-LABEL: test_mm_ceil_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v2f64
+  // CHECK-NOT: select
   return _mm_ceil_pd(x);
 }
 
 __m128 test_mm_ceil_ps(__m128 x) {
   // CHECK-LABEL: test_mm_ceil_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 2)
+  // CHECK: @llvm.ceil.v4f32
+  // CHECK-NOT: select
   return _mm_ceil_ps(x);
 }
 
 __m128d test_mm_ceil_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_ceil_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 2)
+  // CHECK: extractelement
+  // CHECK: @llvm.ceil.f64
+  // CHECK: insertelement
   return _mm_ceil_sd(x, y);
 }
 
 __m128 test_mm_ceil_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_ceil_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 2)
+  // CHECK: extractelement
+  // CHECK: @llvm.ceil.f32
+  // CHECK: insertelement
   return _mm_ceil_ss(x, y);
 }
 
@@ -196,25 +202,31 @@
 
 __m128d test_mm_floor_pd(__m128d x) {
   // CHECK-LABEL: test_mm_floor_pd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.pd(<2 x double> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v2f64
+  // CHECK-NOT: select
   return _mm_floor_pd(x);
 }
 
 __m128 test_mm_floor_ps(__m128 x) {
   // CHECK-LABEL: test_mm_floor_ps
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 1)
+  // CHECK: @llvm.floor.v4f32
+  // CHECK-NOT: select
   return _mm_floor_ps(x);
 }
 
 __m128d test_mm_floor_sd(__m128d x, __m128d y) {
   // CHECK-LABEL: test_mm_floor_sd
-  // CHECK: call <2 x double> @llvm.x86.sse41.round.sd(<2 x double> %{{.*}}, <2 x double> %{{.*}}, i32 1)
+  // CHECK: extractelement
+  // CHECK: @llvm.floor.f64
+  // CHECK: insertelement
   return _mm_floor_sd(x, y);
 }
 
 __m128 test_mm_floor_ss(__m128 x, __m128 y) {
   // CHECK-LABEL: test_mm_floor_ss
-  // CHECK: call <4 x float> @llvm.x86.sse41.round.ss(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i32 1)
+  // CHECK: extractelement
+  // CHECK: @llvm.floor.f32
+  // CHECK: insertelement
   return _mm_floor_ss(x, y);
 }