Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -177,9 +177,6 @@
   def int_x86_sse_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
                 [IntrNoMem]>;
-  def int_x86_sse_sqrt_ps : GCCBuiltin<"__builtin_ia32_sqrtps">,
-      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
-                [IntrNoMem]>;
   def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
                 [IntrNoMem]>;
@@ -307,9 +304,6 @@
   def int_x86_sse2_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
                 [IntrNoMem]>;
-  def int_x86_sse2_sqrt_pd : GCCBuiltin<"__builtin_ia32_sqrtpd">,
-      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
-                [IntrNoMem]>;
   def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
                 [IntrNoMem]>;
@@ -982,11 +976,6 @@
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
                  llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256">,
-      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256">,
-      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-
   def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
@@ -4505,29 +4494,17 @@
       Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                  llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round_mask">,
+  def int_x86_avx512_sqrt_ss_mask : GCCBuiltin<"__builtin_ia32_sqrtss_mask">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round_mask">,
+  def int_x86_avx512_sqrt_sd_mask : GCCBuiltin<"__builtin_ia32_sqrtsd_mask">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">,
-      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256_mask">,
-      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">,
+  def int_x86_avx512_sqrt_pd_512_mask : GCCBuiltin<"__builtin_ia32_sqrt_pd512_mask">,
       Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_128 : GCCBuiltin<"__builtin_ia32_sqrtps128_mask">,
-      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256_mask">,
-      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">,
+  def int_x86_avx512_sqrt_ps_512_mask : GCCBuiltin<"__builtin_ia32_sqrt_ps512_mask">,
       Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                  llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_fixupimm_pd_128 :
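
Note: the TableGen change keeps only the forms that need a rounding-mode operand, under new names. A rough IR sketch of a call to one of the renamed scalar intrinsics, with the operand order from the definition (two sources, passthrough, mask, rounding mode); the value names and the `i32 1` (rd-sae) rounding immediate are illustrative assumptions:

  ; non-default rounding keeps using the target intrinsic
  %r = call <4 x float> @llvm.x86.avx512.sqrt.ss.mask(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask, i32 1)
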
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -79,6 +79,10 @@
       Name.startswith("avx2.pabs.") || // Added in 6.0
       Name.startswith("avx512.mask.pabs.") || // Added in 6.0
       Name.startswith("avx512.broadcastm") || // Added in 6.0
+      Name.startswith("avx512.mask.sqrt") || // Added in 6.0
+      Name.startswith("avx.sqrt.p") || // Added in 6.0
+      Name.startswith("sse2.sqrt.p") || // Added in 6.0
+      Name.startswith("sse.sqrt.p") || // Added in 6.0
       Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
       Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -1044,6 +1048,60 @@
                       ExtTy->getPrimitiveSizeInBits();
       Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
       Rep = Builder.CreateVectorSplat(NumElts, Rep);
+    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.s"))) {
+      if (cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4) {
+        Intrinsic::ID ID;
+        if (Name == "avx512.mask.sqrt.sd")
+          ID = Intrinsic::x86_avx512_sqrt_sd_mask;
+        else
+          ID = Intrinsic::x86_avx512_sqrt_ss_mask;
+        Function *Intrin = Intrinsic::getDeclaration(F->getParent(), ID);
+        Rep = Builder.CreateCall(Intrin,
+                                 {CI->getArgOperand(0), CI->getArgOperand(1),
+                                  CI->getArgOperand(2), CI->getArgOperand(3),
+                                  CI->getArgOperand(4)});
+      } else {
+        llvm::Value *C0 = llvm::ConstantInt::get(Type::getInt32Ty(C), 0);
+        Value *A = Builder.CreateExtractElement(CI->getArgOperand(0), C0);
+        Function *Intrin = Intrinsic::getDeclaration(
+            F->getParent(), Intrinsic::sqrt, A->getType());
+        Value *Src = Builder.CreateExtractElement(CI->getArgOperand(2), C0);
+        Value *Mask = CI->getArgOperand(3);
+        int MaskSize = Mask->getType()->getScalarSizeInBits();
+        llvm::Type *MaskTy =
+            llvm::VectorType::get(Builder.getInt1Ty(), MaskSize);
+        Mask = Builder.CreateBitCast(Mask, MaskTy);
+        Mask = Builder.CreateExtractElement(Mask, C0);
+        A = Builder.CreateSelect(Mask, Builder.CreateCall(Intrin, {A}), Src);
+        Rep = Builder.CreateInsertElement(CI->getArgOperand(1), A, C0);
+      }
+    } else if (IsX86 && (Name.startswith("avx.sqrt.p") ||
+                         Name.startswith("sse2.sqrt.p") ||
+                         Name.startswith("sse.sqrt.p"))) {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::sqrt,
+                                                         CI->getType()),
+                               {CI->getArgOperand(0)});
+    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) {
+      if (Name.endswith("512") &&
+          cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4) {
+        Intrinsic::ID ID;
+        if (Name == "avx512.mask.sqrt.pd.512")
+          ID = Intrinsic::x86_avx512_sqrt_pd_512_mask;
+        else
+          ID = Intrinsic::x86_avx512_sqrt_ps_512_mask;
+        Function *Intrin = Intrinsic::getDeclaration(F->getParent(), ID);
+        Rep = Builder.CreateCall(Intrin,
+                                 {CI->getArgOperand(0), CI->getArgOperand(1),
+                                  CI->getArgOperand(2), CI->getArgOperand(3)});
+      } else {
+        Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                           Intrinsic::sqrt,
+                                                           CI->getType()),
+                                 {CI->getArgOperand(0)});
+        Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                            CI->getArgOperand(1));
+      }
     } else if (IsX86 && (Name.startswith("avx512.ptestm") ||
                          Name.startswith("avx512.ptestnm"))) {
       Value *Op0 = CI->getArgOperand(0);
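
Note: the net effect of the upgrade hook, sketched in IR (names like %a, %passthru, %m are placeholders, and the exact mask bitcast/extract sequence the upgrader emits is elided): a masked packed call with CUR_DIRECTION rounding (i32 4), or any legacy unmasked call, becomes a generic llvm.sqrt plus a select, while a call carrying a real rounding mode is forwarded to the renamed target intrinsic.

  ; before (old intrinsic, rewritten on load):
  %r = call <2 x double> @llvm.x86.avx512.mask.sqrt.pd.128(<2 x double> %a, <2 x double> %passthru, i8 %mask)
  ; after, roughly:
  %s = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
  %r2 = select <2 x i1> %m, <2 x double> %s, <2 x double> %passthru
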
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -391,8 +391,6 @@
   X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -1084,18 +1082,6 @@
                      X86ISD::SCALEFS, 0),
   X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::SCALEFS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSQRTS_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSQRTS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
                      X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
@@ -1546,6 +1532,16 @@
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+
+  X86_INTRINSIC_DATA(avx512_sqrt_pd_512_mask, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_sqrt_ps_512_mask, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_sqrt_sd_mask, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_sqrt_ss_mask, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+
   X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
@@ -1606,7 +1602,6 @@
   X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1663,7 +1658,6 @@
   X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
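
Note: with the generic entries removed from the table, plain ISD::FSQRT is now reached only through the generic llvm.sqrt.* intrinsics; the four entries that remain exist solely for the rounding-mode forms. A minimal sketch of the two paths (the i32 2 rounding immediate is an illustrative choice):

  %v = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)   ; generic path, selects to vsqrtps
  %w = call <16 x float> @llvm.x86.avx512.sqrt.ps.512.mask(<16 x float> %a, <16 x float> %p, i16 %m, i32 2)   ; FSQRT_RND path
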
Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -3019,10 +3019,12 @@
 ; X64: # %bb.0:
 ; X64-NEXT: vsqrtpd %ymm0, %ymm0
 ; X64-NEXT: retq
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
-  ret <4 x double> %res
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
+  ret <4 x double> %0
 }
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1
 
 define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_sqrt_ps:
@@ -3034,10 +3036,12 @@
 ; X64: # %bb.0:
 ; X64-NEXT: vsqrtps %ymm0, %ymm0
 ; X64-NEXT: retq
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
-  ret <8 x float> %res
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
+  ret <8 x float> %0
 }
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
 
 define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_store_pd:
Index: test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -4,6 +4,36 @@
 
 ; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
 
+define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
 define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
 ; CHECK: # %bb.0:
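
Note: the pair of files above captures the upgrade contract for the AVX forms. The -upgrade test keeps calling the legacy spelling and still expects vsqrtpd/vsqrtps, because on load the call is rewritten roughly as:

  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
  ; becomes
  %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
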
Index: test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86.ll
+++ test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -633,39 +633,6 @@
 }
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 
-
-define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-
-
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
 ; AVX-LABEL: test_x86_avx_vpermilvar_pd:
 ; AVX: # %bb.0:
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1738,3 +1738,273 @@
 
 !0 = !{i32 1}
 
+define <2 x double> @test_mm_sqrt_round_sd(<2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %0 = tail call double @llvm.sqrt.f64(double %extract)
+  %1 = insertelement <2 x double> %__B, double %0, i64 0
+  ret <2 x double> %1
+}
+
+declare double @llvm.sqrt.f64(double) #1
+
+define <2 x double> @test_mm_mask_sqrt_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <2 x double> %__A, i64 0
+  %extract1.i = extractelement <2 x double> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract.i) #2
+  %2 = select i1 %extract2.i, double %1, double %extract1.i
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_mask_sqrt_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %extract1 = extractelement <2 x double> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract)
+  %2 = select i1 %extract2, double %1, double %extract1
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_maskz_sqrt_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <2 x double> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract.i) #2
+  %2 = select i1 %extract2.i, double %1, double 0.000000e+00
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_maskz_sqrt_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X64-NEXT: retq
entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract)
+  %2 = select i1 %extract2, double %1, double 0.000000e+00
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_mm_sqrt_round_ss(<4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %0 = tail call float @llvm.sqrt.f32(float %extract)
+  %1 = insertelement <4 x float> %__B, float %0, i64 0
+  ret <4 x float> %1
+}
+
+declare float @llvm.sqrt.f32(float) #1
+
+define <4 x float> @test_mm_mask_sqrt_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <4 x float> %__A, i64 0
+  %extract1.i = extractelement <4 x float> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract.i) #2
+  %2 = select i1 %extract2.i, float %1, float %extract1.i
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_mask_sqrt_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %extract1 = extractelement <4 x float> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract)
+  %2 = select i1 %extract2, float %1, float %extract1
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_maskz_sqrt_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <4 x float> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract.i) #2
+  %2 = select i1 %extract2.i, float %1, float 0.000000e+00
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_maskz_sqrt_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract)
+  %2 = select i1 %extract2, float %1, float 0.000000e+00
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
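
Note: the masked scalar tests above all check one IR shape, which is what the frontend now emits instead of the old intrinsic. Distilled, with placeholder names:

  %x = extractelement <2 x double> %__A, i64 0
  %m8 = bitcast i8 %__U to <8 x i1>
  %m0 = extractelement <8 x i1> %m8, i64 0
  %s = tail call double @llvm.sqrt.f64(double %x)
  %sel = select i1 %m0, double %s, double %passthru
  %r = insertelement <2 x double> %__B, double %sel, i64 0
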
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,6 +1,81 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
+declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_ss:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vmovss %xmm3, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
+
+  %res.1 = fadd <4 x float> %res0, %res1
+  %res.2 = fadd <4 x float> %res2, %res3
+  %res = fadd <4 x float> %res.1, %res.2
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vmovsd %xmm3, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
+
+  %res.1 = fadd <2 x double> %res0, %res1
+  %res.2 = fadd <2 x double> %res2, %res3
+  %res = fadd <2 x double> %res.1, %res.2
+  ret <2 x double> %res
+}
+
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
+; CHECK-LABEL: test_sqrt_pd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
+; CHECK-LABEL: test_sqrt_ps_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsqrtps %zmm0, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
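
Note the split the upgrade tests above pin down: the i32 4 (CUR_DIRECTION) calls lower to a plain vsqrtss/vsqrtpd plus a masked move, while the i32 1..3 calls survive as rounding-mode instructions ({rd-sae}, {ru-sae}, {rz-sae}). A sketch of the two outcomes for the same legacy spelling:

  %p = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a, <8 x double> zeroinitializer, i8 -1, i32 4)  ; upgraded to llvm.sqrt.v8f64
  %q = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a, <8 x double> %src, i8 %m, i32 1)  ; forwarded to the renamed target intrinsic
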
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -272,24 +272,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
 
-define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_sqrt_pd_512:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
-; CHECK-NEXT: retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_sqrt_ps_512:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vsqrtps %zmm0, %zmm0
-; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_sqrt_round_ps_512:
 ; CHECK: ## %bb.0:
@@ -337,58 +319,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
-declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_sqrt_ss:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
-  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
-  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
-  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
-
-  %res.1 = fadd <4 x float> %res0, %res1
-  %res.2 = fadd <4 x float> %res2, %res3
-  %res = fadd <4 x float> %res.1, %res.2
-  ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_sqrt_sd:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
-  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
-  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
-  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
-
-  %res.1 = fadd <2 x double> %res0, %res1
-  %res.2 = fadd <2 x double> %res2, %res3
-  %res = fadd <2 x double> %res.1, %res.2
-  ret <2 x double> %res
-}
-
 define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -1937,3 +1937,178 @@
 }
 
 !0 = !{i32 1}
+
+define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
+  ret <2 x double> %2
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+
+define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
+  ret <2 x double> %2
+}
+
+define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
+  ret <4 x double> %2
+}
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
+
+define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
+  ret <4 x double> %2
+}
+
+define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
+  ret <4 x float> %2
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
+  ret <8 x float> %2
+}
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
+
Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -6140,3 +6140,26 @@
   ret i8 %res2
 }
 
+define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
+; CHECK-LABEL: test_sqrt_pd_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
+; CHECK-LABEL: test_sqrt_ps_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
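
Note: in the VL tests above the i8 mask covers eight lanes, so the 128-bit and 256-bit pd cases (and 128-bit ps) first narrow it with a shufflevector before the select; a condensed sketch of that idiom, with placeholder names:

  %m8 = bitcast i8 %__U to <8 x i1>
  %m2 = shufflevector <8 x i1> %m8, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %s = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A)
  %r = select <2 x i1> %m2, <2 x double> %s, <2 x double> %__W
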
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -905,29 +905,6 @@
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
 
-define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
-; CHECK-LABEL: test_sqrt_pd_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
-; CHECK-LABEL: test_sqrt_ps_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
-  ret <8 x float> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
 define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: test_getexp_pd_256:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1577,10 +1577,10 @@
 ; X64: # %bb.0:
 ; X64-NEXT: sqrtps %xmm0, %xmm0
 ; X64-NEXT: retq
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
 
 define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
 ; X32-LABEL: test_mm_sqrt_ss:
Index: test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -1,6 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
 
+
+define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse_sqrt_ps:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sqrtps %xmm0, %xmm0
+; CHECK-NEXT: retl
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+
 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
 ; CHECK-LABEL: test_x86_sse_storeu_ps:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86.ll
+++ test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -459,27 +459,6 @@
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 
 
-define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; SSE-LABEL: test_x86_sse_sqrt_ps:
-; SSE: ## %bb.0:
-; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_sqrt_ps:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_sqrt_ps:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-
 define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
 ; SSE-LABEL: test_x86_sse_sqrt_ss:
 ; SSE: ## %bb.0:
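
Note: the SSE case is the simplest of all, no masks and no rounding, so the upgrade is a one-for-one rename performed when the module is loaded:

  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ; upgrades to
  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
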
Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2948,10 +2948,10 @@
 ; X64: # %bb.0:
 ; X64-NEXT: sqrtpd %xmm0, %xmm0
 ; X64-NEXT: retq
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_sqrt_sd:
Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -1,6 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
 
+
+define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_sse2_sqrt_pd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sqrtpd %xmm0, %xmm0
+; CHECK-NEXT: retl
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+
 define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
 ; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1585,28 +1585,6 @@
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-
-define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; SSE-LABEL: test_x86_sse2_sqrt_pd:
-; SSE: ## %bb.0:
-; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_sqrt_pd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_sqrt_pd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-
 define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_sqrt_sd:
 ; SSE: ## %bb.0: