Index: include/llvm/IR/IntrinsicsX86.td
===================================================================
--- include/llvm/IR/IntrinsicsX86.td
+++ include/llvm/IR/IntrinsicsX86.td
@@ -177,9 +177,6 @@
   def int_x86_sse_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
                 [IntrNoMem]>;
-  def int_x86_sse_sqrt_ps : GCCBuiltin<"__builtin_ia32_sqrtps">,
-      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
-                [IntrNoMem]>;
   def int_x86_sse_rcp_ss : GCCBuiltin<"__builtin_ia32_rcpss">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty],
                 [IntrNoMem]>;
@@ -307,9 +304,6 @@
   def int_x86_sse2_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
                 [IntrNoMem]>;
-  def int_x86_sse2_sqrt_pd : GCCBuiltin<"__builtin_ia32_sqrtpd">,
-      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty],
-                [IntrNoMem]>;
   def int_x86_sse2_min_sd : GCCBuiltin<"__builtin_ia32_minsd">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty],
                 [IntrNoMem]>;
@@ -982,11 +976,6 @@
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty,
                  llvm_v8f32_ty], [IntrNoMem]>;
-  def int_x86_avx_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256">,
-      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty], [IntrNoMem]>;
-  def int_x86_avx_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256">,
-      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
-
   def int_x86_avx_rsqrt_ps_256 : GCCBuiltin<"__builtin_ia32_rsqrtps256">,
       Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty], [IntrNoMem]>;
@@ -4505,29 +4494,17 @@
       Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                  llvm_v16f32_ty, llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ss : GCCBuiltin<"__builtin_ia32_sqrtss_round_mask">,
+  def int_x86_avx512_sqrt_ss_mask : GCCBuiltin<"__builtin_ia32_sqrtss_mask">,
       Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty, llvm_v4f32_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_sd : GCCBuiltin<"__builtin_ia32_sqrtsd_round_mask">,
+  def int_x86_avx512_sqrt_sd_mask : GCCBuiltin<"__builtin_ia32_sqrtsd_mask">,
       Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty, llvm_v2f64_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_128 : GCCBuiltin<"__builtin_ia32_sqrtpd128_mask">,
-      Intrinsic<[llvm_v2f64_ty], [llvm_v2f64_ty, llvm_v2f64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_256 : GCCBuiltin<"__builtin_ia32_sqrtpd256_mask">,
-      Intrinsic<[llvm_v4f64_ty], [llvm_v4f64_ty, llvm_v4f64_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_pd_512 : GCCBuiltin<"__builtin_ia32_sqrtpd512_mask">,
+  def int_x86_avx512_sqrt_pd_512_mask : GCCBuiltin<"__builtin_ia32_sqrt_pd512_mask">,
       Intrinsic<[llvm_v8f64_ty], [llvm_v8f64_ty, llvm_v8f64_ty,
                  llvm_i8_ty, llvm_i32_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_128 : GCCBuiltin<"__builtin_ia32_sqrtps128_mask">,
-      Intrinsic<[llvm_v4f32_ty], [llvm_v4f32_ty, llvm_v4f32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_256 : GCCBuiltin<"__builtin_ia32_sqrtps256_mask">,
-      Intrinsic<[llvm_v8f32_ty], [llvm_v8f32_ty, llvm_v8f32_ty,
-                 llvm_i8_ty], [IntrNoMem]>;
-  def int_x86_avx512_mask_sqrt_ps_512 : GCCBuiltin<"__builtin_ia32_sqrtps512_mask">,
+  def int_x86_avx512_sqrt_ps_512_mask : GCCBuiltin<"__builtin_ia32_sqrt_ps512_mask">,
       Intrinsic<[llvm_v16f32_ty], [llvm_v16f32_ty, llvm_v16f32_ty,
                  llvm_i16_ty, llvm_i32_ty], [IntrNoMem]>;
   def int_x86_avx512_mask_fixupimm_pd_128 :
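
Note: the TableGen change keeps only the forms that need a rounding-mode operand, under new names. A rough IR sketch of a call to one of the renamed scalar intrinsics, with the operand order from the definition (two sources, passthrough, mask, rounding mode); the value names and the `i32 1` (rd-sae) rounding immediate are illustrative assumptions:

  ; non-default rounding keeps using the target intrinsic
  %r = call <4 x float> @llvm.x86.avx512.sqrt.ss.mask(<4 x float> %a, <4 x float> %b, <4 x float> %passthru, i8 %mask, i32 1)
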
Index: lib/IR/AutoUpgrade.cpp
===================================================================
--- lib/IR/AutoUpgrade.cpp
+++ lib/IR/AutoUpgrade.cpp
@@ -79,6 +79,10 @@
       Name.startswith("avx2.pabs.") || // Added in 6.0
       Name.startswith("avx512.mask.pabs.") || // Added in 6.0
       Name.startswith("avx512.broadcastm") || // Added in 6.0
+      Name.startswith("avx512.mask.sqrt") || // Added in 6.0
+      Name.startswith("avx.sqrt.p") || // Added in 6.0
+      Name.startswith("sse2.sqrt.p") || // Added in 6.0
+      Name.startswith("sse.sqrt.p") || // Added in 6.0
       Name.startswith("avx512.mask.pbroadcast") || // Added in 6.0
       Name.startswith("sse2.pcmpeq.") || // Added in 3.1
       Name.startswith("sse2.pcmpgt.") || // Added in 3.1
@@ -1044,6 +1048,60 @@
                       ExtTy->getPrimitiveSizeInBits();
       Rep = Builder.CreateZExt(CI->getArgOperand(0), ExtTy);
       Rep = Builder.CreateVectorSplat(NumElts, Rep);
+    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.s"))) {
+      if (cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4) {
+        Intrinsic::ID ID;
+        if (Name == "avx512.mask.sqrt.sd")
+          ID = Intrinsic::x86_avx512_sqrt_sd_mask;
+        else
+          ID = Intrinsic::x86_avx512_sqrt_ss_mask;
+        Function *Intrin = Intrinsic::getDeclaration(F->getParent(), ID);
+        Rep = Builder.CreateCall(Intrin,
+                                 {CI->getArgOperand(0), CI->getArgOperand(1),
+                                  CI->getArgOperand(2), CI->getArgOperand(3),
+                                  CI->getArgOperand(4)});
+      } else {
+        llvm::Value *C0 = llvm::ConstantInt::get(Type::getInt32Ty(C), 0);
+        Value *A = Builder.CreateExtractElement(CI->getArgOperand(0), C0);
+        Function *Intrin = Intrinsic::getDeclaration(
+            F->getParent(), Intrinsic::sqrt, A->getType());
+        Value *Src = Builder.CreateExtractElement(CI->getArgOperand(2), C0);
+        Value *Mask = CI->getArgOperand(3);
+        int MaskSize = Mask->getType()->getScalarSizeInBits();
+        llvm::Type *MaskTy =
+            llvm::VectorType::get(Builder.getInt1Ty(), MaskSize);
+        Mask = Builder.CreateBitCast(Mask, MaskTy);
+        Mask = Builder.CreateExtractElement(Mask, C0);
+        A = Builder.CreateSelect(Mask, Builder.CreateCall(Intrin, {A}), Src);
+        Rep = Builder.CreateInsertElement(CI->getArgOperand(1), A, C0);
+      }
+    } else if (IsX86 && (Name.startswith("avx.sqrt.p") ||
+                         Name.startswith("sse2.sqrt.p") ||
+                         Name.startswith("sse.sqrt.p"))) {
+      Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                         Intrinsic::sqrt,
+                                                         CI->getType()),
+                               {CI->getArgOperand(0)});
+    } else if (IsX86 && (Name.startswith("avx512.mask.sqrt.p"))) {
+      if (Name.endswith("512") &&
+          cast<ConstantInt>(CI->getArgOperand(3))->getZExtValue() != 4) {
+        Intrinsic::ID ID;
+        if (Name == "avx512.mask.sqrt.pd.512")
+          ID = Intrinsic::x86_avx512_sqrt_pd_512_mask;
+        else
+          ID = Intrinsic::x86_avx512_sqrt_ps_512_mask;
+        Function *Intrin = Intrinsic::getDeclaration(F->getParent(), ID);
+        Rep = Builder.CreateCall(Intrin,
+                                 {CI->getArgOperand(0), CI->getArgOperand(1),
+                                  CI->getArgOperand(2), CI->getArgOperand(3)});
+      } else {
+        Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(),
+                                                           Intrinsic::sqrt,
+                                                           CI->getType()),
+                                 {CI->getArgOperand(0)});
+        Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+                            CI->getArgOperand(1));
+      }
     } else if (IsX86 && (Name.startswith("avx512.ptestm") ||
                          Name.startswith("avx512.ptestnm"))) {
       Value *Op0 = CI->getArgOperand(0);
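
Note: the net effect of the upgrade hook, sketched in IR (names like %a, %passthru, %m are placeholders, and the exact mask bitcast/extract sequence the upgrader emits is elided): a masked packed call with CUR_DIRECTION rounding (i32 4), or any legacy unmasked call, becomes a generic llvm.sqrt plus a select, while a call carrying a real rounding mode is forwarded to the renamed target intrinsic.

  ; before (old intrinsic, rewritten on load):
  %r = call <2 x double> @llvm.x86.avx512.mask.sqrt.pd.128(<2 x double> %a, <2 x double> %passthru, i8 %mask)
  ; after, roughly:
  %s = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a)
  %r2 = select <2 x i1> %m, <2 x double> %s, <2 x double> %passthru
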
Index: lib/Target/X86/X86IntrinsicsInfo.h
===================================================================
--- lib/Target/X86/X86IntrinsicsInfo.h
+++ lib/Target/X86/X86IntrinsicsInfo.h
@@ -391,8 +391,6 @@
   X86_INTRINSIC_DATA(avx_round_pd_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_round_ps_256, ROUNDP, X86ISD::VRNDSCALE, 0),
   X86_INTRINSIC_DATA(avx_rsqrt_ps_256, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_pd_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx_sqrt_ps_256, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_pd_256, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
   X86_INTRINSIC_DATA(avx_vpermilvar_ps, INTR_TYPE_2OP, X86ISD::VPERMILPV, 0),
@@ -1084,18 +1082,6 @@
                      X86ISD::SCALEFS, 0),
   X86_INTRINSIC_DATA(avx512_mask_scalef_ss, INTR_TYPE_SCALAR_MASK_RM,
                      X86ISD::SCALEFS, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_pd_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_128, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_256, INTR_TYPE_1OP_MASK, ISD::FSQRT, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ps_512, INTR_TYPE_1OP_MASK, ISD::FSQRT,
-                     X86ISD::FSQRT_RND),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_sd, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSQRTS_RND, 0),
-  X86_INTRINSIC_DATA(avx512_mask_sqrt_ss, INTR_TYPE_SCALAR_MASK_RM,
-                     X86ISD::FSQRTS_RND, 0),
   X86_INTRINSIC_DATA(avx512_mask_sub_pd_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
                      X86ISD::FSUB_RND),
   X86_INTRINSIC_DATA(avx512_mask_sub_ps_512, INTR_TYPE_2OP_MASK, ISD::FSUB,
@@ -1546,6 +1532,16 @@
   X86_INTRINSIC_DATA(avx512_rsqrt28_ps, INTR_TYPE_1OP_MASK_RM,X86ISD::RSQRT28, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_sd, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
   X86_INTRINSIC_DATA(avx512_rsqrt28_ss, INTR_TYPE_SCALAR_MASK_RM,X86ISD::RSQRT28S, 0),
+
+  X86_INTRINSIC_DATA(avx512_sqrt_pd_512_mask, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_sqrt_ps_512_mask, INTR_TYPE_1OP_MASK, ISD::FSQRT,
+                     X86ISD::FSQRT_RND),
+  X86_INTRINSIC_DATA(avx512_sqrt_sd_mask, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+  X86_INTRINSIC_DATA(avx512_sqrt_ss_mask, INTR_TYPE_SCALAR_MASK_RM,
+                     X86ISD::FSQRTS_RND, 0),
+
   X86_INTRINSIC_DATA(avx512_vcomi_sd, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcomi_ss, COMI_RM, X86ISD::COMI, X86ISD::UCOMI),
   X86_INTRINSIC_DATA(avx512_vcvtsd2si32, INTR_TYPE_2OP, X86ISD::CVTS2SI_RND, 0),
@@ -1606,7 +1602,6 @@
   X86_INTRINSIC_DATA(sse_movmsk_ps, INTR_TYPE_1OP, X86ISD::MOVMSK, 0),
   X86_INTRINSIC_DATA(sse_rcp_ps, INTR_TYPE_1OP, X86ISD::FRCP, 0),
   X86_INTRINSIC_DATA(sse_rsqrt_ps, INTR_TYPE_1OP, X86ISD::FRSQRT, 0),
-  X86_INTRINSIC_DATA(sse_sqrt_ps, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse_ucomieq_ss, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse_ucomige_ss, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse_ucomigt_ss, COMI, X86ISD::UCOMI, ISD::SETGT),
@@ -1663,7 +1658,6 @@
   X86_INTRINSIC_DATA(sse2_psubs_w, INTR_TYPE_2OP, X86ISD::SUBS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_b, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
   X86_INTRINSIC_DATA(sse2_psubus_w, INTR_TYPE_2OP, X86ISD::SUBUS, 0),
-  X86_INTRINSIC_DATA(sse2_sqrt_pd, INTR_TYPE_1OP, ISD::FSQRT, 0),
   X86_INTRINSIC_DATA(sse2_ucomieq_sd, COMI, X86ISD::UCOMI, ISD::SETEQ),
   X86_INTRINSIC_DATA(sse2_ucomige_sd, COMI, X86ISD::UCOMI, ISD::SETGE),
   X86_INTRINSIC_DATA(sse2_ucomigt_sd, COMI, X86ISD::UCOMI, ISD::SETGT),
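
Note: with the generic entries removed from the table, plain ISD::FSQRT is now reached only through the generic llvm.sqrt.* intrinsics; the four entries that remain exist solely for the rounding-mode forms. A minimal sketch of the two paths (the i32 2 rounding immediate is an illustrative choice):

  %v = call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a)   ; generic path, selects to vsqrtps
  %w = call <16 x float> @llvm.x86.avx512.sqrt.ps.512.mask(<16 x float> %a, <16 x float> %p, i16 %m, i32 2)   ; FSQRT_RND path
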
Index: test/CodeGen/X86/avx-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx-intrinsics-fast-isel.ll
@@ -3019,10 +3019,12 @@
 ; X64: # %bb.0:
 ; X64-NEXT: vsqrtpd %ymm0, %ymm0
 ; X64-NEXT: retq
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
-  ret <4 x double> %res
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0) #2
+  ret <4 x double> %0
 }
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) #1
 
 define <8 x float> @test_mm256_sqrt_ps(<8 x float> %a0) nounwind {
 ; X32-LABEL: test_mm256_sqrt_ps:
@@ -3034,10 +3036,12 @@
 ; X64: # %bb.0:
 ; X64-NEXT: vsqrtps %ymm0, %ymm0
 ; X64-NEXT: retq
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0)
-  ret <8 x float> %res
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %a0) #2
+  ret <8 x float> %0
 }
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) #1
 
 define void @test_mm256_store_pd(double* %a0, <4 x double> %a1) nounwind {
 ; X32-LABEL: test_mm256_store_pd:
Index: test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/avx-intrinsics-x86-upgrade.ll
@@ -4,6 +4,36 @@
 
 ; We don't check any vinsertf128 variant with immediate 0 because that's just a blend.
 
+define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
+
+define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
+; AVX-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+;
+; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
+; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
+  ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
+
 define <4 x double> @test_x86_avx_vinsertf128_pd_256_1(<4 x double> %a0, <2 x double> %a1) {
 ; CHECK-LABEL: test_x86_avx_vinsertf128_pd_256_1:
 ; CHECK: # %bb.0:
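
Note: the pair of files above captures the upgrade contract for the AVX forms. The -upgrade test keeps calling the legacy spelling and still expects vsqrtpd/vsqrtps, because on load the call is rewritten roughly as:

  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0)
  ; becomes
  %res = call <4 x double> @llvm.sqrt.v4f64(<4 x double> %a0)
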
Index: test/CodeGen/X86/avx-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/avx-intrinsics-x86.ll
+++ test/CodeGen/X86/avx-intrinsics-x86.ll
@@ -633,39 +633,6 @@
 }
 declare <8 x float> @llvm.x86.avx.rsqrt.ps.256(<8 x float>) nounwind readnone
 
-
-define <4 x double> @test_x86_avx_sqrt_pd_256(<4 x double> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtpd %ymm0, %ymm0 # encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_pd_256:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vsqrtpd %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfd,0x51,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double> %a0) ; <<4 x double>> [#uses=1]
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx.sqrt.pd.256(<4 x double>) nounwind readnone
-
-
-define <8 x float> @test_x86_avx_sqrt_ps_256(<8 x float> %a0) {
-; AVX-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX: # %bb.0:
-; AVX-NEXT: vsqrtps %ymm0, %ymm0 # encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-;
-; AVX512VL-LABEL: test_x86_avx_sqrt_ps_256:
-; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vsqrtps %ymm0, %ymm0 # EVEX TO VEX Compression encoding: [0xc5,0xfc,0x51,0xc0]
-; AVX512VL-NEXT: ret{{[l|q]}} # encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float> %a0) ; <<8 x float>> [#uses=1]
-  ret <8 x float> %res
-}
-declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone
-
-
 define <2 x double> @test_x86_avx_vpermilvar_pd(<2 x double> %a0, <2 x i64> %a1) {
 ; AVX-LABEL: test_x86_avx_vpermilvar_pd:
 ; AVX: # %bb.0:
Index: test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512-intrinsics-fast-isel.ll
@@ -1738,3 +1738,273 @@
 
 !0 = !{i32 1}
 
+define <2 x double> @test_mm_sqrt_round_sd(<2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm1[1]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %0 = tail call double @llvm.sqrt.f64(double %extract)
+  %1 = insertelement <2 x double> %__B, double %0, i64 0
+  ret <2 x double> %1
+}
+
+declare double @llvm.sqrt.f64(double) #1
+
+define <2 x double> @test_mm_mask_sqrt_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <2 x double> %__A, i64 0
+  %extract1.i = extractelement <2 x double> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract.i) #2
+  %2 = select i1 %extract2.i, double %1, double %extract1.i
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_mask_sqrt_round_sd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovsd %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm0[0],xmm2[1]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %extract1 = extractelement <2 x double> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract)
+  %2 = select i1 %extract2, double %1, double %extract1
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_maskz_sqrt_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <2 x double> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract.i) #2
+  %2 = select i1 %extract2.i, double %1, double 0.000000e+00
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <2 x double> @test_mm_maskz_sqrt_round_sd(i8 zeroext %__U, <2 x double> %__A, <2 x double> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_round_sd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_round_sd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtsd %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorpd %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovsd %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovsd {{.*#+}} xmm0 = xmm2[0],xmm1[1]
+; X64-NEXT: retq
entry:
+  %extract = extractelement <2 x double> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call double @llvm.sqrt.f64(double %extract)
+  %2 = select i1 %extract2, double %1, double 0.000000e+00
+  %3 = insertelement <2 x double> %__B, double %2, i64 0
+  ret <2 x double> %3
+}
+
+define <4 x float> @test_mm_sqrt_round_ss(<4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %0 = tail call float @llvm.sqrt.f32(float %extract)
+  %1 = insertelement <4 x float> %__B, float %0, i64 0
+  ret <4 x float> %1
+}
+
+declare float @llvm.sqrt.f32(float) #1
+
+define <4 x float> @test_mm_mask_sqrt_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <4 x float> %__A, i64 0
+  %extract1.i = extractelement <4 x float> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract.i) #2
+  %2 = select i1 %extract2.i, float %1, float %extract1.i
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_mask_sqrt_round_ss(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_mask_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm1, %xmm1, %xmm1
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vmovss %xmm1, %xmm0, %xmm0 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm0[0],xmm2[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %extract1 = extractelement <4 x float> %__W, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract)
+  %2 = select i1 %extract2, float %1, float %extract1
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_maskz_sqrt_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract.i = extractelement <4 x float> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2.i = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract.i) #2
+  %2 = select i1 %extract2.i, float %1, float 0.000000e+00
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
+define <4 x float> @test_mm_maskz_sqrt_round_ss(i8 zeroext %__U, <4 x float> %__A, <4 x float> %__B) {
+; X32-LABEL: test_mm_maskz_sqrt_round_ss:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X32-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X32-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_round_ss:
+; X64: # %bb.0: # %entry
+; X64-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vmovss %xmm0, %xmm0, %xmm2 {%k1}
+; X64-NEXT: vmovss {{.*#+}} xmm0 = xmm2[0],xmm1[1,2,3]
+; X64-NEXT: retq
+entry:
+  %extract = extractelement <4 x float> %__A, i64 0
+  %0 = bitcast i8 %__U to <8 x i1>
+  %extract2 = extractelement <8 x i1> %0, i64 0
+  %1 = tail call float @llvm.sqrt.f32(float %extract)
+  %2 = select i1 %extract2, float %1, float 0.000000e+00
+  %3 = insertelement <4 x float> %__B, float %2, i64 0
+  ret <4 x float> %3
+}
+
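
Note: the masked scalar tests above all check one IR shape, which is what the frontend now emits instead of the old intrinsic. Distilled, with placeholder names:

  %x = extractelement <2 x double> %__A, i64 0
  %m8 = bitcast i8 %__U to <8 x i1>
  %m0 = extractelement <8 x i1> %m8, i64 0
  %s = tail call double @llvm.sqrt.f64(double %x)
  %sel = select i1 %m0, double %s, double %passthru
  %r = insertelement <2 x double> %__B, double %sel, i64 0
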
Index: test/CodeGen/X86/avx512-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512-intrinsics-upgrade.ll
@@ -1,6 +1,81 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
 
+declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
+
+define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_ss:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vsqrtss %xmm0, %xmm0, %xmm3
+; CHECK-NEXT: vmovaps %xmm2, %xmm4
+; CHECK-NEXT: vmovss %xmm3, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddps %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vaddps %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
+  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
+  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
+  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
+
+  %res.1 = fadd <4 x float> %res0, %res1
+  %res.2 = fadd <4 x float> %res2, %res3
+  %res = fadd <4 x float> %res.1, %res.2
+  ret <4 x float> %res
+}
+
+declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
+
+define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
+; CHECK-LABEL: test_sqrt_sd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1
+; CHECK-NEXT: vsqrtsd %xmm0, %xmm0, %xmm3
+; CHECK-NEXT: vmovapd %xmm2, %xmm4
+; CHECK-NEXT: vmovsd %xmm3, %xmm1, %xmm4 {%k1}
+; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
+; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm3 {%k1} {z}
+; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vaddpd %xmm2, %xmm4, %xmm1
+; CHECK-NEXT: vaddpd %xmm0, %xmm3, %xmm0
+; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: retq
+  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
+  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
+  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
+  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
+
+  %res.1 = fadd <2 x double> %res0, %res1
+  %res.2 = fadd <2 x double> %res2, %res3
+  %res = fadd <2 x double> %res.1, %res.2
+  ret <2 x double> %res
+}
+
+define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
+; CHECK-LABEL: test_sqrt_pd_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
+; CHECK-NEXT: retq
+  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
+  ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
+
+define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
+; CHECK-LABEL: test_sqrt_ps_512:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: vsqrtps %zmm0, %zmm0
+; CHECK-NEXT: retq
+  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
+  ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
+
 declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
 
 define i16 @unpckbw_test(i16 %a0, i16 %a1) {
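
Note the split the upgrade tests above pin down: the i32 4 (CUR_DIRECTION) calls lower to a plain vsqrtss/vsqrtpd plus a masked move, while the i32 1..3 calls survive as rounding-mode instructions ({rd-sae}, {ru-sae}, {rz-sae}). A sketch of the two outcomes for the same legacy spelling:

  %p = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a, <8 x double> zeroinitializer, i8 -1, i32 4)  ; upgraded to llvm.sqrt.v8f64
  %q = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a, <8 x double> %src, i8 %m, i32 1)  ; forwarded to the renamed target intrinsic
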
Index: test/CodeGen/X86/avx512-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512-intrinsics.ll
+++ test/CodeGen/X86/avx512-intrinsics.ll
@@ -272,24 +272,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
 
-define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
-; CHECK-LABEL: test_sqrt_pd_512:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vsqrtpd %zmm0, %zmm0
-; CHECK-NEXT: retq
-  %res = call <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 4)
-  ret <8 x double> %res
-}
-declare <8 x double> @llvm.x86.avx512.mask.sqrt.pd.512(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-
-define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
-; CHECK-LABEL: test_sqrt_ps_512:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: vsqrtps %zmm0, %zmm0
-; CHECK-NEXT: retq
-  %res = call <16 x float> @llvm.x86.avx512.mask.sqrt.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
-  ret <16 x float> %res
-}
 define <16 x float> @test_sqrt_round_ps_512(<16 x float> %a0) {
 ; CHECK-LABEL: test_sqrt_round_ps_512:
 ; CHECK: ## %bb.0:
@@ -337,58 +319,6 @@
 }
 declare <16 x float> @llvm.x86.avx512.mask.getexp.ps.512(<16 x float>, <16 x float>, i16, i32) nounwind readnone
 
-declare <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
-
-define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2, i8 %mask) {
-; CHECK-LABEL: test_sqrt_ss:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovaps %xmm2, %xmm3
-; CHECK-NEXT: vsqrtss %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vsqrtss {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtss {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vsqrtss {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddps %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddps %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddps %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
-  %res0 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 4)
-  %res1 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> %a2, i8 %mask, i32 1)
-  %res2 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 %mask, i32 2)
-  %res3 = call <4 x float> @llvm.x86.avx512.mask.sqrt.ss(<4 x float>%a0, <4 x float> %a1, <4 x float> zeroinitializer, i8 -1, i32 3)
-
-  %res.1 = fadd <4 x float> %res0, %res1
-  %res.2 = fadd <4 x float> %res2, %res3
-  %res = fadd <4 x float> %res.1, %res.2
-  ret <4 x float> %res
-}
-
-declare <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>, <2 x double>, <2 x double>, i8, i32) nounwind readnone
-
-define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, i8 %mask) {
-; CHECK-LABEL: test_sqrt_sd:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1
-; CHECK-NEXT: vmovapd %xmm2, %xmm3
-; CHECK-NEXT: vsqrtsd %xmm1, %xmm0, %xmm3 {%k1}
-; CHECK-NEXT: vsqrtsd {rd-sae}, %xmm1, %xmm0, %xmm2 {%k1}
-; CHECK-NEXT: vsqrtsd {ru-sae}, %xmm1, %xmm0, %xmm4 {%k1} {z}
-; CHECK-NEXT: vsqrtsd {rz-sae}, %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vaddpd %xmm2, %xmm3, %xmm1
-; CHECK-NEXT: vaddpd %xmm0, %xmm4, %xmm0
-; CHECK-NEXT: vaddpd %xmm0, %xmm1, %xmm0
-; CHECK-NEXT: retq
-  %res0 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 4)
-  %res1 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> %a2, i8 %mask, i32 1)
-  %res2 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 %mask, i32 2)
-  %res3 = call <2 x double> @llvm.x86.avx512.mask.sqrt.sd(<2 x double>%a0, <2 x double> %a1, <2 x double> zeroinitializer, i8 -1, i32 3)
-
-  %res.1 = fadd <2 x double> %res0, %res1
-  %res.2 = fadd <2 x double> %res2, %res3
-  %res = fadd <2 x double> %res.1, %res.2
-  ret <2 x double> %res
-}
-
 define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
 ; CHECK-LABEL: test_x86_sse2_cvtsd2si64:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-fast-isel.ll
@@ -1937,3 +1937,178 @@
 }
 
 !0 = !{i32 1}
+
+define <2 x double> @test_mm_mask_sqrt_pd(<2 x double> %__W, i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> %__W
+  ret <2 x double> %2
+}
+
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
+
+define <2 x double> @test_mm_maskz_sqrt_pd(i8 zeroext %__U, <2 x double> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
+  %2 = select <2 x i1> %extract.i, <2 x double> %0, <2 x double> zeroinitializer
+  ret <2 x double> %2
+}
+
+define <4 x double> @test_mm256_mask_sqrt_pd(<4 x double> %__W, i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> %__W
+  ret <4 x double> %2
+}
+
+declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
+
+define <4 x double> @test_mm256_maskz_sqrt_pd(i8 zeroext %__U, <4 x double> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_pd:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_pd:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x double> @llvm.sqrt.v4f64(<4 x double> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x double> %0, <4 x double> zeroinitializer
+  ret <4 x double> %2
+}
+
+define <4 x float> @test_mm_mask_sqrt_ps(<4 x float> %__W, i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_mask_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_mask_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %xmm1, %xmm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> %__W
+  ret <4 x float> %2
+}
+
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
+
+define <4 x float> @test_mm_maskz_sqrt_ps(i8 zeroext %__U, <4 x float> %__A) {
+; X32-LABEL: test_mm_maskz_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm_maskz_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %xmm0, %xmm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %extract.i = shufflevector <8 x i1> %1, <8 x i1> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %2 = select <4 x i1> %extract.i, <4 x float> %0, <4 x float> zeroinitializer
+  ret <4 x float> %2
+}
+
+define <8 x float> @test_mm256_mask_sqrt_ps(<8 x float> %__W, i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_mask_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_mask_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %ymm1, %ymm0 {%k1}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> %__W
+  ret <8 x float> %2
+}
+
+define <8 x float> @test_mm256_maskz_sqrt_ps(i8 zeroext %__U, <8 x float> %__A) {
+; X32-LABEL: test_mm256_maskz_sqrt_ps:
+; X32: # %bb.0: # %entry
+; X32-NEXT: movb {{[0-9]+}}(%esp), %al
+; X32-NEXT: kmovw %eax, %k1
+; X32-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X32-NEXT: retl
+;
+; X64-LABEL: test_mm256_maskz_sqrt_ps:
+; X64: # %bb.0: # %entry
+; X64-NEXT: kmovw %edi, %k1
+; X64-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z}
+; X64-NEXT: retq
+entry:
+  %0 = tail call <8 x float> @llvm.sqrt.v8f32(<8 x float> %__A) #2
+  %1 = bitcast i8 %__U to <8 x i1>
+  %2 = select <8 x i1> %1, <8 x float> %0, <8 x float> zeroinitializer
+  ret <8 x float> %2
+}
+
+declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
+
Index: test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
+++ test/CodeGen/X86/avx512vl-intrinsics-upgrade.ll
@@ -6140,3 +6140,26 @@
   ret i8 %res2
 }
 
+define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
+; CHECK-LABEL: test_sqrt_pd_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
+  ret <4 x double> %res
+}
+declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
+
+define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
+; CHECK-LABEL: test_sqrt_ps_256:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
+; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
+; CHECK-NEXT: retq ## encoding: [0xc3]
+  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
+  ret <8 x float> %res
+}
+
+declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
+
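
Note: in the VL tests above the i8 mask covers eight lanes, so the 128-bit and 256-bit pd cases (and 128-bit ps) first narrow it with a shufflevector before the select; a condensed sketch of that idiom, with placeholder names:

  %m8 = bitcast i8 %__U to <8 x i1>
  %m2 = shufflevector <8 x i1> %m8, <8 x i1> undef, <2 x i32> <i32 0, i32 1>
  %s = tail call <2 x double> @llvm.sqrt.v2f64(<2 x double> %__A)
  %r = select <2 x i1> %m2, <2 x double> %s, <2 x double> %__W
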
Index: test/CodeGen/X86/avx512vl-intrinsics.ll
===================================================================
--- test/CodeGen/X86/avx512vl-intrinsics.ll
+++ test/CodeGen/X86/avx512vl-intrinsics.ll
@@ -905,29 +905,6 @@
 }
 declare <4 x float> @llvm.x86.sse.min.ps(<4 x float>, <4 x float>)
 
-define <4 x double> @test_sqrt_pd_256(<4 x double> %a0, i8 %mask) {
-; CHECK-LABEL: test_sqrt_pd_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vsqrtpd %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0xfd,0xa9,0x51,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double> %a0, <4 x double> zeroinitializer, i8 %mask)
-  ret <4 x double> %res
-}
-declare <4 x double> @llvm.x86.avx512.mask.sqrt.pd.256(<4 x double>, <4 x double>, i8) nounwind readnone
-
-define <8 x float> @test_sqrt_ps_256(<8 x float> %a0, i8 %mask) {
-; CHECK-LABEL: test_sqrt_ps_256:
-; CHECK: ## %bb.0:
-; CHECK-NEXT: kmovw %edi, %k1 ## encoding: [0xc5,0xf8,0x92,0xcf]
-; CHECK-NEXT: vsqrtps %ymm0, %ymm0 {%k1} {z} ## encoding: [0x62,0xf1,0x7c,0xa9,0x51,0xc0]
-; CHECK-NEXT: retq ## encoding: [0xc3]
-  %res = call <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float> %a0, <8 x float> zeroinitializer, i8 %mask)
-  ret <8 x float> %res
-}
-
-declare <8 x float> @llvm.x86.avx512.mask.sqrt.ps.256(<8 x float>, <8 x float>, i8) nounwind readnone
-
 define <4 x double> @test_getexp_pd_256(<4 x double> %a0) {
 ; CHECK-LABEL: test_getexp_pd_256:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse-intrinsics-fast-isel.ll
@@ -1577,10 +1577,10 @@
 ; X64: # %bb.0:
 ; X64-NEXT: sqrtps %xmm0, %xmm0
 ; X64-NEXT: retq
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
+  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
   ret <4 x float> %res
 }
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) nounwind readnone
 
 define <4 x float> @test_mm_sqrt_ss(<4 x float> %a0) {
 ; X32-LABEL: test_mm_sqrt_ss:
Index: test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse-intrinsics-x86-upgrade.ll
@@ -1,6 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
 
+
+define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
+; CHECK-LABEL: test_x86_sse_sqrt_ps:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sqrtps %xmm0, %xmm0
+; CHECK-NEXT: retl
+  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+  ret <4 x float> %res
+}
+declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
+
+
 define void @test_x86_sse_storeu_ps(i8* %a0, <4 x float> %a1) {
 ; CHECK-LABEL: test_x86_sse_storeu_ps:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse-intrinsics-x86.ll
+++ test/CodeGen/X86/sse-intrinsics-x86.ll
@@ -459,27 +459,6 @@
 declare <4 x float> @llvm.x86.sse.rsqrt.ss(<4 x float>) nounwind readnone
 
 
-define <4 x float> @test_x86_sse_sqrt_ps(<4 x float> %a0) {
-; SSE-LABEL: test_x86_sse_sqrt_ps:
-; SSE: ## %bb.0:
-; SSE-NEXT: sqrtps %xmm0, %xmm0 ## encoding: [0x0f,0x51,0xc0]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse_sqrt_ps:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsqrtps %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0x51,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse_sqrt_ps:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsqrtps %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf8,0x51,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
-  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0) ; <<4 x float>> [#uses=1]
-  ret <4 x float> %res
-}
-declare <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float>) nounwind readnone
-
-
 define <4 x float> @test_x86_sse_sqrt_ss(<4 x float> %a0) {
 ; SSE-LABEL: test_x86_sse_sqrt_ss:
 ; SSE: ## %bb.0:
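
Note: the SSE case is the simplest of all, no masks and no rounding, so the upgrade is a one-for-one rename performed when the module is loaded:

  %res = call <4 x float> @llvm.x86.sse.sqrt.ps(<4 x float> %a0)
  ; upgrades to
  %res = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %a0)
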
Index: test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
+++ test/CodeGen/X86/sse2-intrinsics-fast-isel.ll
@@ -2948,10 +2948,10 @@
 ; X64: # %bb.0:
 ; X64-NEXT: sqrtpd %xmm0, %xmm0
 ; X64-NEXT: retq
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0)
+  %res = call <2 x double> @llvm.sqrt.v2f64(<2 x double> %a0)
   ret <2 x double> %res
 }
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) nounwind readnone
 
 define <2 x double> @test_mm_sqrt_sd(<2 x double> %a0, <2 x double> %a1) nounwind {
 ; X32-LABEL: test_mm_sqrt_sd:
Index: test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86-upgrade.ll
@@ -1,6 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
 
+
+define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
+; CHECK-LABEL: test_x86_sse2_sqrt_pd:
+; CHECK: ## %bb.0:
+; CHECK-NEXT: sqrtpd %xmm0, %xmm0
+; CHECK-NEXT: retl
+  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
+  ret <2 x double> %res
+}
+declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
+
+
 define <2 x i64> @test_x86_sse2_psll_dq_bs(<2 x i64> %a0) {
 ; CHECK-LABEL: test_x86_sse2_psll_dq_bs:
 ; CHECK: ## %bb.0:
Index: test/CodeGen/X86/sse2-intrinsics-x86.ll
===================================================================
--- test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -1585,28 +1585,6 @@
 }
 declare <8 x i16> @llvm.x86.sse2.psubus.w(<8 x i16>, <8 x i16>) nounwind readnone
 
-
-define <2 x double> @test_x86_sse2_sqrt_pd(<2 x double> %a0) {
-; SSE-LABEL: test_x86_sse2_sqrt_pd:
-; SSE: ## %bb.0:
-; SSE-NEXT: sqrtpd %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x51,0xc0]
-; SSE-NEXT: retl ## encoding: [0xc3]
-;
-; AVX2-LABEL: test_x86_sse2_sqrt_pd:
-; AVX2: ## %bb.0:
-; AVX2-NEXT: vsqrtpd %xmm0, %xmm0 ## encoding: [0xc5,0xf9,0x51,0xc0]
-; AVX2-NEXT: retl ## encoding: [0xc3]
-;
-; SKX-LABEL: test_x86_sse2_sqrt_pd:
-; SKX: ## %bb.0:
-; SKX-NEXT: vsqrtpd %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x51,0xc0]
-; SKX-NEXT: retl ## encoding: [0xc3]
-  %res = call <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double> %a0) ; <<2 x double>> [#uses=1]
-  ret <2 x double> %res
-}
-declare <2 x double> @llvm.x86.sse2.sqrt.pd(<2 x double>) nounwind readnone
-
-
 define <2 x double> @test_x86_sse2_sqrt_sd(<2 x double> %a0) {
 ; SSE-LABEL: test_x86_sse2_sqrt_sd:
 ; SSE: ## %bb.0: