Index: clang/docs/LanguageExtensions.rst
===================================================================
--- clang/docs/LanguageExtensions.rst
+++ clang/docs/LanguageExtensions.rst
@@ -643,6 +643,8 @@
  T __builtin_elementwise_bitreverse(T x)     return the integer represented after reversing the bits of x     integer types
  T __builtin_elementwise_exp(T x)            returns the base-e exponential, e^x, of the specified value      floating point types
  T __builtin_elementwise_exp2(T x)           returns the base-2 exponential, 2^x, of the specified value      floating point types
+
+ T __builtin_elementwise_sqrt(T x)           return the square root of a floating-point number                floating point types
  T __builtin_elementwise_roundeven(T x)      round x to the nearest integer value in floating point format,   floating point types
                                              rounding halfway cases to even (that is, to the nearest value
                                              that is an even integer), regardless of the current rounding
Index: clang/include/clang/Basic/Builtins.def
===================================================================
--- clang/include/clang/Basic/Builtins.def
+++ clang/include/clang/Basic/Builtins.def
@@ -694,6 +694,7 @@
 BUILTIN(__builtin_elementwise_rint, "v.", "nct")
 BUILTIN(__builtin_elementwise_nearbyint, "v.", "nct")
 BUILTIN(__builtin_elementwise_sin, "v.", "nct")
+BUILTIN(__builtin_elementwise_sqrt, "v.", "nct")
 BUILTIN(__builtin_elementwise_trunc, "v.", "nct")
 BUILTIN(__builtin_elementwise_canonicalize, "v.", "nct")
 BUILTIN(__builtin_elementwise_copysign, "v.", "nct")
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2544,7 +2544,8 @@
     case Builtin::BI__builtin_sqrtf:
     case Builtin::BI__builtin_sqrtf16:
     case Builtin::BI__builtin_sqrtl:
-    case Builtin::BI__builtin_sqrtf128: {
+    case Builtin::BI__builtin_sqrtf128:
+    case Builtin::BI__builtin_elementwise_sqrt: {
       llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
           *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
       SetSqrtFPAccuracy(Call);
Index: clang/lib/Headers/opencl-c-base.h
===================================================================
--- clang/lib/Headers/opencl-c-base.h
+++ clang/lib/Headers/opencl-c-base.h
@@ -819,6 +819,64 @@
 
 #endif // cl_intel_device_side_avc_motion_estimation
 
+/**
+ * Compute square root.
+ *
+ * The float overloads are inline wrappers around __builtin_elementwise_sqrt so
+ * llvm.sqrt gets !fpmath unless -cl-fp32-correctly-rounded-divide-sqrt is set.
+ * The OpenCL implementation must still provide the external definitions.
+ */
+#define __ovld __attribute__((overloadable))
+#define __cnfn __attribute__((const))
+
+inline float __ovld __cnfn sqrt(float __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+inline float2 __ovld __cnfn sqrt(float2 __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+inline float3 __ovld __cnfn sqrt(float3 __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+inline float4 __ovld __cnfn sqrt(float4 __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+inline float8 __ovld __cnfn sqrt(float8 __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+inline float16 __ovld __cnfn sqrt(float16 __x) {
+  return __builtin_elementwise_sqrt(__x);
+}
+
+// Only the float overloads need inline bodies. The double and half overloads
+// are still declared here because splitting the handling of sqrt's overloads
+// reportedly misbehaves with -fdeclare-opencl-builtins (NOTE: not verified).
+
+#ifdef cl_khr_fp64
+double __ovld __cnfn sqrt(double);
+double2 __ovld __cnfn sqrt(double2);
+double3 __ovld __cnfn sqrt(double3);
+double4 __ovld __cnfn sqrt(double4);
+double8 __ovld __cnfn sqrt(double8);
+double16 __ovld __cnfn sqrt(double16);
+#endif //cl_khr_fp64
+#ifdef cl_khr_fp16
+half __ovld __cnfn sqrt(half);
+half2 __ovld __cnfn sqrt(half2);
+half3 __ovld __cnfn sqrt(half3);
+half4 __ovld __cnfn sqrt(half4);
+half8 __ovld __cnfn sqrt(half8);
+half16 __ovld __cnfn sqrt(half16);
+#endif //cl_khr_fp16
+
+#undef __cnfn
+#undef __ovld
 // Disable any extensions we may have enabled previously.
 #pragma OPENCL EXTENSION all : disable
 
Index: clang/lib/Headers/opencl-c.h
===================================================================
--- clang/lib/Headers/opencl-c.h
+++ clang/lib/Headers/opencl-c.h
@@ -8496,32 +8496,6 @@
 half16 __ovld __cnfn sinpi(half16);
 #endif //cl_khr_fp16
 
-/**
- * Compute square root.
- */
-float __ovld __cnfn sqrt(float);
-float2 __ovld __cnfn sqrt(float2);
-float3 __ovld __cnfn sqrt(float3);
-float4 __ovld __cnfn sqrt(float4);
-float8 __ovld __cnfn sqrt(float8);
-float16 __ovld __cnfn sqrt(float16);
-#ifdef cl_khr_fp64
-double __ovld __cnfn sqrt(double);
-double2 __ovld __cnfn sqrt(double2);
-double3 __ovld __cnfn sqrt(double3);
-double4 __ovld __cnfn sqrt(double4);
-double8 __ovld __cnfn sqrt(double8);
-double16 __ovld __cnfn sqrt(double16);
-#endif //cl_khr_fp64
-#ifdef cl_khr_fp16
-half __ovld __cnfn sqrt(half);
-half2 __ovld __cnfn sqrt(half2);
-half3 __ovld __cnfn sqrt(half3);
-half4 __ovld __cnfn sqrt(half4);
-half8 __ovld __cnfn sqrt(half8);
-half16 __ovld __cnfn sqrt(half16);
-#endif //cl_khr_fp16
-
 /**
  * Compute tangent.
  */
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -2641,6 +2641,7 @@
   case Builtin::BI__builtin_elementwise_rint:
   case Builtin::BI__builtin_elementwise_nearbyint:
   case Builtin::BI__builtin_elementwise_sin:
+  case Builtin::BI__builtin_elementwise_sqrt:
   case Builtin::BI__builtin_elementwise_trunc:
   case Builtin::BI__builtin_elementwise_canonicalize: {
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
Index: clang/test/CodeGen/builtins-elementwise-math.c
===================================================================
--- clang/test/CodeGen/builtins-elementwise-math.c
+++ clang/test/CodeGen/builtins-elementwise-math.c
@@ -588,6 +588,22 @@
   vf2 = __builtin_elementwise_sin(vf1);
 }
 
+void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
+                                  float4 vf1, float4 vf2) {
+  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
+  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
+  // CHECK-NEXT:  call float @llvm.sqrt.f32(float [[F1]])
+  f2 = __builtin_elementwise_sqrt(f1); // scalar float operand
+
+  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
+  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
+  d2 = __builtin_elementwise_sqrt(d1); // scalar double operand
+
+  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
+  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
+  vf2 = __builtin_elementwise_sqrt(vf1); // vector operand: one whole-vector call
+}
+
 void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                     float4 vf1, float4 vf2) {
   // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
Index: clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
===================================================================
--- clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
+++ clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
@@ -177,6 +177,16 @@
   return __builtin_elementwise_sin(a);
 }
 
+// CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_sqrtDv4_f
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float4 strict_elementwise_sqrt(float4 a) {
+  return __builtin_elementwise_sqrt(a); // must lower to the constrained sqrt intrinsic
+}
+
 // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_truncDv4_f
 // CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  entry:
Index: clang/test/CodeGenCUDA/correctly-rounded-div.cu
===================================================================
--- clang/test/CodeGenCUDA/correctly-rounded-div.cu
+++ clang/test/CodeGenCUDA/correctly-rounded-div.cu
@@ -46,4 +46,18 @@
   return __builtin_sqrt(a);
 }
 
+// COMMON-LABEL: @_Z28test_builtin_elementwise_f32f
+// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
+// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
+__device__ float test_builtin_elementwise_f32(float a) {
+  return __builtin_elementwise_sqrt(a); // f32: !fpmath only in the NCRDIV run
+}
+
+// COMMON-LABEL: @_Z28test_builtin_elementwise_f64d
+// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
+// COMMON-NOT: !fpmath
+__device__ double test_builtin_elementwise_f64(double a) {
+  return __builtin_elementwise_sqrt(a); // f64: no !fpmath in any run
+}
+
 // NCRSQRT: ![[MD]] = !{float 2.500000e+00}
Index: clang/test/CodeGenOpenCL/fpmath.cl
===================================================================
--- clang/test/CodeGenOpenCL/fpmath.cl
+++ clang/test/CodeGenOpenCL/fpmath.cl
@@ -28,6 +28,21 @@
   return __builtin_sqrtf(a);
 }
 
+float elementwise_sqrt_f32(float a) {
+  // CHECK-LABEL: @elementwise_sqrt_f32
+  // NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a); // scalar f32: !fpmath only under NODIVOPT
+}
+
+float4 elementwise_sqrt_v4f32(float4 a) {
+  // CHECK-LABEL: @elementwise_sqrt_v4f32
+  // NODIVOPT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a); // vector f32: !fpmath only under NODIVOPT
+}
+
+
 #if __OPENCL_C_VERSION__ >=120
 void printf(constant char* fmt, ...);
 
@@ -61,6 +76,18 @@
   return __builtin_sqrt(a);
 }
 
+double elementwise_sqrt_f64(double a) {
+  // CHECK-LABEL: @elementwise_sqrt_f64
+  // CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a); // scalar f64: the call must end without !fpmath
+}
+
+double4 elementwise_sqrt_v4f64(double4 a) {
+  // CHECK-LABEL: @elementwise_sqrt_v4f64
+  // CHECK: call <4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a); // vector f64: the call must end without !fpmath
+}
+
 #endif
 
 // NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
Index: clang/test/CodeGenOpenCL/sqrt-fpmath.cl
===================================================================
--- /dev/null
+++ clang/test/CodeGenOpenCL/sqrt-fpmath.cl
@@ -0,0 +1,201 @@
+// Test that float variants of sqrt are emitted as available_externally inline
+// definitions that call the sqrt intrinsic with appropriate !fpmath metadata
+// depending on -cl-fp32-correctly-rounded-divide-sqrt
+
+// Test with -fdeclare-opencl-builtins
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s
+
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -fdeclare-opencl-builtins -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s
+
+// Test without -fdeclare-opencl-builtins
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED %s
+
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,DEFAULT-UNSAFE %s
+// RUN: %clang_cc1 -disable-llvm-passes -triple amdgcn-unknown-unknown -finclude-default-header -cl-fp32-correctly-rounded-divide-sqrt -cl-unsafe-math-optimizations -S -emit-llvm -o - %s | FileCheck -check-prefixes=CHECK,CORRECTLYROUNDED-UNSAFE %s
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+// CHECK-LABEL: define {{.*}} float @call_sqrt_f32(
+// CHECK: call {{.*}} float @_Z4sqrtf(float noundef %{{.+}}) #{{[0-9]+$}}
+float call_sqrt_f32(float x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally float @_Z4sqrtf(float noundef %__x)
+// DEFAULT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn float @llvm.sqrt.f32(float %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn float @llvm.sqrt.f32(float %{{.+}}){{$}}
+
+// CHECK-LABEL: define {{.*}} <2 x float> @call_sqrt_v2f32(
+// CHECK: call {{.*}} <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %{{.*}}) #{{[0-9]+$}}
+float2 call_sqrt_v2f32(float2 x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally <2 x float> @_Z4sqrtDv2_f(<2 x float> noundef %__x)
+// DEFAULT: call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <2 x float> @llvm.sqrt.v2f32(<2 x float> %{{.+}}){{$}}
+
+// CHECK-LABEL: define {{.*}} <3 x float> @call_sqrt_v3f32(
+// CHECK: call {{.*}} <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %{{.*}}) #{{[0-9]+$}}
+float3 call_sqrt_v3f32(float3 x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally <3 x float> @_Z4sqrtDv3_f(<3 x float> noundef %__x)
+// DEFAULT: call <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <3 x float> @llvm.sqrt.v3f32(<3 x float> %{{.+}}){{$}}
+
+
+// CHECK-LABEL: define {{.*}} <4 x float> @call_sqrt_v4f32(
+// CHECK: call {{.*}} <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %{{.*}}) #{{[0-9]+$}}
+float4 call_sqrt_v4f32(float4 x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally <4 x float> @_Z4sqrtDv4_f(<4 x float> noundef %__x)
+// DEFAULT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
+
+// CHECK-LABEL: define {{.*}} <8 x float> @call_sqrt_v8f32(
+// CHECK: call {{.*}} <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %{{.*}}) #{{[0-9]+$}}
+float8 call_sqrt_v8f32(float8 x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally <8 x float> @_Z4sqrtDv8_f(<8 x float> noundef %__x)
+// DEFAULT: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <8 x float> @llvm.sqrt.v8f32(<8 x float> %{{.+}}){{$}}
+
+
+// CHECK-LABEL: define {{.*}} <16 x float> @call_sqrt_v16f32(
+// CHECK: call {{.*}} <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %{{.*}}) #{{[0-9]+$}}
+float16 call_sqrt_v16f32(float16 x) {
+  return sqrt(x);
+}
+
+// CHECK-LABEL: define available_externally <16 x float> @_Z4sqrtDv16_f(<16 x float> noundef %__x)
+// DEFAULT: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED: call <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}){{$}}
+
+// DEFAULT-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}), !fpmath [[$FPMATH:![0-9]+]]{{$}}
+// CORRECTLYROUNDED-UNSAFE: call reassoc nsz arcp contract afn <16 x float> @llvm.sqrt.v16f32(<16 x float> %{{.+}}){{$}}
+
+
+// The double overloads are declarations only: no inline definition is emitted.
+// CHECK-LABEL: define {{.*}} double @call_sqrt_f64(
+// CHECK: call {{.*}} double @_Z4sqrtd(double noundef %{{.+}}) #{{[0-9]+$}}
+double call_sqrt_f64(double x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// Likewise no inline definition for the vector double overloads.
+// CHECK-LABEL: define {{.*}} <2 x double> @call_sqrt_v2f64(
+// CHECK: call {{.*}} <2 x double> @_Z4sqrtDv2_d(<2 x double> noundef %{{.+}}) #{{[0-9]+$}}
+double2 call_sqrt_v2f64(double2 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <3 x double> @call_sqrt_v3f64(
+// CHECK: call {{.*}} <3 x double> @_Z4sqrtDv3_d(<3 x double> noundef %{{.+}}) #{{[0-9]+$}}
+double3 call_sqrt_v3f64(double3 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <4 x double> @call_sqrt_v4f64(
+// CHECK: call {{.*}} <4 x double> @_Z4sqrtDv4_d(<4 x double> noundef %{{.+}}) #{{[0-9]+$}}
+double4 call_sqrt_v4f64(double4 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <8 x double> @call_sqrt_v8f64(
+// CHECK: call {{.*}} <8 x double> @_Z4sqrtDv8_d(<8 x double> noundef %{{.+}}) #{{[0-9]+$}}
+double8 call_sqrt_v8f64(double8 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <16 x double> @call_sqrt_v16f64(
+// CHECK: call {{.*}} <16 x double> @_Z4sqrtDv16_d(<16 x double> noundef %{{.+}}) #{{[0-9]+$}}
+double16 call_sqrt_v16f64(double16 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// The half overloads are declarations only: no inline definition is emitted.
+// CHECK-LABEL: define {{.*}} half @call_sqrt_f16(
+// CHECK: call {{.*}} half @_Z4sqrtDh(half noundef %{{.+}}) #{{[0-9]+$}}
+half call_sqrt_f16(half x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <2 x half> @call_sqrt_v2f16(
+// CHECK: call {{.*}} <2 x half> @_Z4sqrtDv2_Dh(<2 x half> noundef %{{.+}}) #{{[0-9]+$}}
+half2 call_sqrt_v2f16(half2 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <3 x half> @call_sqrt_v3f16(
+// CHECK: call {{.*}} <3 x half> @_Z4sqrtDv3_Dh(<3 x half> noundef %{{.+}}) #{{[0-9]+$}}
+half3 call_sqrt_v3f16(half3 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <4 x half> @call_sqrt_v4f16(
+// CHECK: call {{.*}} <4 x half> @_Z4sqrtDv4_Dh(<4 x half> noundef %{{.+}}) #{{[0-9]+$}}
+half4 call_sqrt_v4f16(half4 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <8 x half> @call_sqrt_v8f16(
+// CHECK: call {{.*}} <8 x half> @_Z4sqrtDv8_Dh(<8 x half> noundef %{{.+}}) #{{[0-9]+$}}
+half8 call_sqrt_v8f16(half8 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+
+// CHECK-LABEL: define {{.*}} <16 x half> @call_sqrt_v16f16(
+// CHECK: call {{.*}} <16 x half> @_Z4sqrtDv16_Dh(<16 x half> noundef %{{.+}}) #{{[0-9]+$}}
+half16 call_sqrt_v16f16(half16 x) {
+  return sqrt(x);
+}
+
+// CHECK-NOT: define
+// DEFAULT: [[$FPMATH]] = !{float 3.000000e+00}
+// DEFAULT-UNSAFE: [[$FPMATH]] = !{float 3.000000e+00}
Index: clang/test/Sema/builtins-elementwise-math.c
===================================================================
--- clang/test/Sema/builtins-elementwise-math.c
+++ clang/test/Sema/builtins-elementwise-math.c
@@ -601,6 +601,27 @@
   // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
 }
 
+void test_builtin_elementwise_sqrt(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
+
+  struct Foo s = __builtin_elementwise_sqrt(f); // result has the operand's type
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
+
+  i = __builtin_elementwise_sqrt(); // no argument
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+
+  i = __builtin_elementwise_sqrt(i); // integer operand
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int')}}
+
+  i = __builtin_elementwise_sqrt(f, f); // extra argument
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+
+  u = __builtin_elementwise_sqrt(u); // unsigned operand
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned int')}}
+
+  uv = __builtin_elementwise_sqrt(uv); // unsigned vector operand
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
+}
+
 void test_builtin_elementwise_trunc(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
 
   struct Foo s = __builtin_elementwise_trunc(f);
Index: clang/test/SemaCXX/builtins-elementwise-math.cpp
===================================================================
--- clang/test/SemaCXX/builtins-elementwise-math.cpp
+++ clang/test/SemaCXX/builtins-elementwise-math.cpp
@@ -111,6 +111,13 @@
   static_assert(!is_const<decltype(__builtin_elementwise_sin(b))>::value);
 }
 
+void test_builtin_elementwise_sqrt() {
+  const float a = 42.0; // const-qualified operand
+  float b = 42.3;       // unqualified operand
+  static_assert(!is_const<decltype(__builtin_elementwise_sqrt(a))>::value); // const is not propagated
+  static_assert(!is_const<decltype(__builtin_elementwise_sqrt(b))>::value); // still non-const
+}
+
 void test_builtin_elementwise_log() {
   const float a = 42.0;
   float b = 42.3;