Index: clang/docs/LanguageExtensions.rst
===================================================================
--- clang/docs/LanguageExtensions.rst
+++ clang/docs/LanguageExtensions.rst
@@ -643,6 +643,8 @@
  T __builtin_elementwise_bitreverse(T x)     return the integer represented after reversing the bits of x     integer types
  T __builtin_elementwise_exp(T x)            returns the base-e exponential, e^x, of the specified value      floating point types
  T __builtin_elementwise_exp2(T x)           returns the base-2 exponential, 2^x, of the specified value      floating point types
+
+ T __builtin_elementwise_sqrt(T x)           return the square root of a floating-point number                floating point types
  T __builtin_elementwise_roundeven(T x)      round x to the nearest integer value in floating point format,   floating point types
                                              rounding halfway cases to even (that is, to the nearest value
                                              that is an even integer), regardless of the current rounding
Index: clang/docs/ReleaseNotes.rst
===================================================================
--- clang/docs/ReleaseNotes.rst
+++ clang/docs/ReleaseNotes.rst
@@ -239,6 +239,7 @@
 - Add ``__builtin_set_flt_rounds`` builtin for X86, x86_64, Arm and AArch64 only.
 - Add ``__builtin_elementwise_pow`` builtin for floating point types only.
 - Add ``__builtin_elementwise_bitreverse`` builtin for integer types only.
+- Add ``__builtin_elementwise_sqrt`` builtin for floating point types only.
 
 AST Matchers
 ------------
Index: clang/include/clang/Basic/Builtins.def
===================================================================
--- clang/include/clang/Basic/Builtins.def
+++ clang/include/clang/Basic/Builtins.def
@@ -694,6 +694,7 @@
 BUILTIN(__builtin_elementwise_rint, "v.", "nct")
 BUILTIN(__builtin_elementwise_nearbyint, "v.", "nct")
 BUILTIN(__builtin_elementwise_sin, "v.", "nct")
+BUILTIN(__builtin_elementwise_sqrt, "v.", "nct")
 BUILTIN(__builtin_elementwise_trunc, "v.", "nct")
 BUILTIN(__builtin_elementwise_canonicalize, "v.", "nct")
 BUILTIN(__builtin_elementwise_copysign, "v.", "nct")
Index: clang/lib/CodeGen/CGBuiltin.cpp
===================================================================
--- clang/lib/CodeGen/CGBuiltin.cpp
+++ clang/lib/CodeGen/CGBuiltin.cpp
@@ -2530,7 +2530,8 @@
     case Builtin::BI__builtin_sqrtf:
     case Builtin::BI__builtin_sqrtf16:
     case Builtin::BI__builtin_sqrtl:
-    case Builtin::BI__builtin_sqrtf128: {
+    case Builtin::BI__builtin_sqrtf128:
+    case Builtin::BI__builtin_elementwise_sqrt: {
       llvm::Value *Call = emitUnaryMaybeConstrainedFPBuiltin(
           *this, E, Intrinsic::sqrt, Intrinsic::experimental_constrained_sqrt);
       SetSqrtFPAccuracy(Call);
Index: clang/lib/Sema/SemaChecking.cpp
===================================================================
--- clang/lib/Sema/SemaChecking.cpp
+++ clang/lib/Sema/SemaChecking.cpp
@@ -2642,6 +2642,7 @@
   case Builtin::BI__builtin_elementwise_rint:
   case Builtin::BI__builtin_elementwise_nearbyint:
   case Builtin::BI__builtin_elementwise_sin:
+  case Builtin::BI__builtin_elementwise_sqrt:
   case Builtin::BI__builtin_elementwise_trunc:
   case Builtin::BI__builtin_elementwise_canonicalize: {
     if (PrepareBuiltinElementwiseMathOneArgCall(TheCall))
Index: clang/test/CodeGen/builtins-elementwise-math.c
===================================================================
--- clang/test/CodeGen/builtins-elementwise-math.c
+++ clang/test/CodeGen/builtins-elementwise-math.c
@@ -588,6 +588,22 @@
   vf2 = __builtin_elementwise_sin(vf1);
 }
 
+void test_builtin_elementwise_sqrt(float f1, float f2, double d1, double d2,
+                                  float4 vf1, float4 vf2) {
+  // CHECK-LABEL: define void @test_builtin_elementwise_sqrt(
+  // CHECK:      [[F1:%.+]] = load float, ptr %f1.addr, align 4
+  // CHECK-NEXT:  call float @llvm.sqrt.f32(float [[F1]])
+  f2 = __builtin_elementwise_sqrt(f1);
+
+  // CHECK:      [[D1:%.+]] = load double, ptr %d1.addr, align 8
+  // CHECK-NEXT: call double @llvm.sqrt.f64(double [[D1]])
+  d2 = __builtin_elementwise_sqrt(d1);
+
+  // CHECK:      [[VF1:%.+]] = load <4 x float>, ptr %vf1.addr, align 16
+  // CHECK-NEXT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> [[VF1]])
+  vf2 = __builtin_elementwise_sqrt(vf1);
+}
+
 void test_builtin_elementwise_trunc(float f1, float f2, double d1, double d2,
                                     float4 vf1, float4 vf2) {
   // CHECK-LABEL: define void @test_builtin_elementwise_trunc(
Index: clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
===================================================================
--- clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
+++ clang/test/CodeGen/strictfp-elementwise-bulitins.cpp
@@ -177,6 +177,16 @@
   return __builtin_elementwise_sin(a);
 }
 
+// CHECK-LABEL: define dso_local noundef <4 x float> @_Z23strict_elementwise_sqrtDv4_f
+// CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR0]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = tail call <4 x float> @llvm.experimental.constrained.sqrt.v4f32(<4 x float> [[A]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR4]]
+// CHECK-NEXT:    ret <4 x float> [[TMP0]]
+//
+float4 strict_elementwise_sqrt(float4 a) {
+  return __builtin_elementwise_sqrt(a);
+}
+
 // CHECK-LABEL: define dso_local noundef <4 x float> @_Z24strict_elementwise_truncDv4_f
 // CHECK-SAME: (<4 x float> noundef [[A:%.*]]) local_unnamed_addr #[[ATTR2]] {
 // CHECK-NEXT:  entry:
Index: clang/test/CodeGenCUDA/correctly-rounded-div.cu
===================================================================
--- clang/test/CodeGenCUDA/correctly-rounded-div.cu
+++ clang/test/CodeGenCUDA/correctly-rounded-div.cu
@@ -46,4 +46,18 @@
   return __builtin_sqrt(a);
 }
 
+// COMMON-LABEL: @_Z28test_builtin_elementwise_f32f
+// NCRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD:[0-9]+]]
+// CRDIV: call contract float @llvm.sqrt.f32(float %{{.+}}){{$}}
+__device__ float test_builtin_elementwise_f32(float a) {
+  return __builtin_elementwise_sqrt(a);
+}
+
+// COMMON-LABEL: @_Z28test_builtin_elementwise_f64d
+// COMMON: call contract double @llvm.sqrt.f64(double %{{.+}}){{$}}
+// COMMON-NOT: !fpmath
+__device__ double test_builtin_elementwise_f64(double a) {
+  return __builtin_elementwise_sqrt(a);
+}
+
 // NCRSQRT: ![[MD]] = !{float 2.500000e+00}
Index: clang/test/CodeGenOpenCL/fpmath.cl
===================================================================
--- clang/test/CodeGenOpenCL/fpmath.cl
+++ clang/test/CodeGenOpenCL/fpmath.cl
@@ -28,6 +28,21 @@
   return __builtin_sqrtf(a);
 }
 
+float elementwise_sqrt_f32(float a) {
+  // CHECK-LABEL: @elementwise_sqrt_f32
+  // NODIVOPT: call float @llvm.sqrt.f32(float %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call float @llvm.sqrt.f32(float %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a);
+}
+
+float4 elementwise_sqrt_v4f32(float4 a) {
+  // CHECK-LABEL: @elementwise_sqrt_v4f32
+  // NODIVOPT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}), !fpmath ![[MD_SQRT:[0-9]+]]
+  // DIVOPT: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a);
+}
+
+
 #if __OPENCL_C_VERSION__ >=120
 void printf(constant char* fmt, ...);
 
@@ -61,6 +76,18 @@
   return __builtin_sqrt(a);
 }
 
+double elementwise_sqrt_f64(double a) {
+  // CHECK-LABEL: @elementwise_sqrt_f64
+  // CHECK: call double @llvm.sqrt.f64(double %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a);
+}
+
+double4 elementwise_sqrt_v4f64(double4 a) {
+  // CHECK-LABEL: @elementwise_sqrt_v4f64
+  // CHECK: call <4 x double> @llvm.sqrt.v4f64(<4 x double> %{{.+}}){{$}}
+  return __builtin_elementwise_sqrt(a);
+}
+
 #endif
 
 // NODIVOPT: ![[MD_FDIV]] = !{float 2.500000e+00}
Index: clang/test/Sema/builtins-elementwise-math.c
===================================================================
--- clang/test/Sema/builtins-elementwise-math.c
+++ clang/test/Sema/builtins-elementwise-math.c
@@ -601,6 +601,27 @@
   // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
 }
 
+void test_builtin_elementwise_sqrt(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
+
+  struct Foo s = __builtin_elementwise_sqrt(f);
+  // expected-error@-1 {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
+
+  i = __builtin_elementwise_sqrt();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+
+  i = __builtin_elementwise_sqrt(i);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'int')}}
+
+  i = __builtin_elementwise_sqrt(f, f);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+
+  u = __builtin_elementwise_sqrt(u);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned int')}}
+
+  uv = __builtin_elementwise_sqrt(uv);
+  // expected-error@-1 {{1st argument must be a floating point type (was 'unsigned4' (vector of 4 'unsigned int' values))}}
+}
+
 void test_builtin_elementwise_trunc(int i, float f, double d, float4 v, int3 iv, unsigned u, unsigned4 uv) {
 
   struct Foo s = __builtin_elementwise_trunc(f);
Index: clang/test/SemaCXX/builtins-elementwise-math.cpp
===================================================================
--- clang/test/SemaCXX/builtins-elementwise-math.cpp
+++ clang/test/SemaCXX/builtins-elementwise-math.cpp
@@ -111,6 +111,13 @@
   static_assert(!is_const<decltype(__builtin_elementwise_sin(b))>::value);
 }
 
+void test_builtin_elementwise_sqrt() {
+  const float a = 42.0;
+  float b = 42.3;
+  static_assert(!is_const<decltype(__builtin_elementwise_sqrt(a))>::value);
+  static_assert(!is_const<decltype(__builtin_elementwise_sqrt(b))>::value);
+}
+
 void test_builtin_elementwise_log() {
   const float a = 42.0;
   float b = 42.3;