Index: llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -1062,6 +1062,10 @@
 
   B.SetInsertPointPastAllocas(F);
 
+  DILocation *MergedDebugLoc =
+      DILocation::getMergedLocation(Sin->getDebugLoc(), Cos->getDebugLoc());
+  B.SetCurrentDebugLocation(MergedDebugLoc);
+
   AllocaInst *Alloc = B.CreateAlloca(Sin->getType(), nullptr, "__sincos_");
 
   if (Instruction *ArgInst = dyn_cast<Instruction>(Arg)) {
@@ -1070,6 +1074,7 @@
     // if it's an argument or constant.
 
     B.SetInsertPoint(ArgInst->getParent(), ++ArgInst->getIterator());
+    B.SetCurrentDebugLocation(MergedDebugLoc);
   }
 
   Value *P = Alloc;
@@ -1087,6 +1092,7 @@
 
   CallInst *Call = CreateCallEx2(B, Fsincos, Arg, P);
   LoadInst *Reload = B.CreateLoad(Alloc->getAllocatedType(), Alloc);
+  Reload->setDebugLoc(Cos->getDebugLoc());
 
   LLVM_DEBUG(errs() << "AMDIC: fold_sincos (" << *Sin << ", " << *Cos
                     << ") with " << *Call << '\n');
@@ -1117,7 +1123,6 @@
   CallInst *CI = cast<CallInst>(FPOp);
   bool Changed = false;
 
-  Module *M = CI->getModule();
   FuncInfo PartnerInfo(isSin ? AMDGPULibFunc::EI_COS : AMDGPULibFunc::EI_SIN,
                        fInfo);
   const std::string PairName = PartnerInfo.mangle();
Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
+++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sincos.ll
@@ -1078,12 +1078,12 @@
 ; CHECK-NEXT:    [[__SINCOS_:%.*]] = alloca float, align 4, addrspace(5), !dbg [[DBG12:![0-9]+]]
 ; CHECK-NEXT:    [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[__SINCOS_]] to ptr, !dbg [[DBG12]]
 ; CHECK-NEXT:    [[TMP1:%.*]] = call contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[TMP0]]), !dbg [[DBG12]]
-; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG12]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG13:![0-9]+]]
-; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG14:![0-9]+]]
-; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP2]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG12]]
-; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG15:![0-9]+]]
-; CHECK-NEXT:    ret void, !dbg [[DBG16:![0-9]+]]
+; CHECK-NEXT:    [[TMP2:%.*]] = load float, ptr addrspace(5) [[__SINCOS_]], align 4, !dbg [[DBG13:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP1]], metadata [[META9:![0-9]+]], metadata !DIExpression()), !dbg [[DBG14:![0-9]+]]
+; CHECK-NEXT:    store float [[TMP1]], ptr addrspace(1) [[SIN_OUT]], align 4, !dbg [[DBG15:![0-9]+]]
+; CHECK-NEXT:    call void @llvm.dbg.value(metadata float [[TMP2]], metadata [[META11:![0-9]+]], metadata !DIExpression()), !dbg [[DBG13]]
+; CHECK-NEXT:    store float [[TMP2]], ptr addrspace(1) [[COS_OUT]], align 4, !dbg [[DBG16:![0-9]+]]
+; CHECK-NEXT:    ret void, !dbg [[DBG17:![0-9]+]]
 ;
 entry:
   %call = tail call contract float @_Z3sinf(float %x), !dbg !19
@@ -1100,9 +1100,9 @@
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !17
+; CHECK-NEXT:    [[SIN0:%.*]] = tail call nnan ninf nsz contract float @_Z3sinf(float [[X]]), !fpmath !18
 ; CHECK-NEXT:    store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
-; CHECK-NEXT:    [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !18
+; CHECK-NEXT:    [[SIN1:%.*]] = call nnan contract float @_Z6sincosfPU3AS5f(float [[X]], ptr addrspace(5) [[COS_TMP]]), !fpmath !19
 ; CHECK-NEXT:    [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
 ; CHECK-NEXT:    store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret float [[SIN1]]
@@ -1122,10 +1122,10 @@
 ; CHECK-SAME: (float [[X:%.*]], ptr addrspace(1) nocapture writeonly [[SIN_OUT:%.*]], ptr addrspace(1) nocapture writeonly [[COS_OUT:%.*]]) local_unnamed_addr #[[ATTR3]] {
 ; CHECK-NEXT:  entry:
 ; CHECK-NEXT:    [[COS_TMP:%.*]] = alloca float, align 4, addrspace(5)
-; CHECK-NEXT:    [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !17
+; CHECK-NEXT:    [[SIN0:%.*]] = tail call nsz contract float @_Z3sinf(float [[X]]), !fpmath !18
 ; CHECK-NEXT:    store float [[SIN0]], ptr addrspace(1) [[SIN_OUT]], align 4
 ; CHECK-NEXT:    [[COS_TMP_CAST:%.*]] = addrspacecast ptr addrspace(5) [[COS_TMP]] to ptr
-; CHECK-NEXT:    [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !18
+; CHECK-NEXT:    [[SIN1:%.*]] = call ninf nsz contract float @_Z6sincosfPU3AS0f(float [[X]], ptr [[COS_TMP_CAST]]), !fpmath !19
 ; CHECK-NEXT:    [[COS1:%.*]] = load float, ptr addrspace(5) [[COS_TMP]], align 4
 ; CHECK-NEXT:    store float [[COS1]], ptr addrspace(1) [[COS_OUT]], align 4
 ; CHECK-NEXT:    ret float [[SIN1]]
Index: llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sqrt.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-sqrt.ll
@@ -0,0 +1,457 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -passes=amdgpu-simplifylib %s | FileCheck %s
+
+target datalayout = "e-p:64:64-p1:64:64-p2:32:32-p3:32:32-p4:64:64-p5:32:32-p6:32:32-p7:160:256:256:32-p8:128:128-i64:64-v16:16-v24:32-v32:32-v48:64-v96:128-v192:256-v256:256-v512:512-v1024:1024-v2048:2048-n32:64-S32-A5-G1-ni:7:8"
+
+declare float @_Z4sqrtf(float)
+declare <2 x float> @_Z4sqrtDv2_f(<2 x float>)
+declare <3 x float> @_Z4sqrtDv3_f(<3 x float>)
+declare <4 x float> @_Z4sqrtDv4_f(<4 x float>)
+declare <8 x float> @_Z4sqrtDv8_f(<8 x float>)
+declare <16 x float> @_Z4sqrtDv16_f(<16 x float>)
+
+declare double @_Z4sqrtd(double)
+declare <2 x double> @_Z4sqrtDv2_d(<2 x double>)
+declare <3 x double> @_Z4sqrtDv3_d(<3 x double>)
+declare <4 x double> @_Z4sqrtDv4_d(<4 x double>)
+declare <8 x double> @_Z4sqrtDv8_d(<8 x double>)
+declare <16 x double> @_Z4sqrtDv16_d(<16 x double>)
+
+declare half @_Z4sqrtDh(half)
+declare <2 x half> @_Z4sqrtDv2_Dh(<2 x half>)
+declare <3 x half> @_Z4sqrtDv3_Dh(<3 x half>)
+declare <4 x half> @_Z4sqrtDv4_Dh(<4 x half>)
+declare <8 x half> @_Z4sqrtDv8_Dh(<8 x half>)
+declare <16 x half> @_Z4sqrtDv16_Dh(<16 x half>)
+
+define float @test_sqrt_f32(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_f32
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg), !fpmath !0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_v2f32(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_v2f32
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg), !fpmath !0
+  ret <2 x float> %sqrt
+}
+
+define <3 x float> @test_sqrt_v3f32(<3 x float> %arg) {
+; CHECK-LABEL: define <3 x float> @test_sqrt_v3f32
+; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <3 x float> @_Z4sqrtDv3_f(<3 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <3 x float> [[SQRT]]
+;
+  %sqrt = tail call <3 x float> @_Z4sqrtDv3_f(<3 x float> %arg), !fpmath !0
+  ret <3 x float> %sqrt
+}
+
+define <4 x float> @test_sqrt_v4f32(<4 x float> %arg) {
+; CHECK-LABEL: define <4 x float> @test_sqrt_v4f32
+; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <4 x float> @_Z4sqrtDv4_f(<4 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <4 x float> [[SQRT]]
+;
+  %sqrt = tail call <4 x float> @_Z4sqrtDv4_f(<4 x float> %arg), !fpmath !0
+  ret <4 x float> %sqrt
+}
+
+define <8 x float> @test_sqrt_v8f32(<8 x float> %arg) {
+; CHECK-LABEL: define <8 x float> @test_sqrt_v8f32
+; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <8 x float> @_Z4sqrtDv8_f(<8 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <8 x float> [[SQRT]]
+;
+  %sqrt = tail call <8 x float> @_Z4sqrtDv8_f(<8 x float> %arg), !fpmath !0
+  ret <8 x float> %sqrt
+}
+
+define <16 x float> @test_sqrt_v16f32(<16 x float> %arg) {
+; CHECK-LABEL: define <16 x float> @test_sqrt_v16f32
+; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <16 x float> @_Z4sqrtDv16_f(<16 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <16 x float> [[SQRT]]
+;
+  %sqrt = tail call <16 x float> @_Z4sqrtDv16_f(<16 x float> %arg), !fpmath !0
+  ret <16 x float> %sqrt
+}
+
+define float @test_sqrt_cr_f32(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_cr_f32
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]])
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg)
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_cr_v2f32(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_cr_v2f32
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]])
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg)
+  ret <2 x float> %sqrt
+}
+
+define <3 x float> @test_sqrt_cr_v3f32(<3 x float> %arg) {
+; CHECK-LABEL: define <3 x float> @test_sqrt_cr_v3f32
+; CHECK-SAME: (<3 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <3 x float> @_Z4sqrtDv3_f(<3 x float> [[ARG]])
+; CHECK-NEXT:    ret <3 x float> [[SQRT]]
+;
+  %sqrt = tail call <3 x float> @_Z4sqrtDv3_f(<3 x float> %arg)
+  ret <3 x float> %sqrt
+}
+
+define <4 x float> @test_sqrt_cr_v4f32(<4 x float> %arg) {
+; CHECK-LABEL: define <4 x float> @test_sqrt_cr_v4f32
+; CHECK-SAME: (<4 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <4 x float> @_Z4sqrtDv4_f(<4 x float> [[ARG]])
+; CHECK-NEXT:    ret <4 x float> [[SQRT]]
+;
+  %sqrt = tail call <4 x float> @_Z4sqrtDv4_f(<4 x float> %arg)
+  ret <4 x float> %sqrt
+}
+
+define <8 x float> @test_sqrt_cr_v8f32(<8 x float> %arg) {
+; CHECK-LABEL: define <8 x float> @test_sqrt_cr_v8f32
+; CHECK-SAME: (<8 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <8 x float> @_Z4sqrtDv8_f(<8 x float> [[ARG]])
+; CHECK-NEXT:    ret <8 x float> [[SQRT]]
+;
+  %sqrt = tail call <8 x float> @_Z4sqrtDv8_f(<8 x float> %arg)
+  ret <8 x float> %sqrt
+}
+
+define <16 x float> @test_sqrt_cr_v16f32(<16 x float> %arg) {
+; CHECK-LABEL: define <16 x float> @test_sqrt_cr_v16f32
+; CHECK-SAME: (<16 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <16 x float> @_Z4sqrtDv16_f(<16 x float> [[ARG]])
+; CHECK-NEXT:    ret <16 x float> [[SQRT]]
+;
+  %sqrt = tail call <16 x float> @_Z4sqrtDv16_f(<16 x float> %arg)
+  ret <16 x float> %sqrt
+}
+
+define double @test_sqrt_f64(double %arg) {
+; CHECK-LABEL: define double @test_sqrt_f64
+; CHECK-SAME: (double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call double @_Z4sqrtd(double [[ARG]])
+; CHECK-NEXT:    ret double [[SQRT]]
+;
+  %sqrt = tail call double @_Z4sqrtd(double %arg)
+  ret double %sqrt
+}
+
+define <2 x double> @test_sqrt_v2f64(<2 x double> %arg) {
+; CHECK-LABEL: define <2 x double> @test_sqrt_v2f64
+; CHECK-SAME: (<2 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x double> @_Z4sqrtDv2_d(<2 x double> [[ARG]])
+; CHECK-NEXT:    ret <2 x double> [[SQRT]]
+;
+  %sqrt = tail call <2 x double> @_Z4sqrtDv2_d(<2 x double> %arg)
+  ret <2 x double> %sqrt
+}
+
+define <3 x double> @test_sqrt_v3f64(<3 x double> %arg) {
+; CHECK-LABEL: define <3 x double> @test_sqrt_v3f64
+; CHECK-SAME: (<3 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <3 x double> @_Z4sqrtDv3_d(<3 x double> [[ARG]])
+; CHECK-NEXT:    ret <3 x double> [[SQRT]]
+;
+  %sqrt = tail call <3 x double> @_Z4sqrtDv3_d(<3 x double> %arg)
+  ret <3 x double> %sqrt
+}
+
+define <4 x double> @test_sqrt_v4f64(<4 x double> %arg) {
+; CHECK-LABEL: define <4 x double> @test_sqrt_v4f64
+; CHECK-SAME: (<4 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <4 x double> @_Z4sqrtDv4_d(<4 x double> [[ARG]])
+; CHECK-NEXT:    ret <4 x double> [[SQRT]]
+;
+  %sqrt = tail call <4 x double> @_Z4sqrtDv4_d(<4 x double> %arg)
+  ret <4 x double> %sqrt
+}
+
+define <8 x double> @test_sqrt_v8f64(<8 x double> %arg) {
+; CHECK-LABEL: define <8 x double> @test_sqrt_v8f64
+; CHECK-SAME: (<8 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <8 x double> @_Z4sqrtDv8_d(<8 x double> [[ARG]])
+; CHECK-NEXT:    ret <8 x double> [[SQRT]]
+;
+  %sqrt = tail call <8 x double> @_Z4sqrtDv8_d(<8 x double> %arg)
+  ret <8 x double> %sqrt
+}
+
+define <16 x double> @test_sqrt_v16f64(<16 x double> %arg) {
+; CHECK-LABEL: define <16 x double> @test_sqrt_v16f64
+; CHECK-SAME: (<16 x double> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <16 x double> @_Z4sqrtDv16_d(<16 x double> [[ARG]])
+; CHECK-NEXT:    ret <16 x double> [[SQRT]]
+;
+  %sqrt = tail call <16 x double> @_Z4sqrtDv16_d(<16 x double> %arg)
+  ret <16 x double> %sqrt
+}
+
+define half @test_sqrt_f16(half %arg) {
+; CHECK-LABEL: define half @test_sqrt_f16
+; CHECK-SAME: (half [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call half @_Z4sqrtDh(half [[ARG]])
+; CHECK-NEXT:    ret half [[SQRT]]
+;
+  %sqrt = tail call half @_Z4sqrtDh(half %arg)
+  ret half %sqrt
+}
+
+define <2 x half> @test_sqrt_v2f16(<2 x half> %arg) {
+; CHECK-LABEL: define <2 x half> @test_sqrt_v2f16
+; CHECK-SAME: (<2 x half> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> [[ARG]])
+; CHECK-NEXT:    ret <2 x half> [[SQRT]]
+;
+  %sqrt = tail call <2 x half> @_Z4sqrtDv2_Dh(<2 x half> %arg)
+  ret <2 x half> %sqrt
+}
+
+define <3 x half> @test_sqrt_v3f16(<3 x half> %arg) {
+; CHECK-LABEL: define <3 x half> @test_sqrt_v3f16
+; CHECK-SAME: (<3 x half> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <3 x half> @_Z4sqrtDv3_Dh(<3 x half> [[ARG]])
+; CHECK-NEXT:    ret <3 x half> [[SQRT]]
+;
+  %sqrt = tail call <3 x half> @_Z4sqrtDv3_Dh(<3 x half> %arg)
+  ret <3 x half> %sqrt
+}
+
+define <4 x half> @test_sqrt_v4f16(<4 x half> %arg) {
+; CHECK-LABEL: define <4 x half> @test_sqrt_v4f16
+; CHECK-SAME: (<4 x half> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <4 x half> @_Z4sqrtDv4_Dh(<4 x half> [[ARG]])
+; CHECK-NEXT:    ret <4 x half> [[SQRT]]
+;
+  %sqrt = tail call <4 x half> @_Z4sqrtDv4_Dh(<4 x half> %arg)
+  ret <4 x half> %sqrt
+}
+
+define <8 x half> @test_sqrt_v8f16(<8 x half> %arg) {
+; CHECK-LABEL: define <8 x half> @test_sqrt_v8f16
+; CHECK-SAME: (<8 x half> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <8 x half> @_Z4sqrtDv8_Dh(<8 x half> [[ARG]])
+; CHECK-NEXT:    ret <8 x half> [[SQRT]]
+;
+  %sqrt = tail call <8 x half> @_Z4sqrtDv8_Dh(<8 x half> %arg)
+  ret <8 x half> %sqrt
+}
+
+define <16 x half> @test_sqrt_v16f16(<16 x half> %arg) {
+; CHECK-LABEL: define <16 x half> @test_sqrt_v16f16
+; CHECK-SAME: (<16 x half> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <16 x half> @_Z4sqrtDv16_Dh(<16 x half> [[ARG]])
+; CHECK-NEXT:    ret <16 x half> [[SQRT]]
+;
+  %sqrt = tail call <16 x half> @_Z4sqrtDv16_Dh(<16 x half> %arg)
+  ret <16 x half> %sqrt
+}
+
+define float @test_sqrt_f32_nobuiltin_callsite(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_f32_nobuiltin_callsite
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]]) #[[ATTR2:[0-9]+]], !fpmath !0
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg) #0, !fpmath !0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_v2f32_nobuiltin_callsite(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_v2f32_nobuiltin_callsite
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]) #[[ATTR2]], !fpmath !0
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg) #0, !fpmath !0
+  ret <2 x float> %sqrt
+}
+
+define float @test_sqrt_cr_f32_nobuiltin_callsite(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_cr_f32_nobuiltin_callsite
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]]) #[[ATTR2]]
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg) #0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_cr_v2f32_nobuiltin_callsite(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_cr_v2f32_nobuiltin_callsite
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]) #[[ATTR2]]
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg) #0
+  ret <2 x float> %sqrt
+}
+
+; The "no-builtins" function attribute should be ignored.
+define float @test_sqrt_f32_nobuiltins(float %arg) #1 {
+; CHECK-LABEL: define float @test_sqrt_f32_nobuiltins
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]]) #[[ATTR2]], !fpmath !0
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg) #0, !fpmath !0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_v2f32_nobuiltins(<2 x float> %arg) #1 {
+; CHECK-LABEL: define <2 x float> @test_sqrt_v2f32_nobuiltins
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]) #[[ATTR2]], !fpmath !0
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg) #0, !fpmath !0
+  ret <2 x float> %sqrt
+}
+
+define float @test_sqrt_cr_f32_nobuiltins(float %arg) #1 {
+; CHECK-LABEL: define float @test_sqrt_cr_f32_nobuiltins
+; CHECK-SAME: (float [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @_Z4sqrtf(float [[ARG]]) #[[ATTR2]]
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @_Z4sqrtf(float %arg) #0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_cr_v2f32_nobuiltins(<2 x float> %arg) #1 {
+; CHECK-LABEL: define <2 x float> @test_sqrt_cr_v2f32_nobuiltins
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]) #[[ATTR2]]
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg) #0
+  ret <2 x float> %sqrt
+}
+
+define float @test_sqrt_f32_preserve_flags(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_f32_preserve_flags
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call nnan ninf float @_Z4sqrtf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call nnan ninf float @_Z4sqrtf(float %arg), !fpmath !0
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_v2f32_preserve_flags(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_v2f32_preserve_flags
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call nnan nsz contract <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call contract nsz nnan <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg), !fpmath !0
+  ret <2 x float> %sqrt
+}
+
+define float @test_sqrt_f32_preserve_flags_md(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_f32_preserve_flags_md
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call nnan ninf float @_Z4sqrtf(float [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call nnan ninf float @_Z4sqrtf(float %arg), !fpmath !0, !foo !1
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_v2f32_preserve_flags_md(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_v2f32_preserve_flags_md
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call nnan nsz contract <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]]), !fpmath !0, !foo !1
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call contract nsz nnan <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg), !fpmath !0, !foo !1
+  ret <2 x float> %sqrt
+}
+
+define float @test_sqrt_cr_f32_preserve_flags(float %arg) {
+; CHECK-LABEL: define float @test_sqrt_cr_f32_preserve_flags
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call ninf contract float @_Z4sqrtf(float [[ARG]])
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call ninf contract float @_Z4sqrtf(float %arg)
+  ret float %sqrt
+}
+
+define <2 x float> @test_sqrt_cr_v2f32_preserve_flags(<2 x float> %arg) {
+; CHECK-LABEL: define <2 x float> @test_sqrt_cr_v2f32_preserve_flags
+; CHECK-SAME: (<2 x float> [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call nnan nsz <2 x float> @_Z4sqrtDv2_f(<2 x float> [[ARG]])
+; CHECK-NEXT:    ret <2 x float> [[SQRT]]
+;
+  %sqrt = tail call nnan nsz <2 x float> @_Z4sqrtDv2_f(<2 x float> %arg)
+  ret <2 x float> %sqrt
+}
+
+; Test the libm names, which are not recognized OpenCL builtins.
+declare float @sqrtf(float) #2
+declare double @sqrt(double) #2
+
+define float @test_libm_sqrt_f32(float %arg) {
+; CHECK-LABEL: define float @test_libm_sqrt_f32
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @sqrtf(float [[ARG]])
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @sqrtf(float %arg)
+  ret float %sqrt
+}
+
+define float @test_libm_sqrt_f32_fpmath(float %arg) {
+; CHECK-LABEL: define float @test_libm_sqrt_f32_fpmath
+; CHECK-SAME: (float [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call float @sqrtf(float [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret float [[SQRT]]
+;
+  %sqrt = tail call float @sqrtf(float %arg), !fpmath !0
+  ret float %sqrt
+}
+
+define double @test_libm_sqrt_f64(double %arg) {
+; CHECK-LABEL: define double @test_libm_sqrt_f64
+; CHECK-SAME: (double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call double @sqrt(double [[ARG]])
+; CHECK-NEXT:    ret double [[SQRT]]
+;
+  %sqrt = tail call double @sqrt(double %arg)
+  ret double %sqrt
+}
+
+define double @test_libm_sqrt_f64_fpmath(double %arg) {
+; CHECK-LABEL: define double @test_libm_sqrt_f64_fpmath
+; CHECK-SAME: (double [[ARG:%.*]]) {
+; CHECK-NEXT:    [[SQRT:%.*]] = tail call double @sqrt(double [[ARG]]), !fpmath !0
+; CHECK-NEXT:    ret double [[SQRT]]
+;
+  %sqrt = tail call double @sqrt(double %arg), !fpmath !0
+  ret double %sqrt
+}
+
+attributes #0 = { nobuiltin }
+attributes #1 = { "no-builtins" }
+attributes #2 = { nounwind memory(none) }
+
+!0 = !{float 3.000000e+00}
+!1 = !{i32 1234}