diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -21188,6 +21188,12 @@
   if (Enabled == TLI.ReciprocalEstimate::Disabled)
     return SDValue();
 
+  const TargetOptions &Options = DAG.getTarget().Options;
+
+  // Using rsqrt to estimate sqrt would produce a wrong result for +Inf.
+  if (!Reciprocal && !Options.NoInfsFPMath && !Flags.hasNoInfs())
+    return SDValue();
+
   // Estimates may be explicitly enabled for this type with a custom number of
   // refinement steps.
   int Iterations = TLI.getSqrtRefinementSteps(VT, MF);
diff --git a/llvm/test/CodeGen/NVPTX/fast-math.ll b/llvm/test/CodeGen/NVPTX/fast-math.ll
--- a/llvm/test/CodeGen/NVPTX/fast-math.ll
+++ b/llvm/test/CodeGen/NVPTX/fast-math.ll
@@ -13,7 +13,7 @@
 }
 
 ; CHECK-LABEL: sqrt_div_fast(
-; CHECK: sqrt.approx.f32
+; CHECK: sqrt.rn.f32
 ; CHECK: div.approx.f32
 define float @sqrt_div_fast(float %a, float %b) #0 {
   %t1 = tail call float @llvm.sqrt.f32(float %a)
@@ -21,6 +21,15 @@
   ret float %t2
 }
 
+; CHECK-LABEL: sqrt_div_fast_ninf(
+; CHECK: sqrt.approx.f32
+; CHECK: div.approx.f32
+define float @sqrt_div_fast_ninf(float %a, float %b) #0 {
+  %t1 = tail call ninf float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv float %t1, %b
+  ret float %t2
+}
+
 ; CHECK-LABEL: sqrt_div_ftz(
 ; CHECK: sqrt.rn.ftz.f32
 ; CHECK: div.rn.ftz.f32
@@ -31,7 +40,7 @@
 }
 
 ; CHECK-LABEL: sqrt_div_fast_ftz(
-; CHECK: sqrt.approx.ftz.f32
+; CHECK: sqrt.rn.ftz.f32
 ; CHECK: div.approx.ftz.f32
 define float @sqrt_div_fast_ftz(float %a, float %b) #0 #1 {
   %t1 = tail call float @llvm.sqrt.f32(float %a)
@@ -39,12 +48,20 @@
   ret float %t2
 }
 
+; CHECK-LABEL: sqrt_div_fast_ftz_ninf(
+; CHECK: sqrt.approx.ftz.f32
+; CHECK: div.approx.ftz.f32
+define float @sqrt_div_fast_ftz_ninf(float %a, float %b) #0 #1 {
+  %t1 = tail call ninf float @llvm.sqrt.f32(float %a)
+  %t2 = fdiv float %t1, %b
+  ret float %t2
+}
+
 ; There are no fast-math or ftz versions of sqrt and div for f64. We use
 ; reciprocal(rsqrt(x)) for sqrt(x), and emit a vanilla divide.
 
 ; CHECK-LABEL: sqrt_div_fast_ftz_f64(
-; CHECK: rsqrt.approx.f64
-; CHECK: rcp.approx.ftz.f64
+; CHECK: sqrt.rn.f64
 ; CHECK: div.rn.f64
 define double @sqrt_div_fast_ftz_f64(double %a, double %b) #0 #1 {
   %t1 = tail call double @llvm.sqrt.f64(double %a)
@@ -52,6 +69,16 @@
   ret double %t2
 }
 
+; CHECK-LABEL: sqrt_div_fast_ftz_f64_ninf(
+; CHECK: rsqrt.approx.f64
+; CHECK: rcp.approx.ftz.f64
+; CHECK: div.rn.f64
+define double @sqrt_div_fast_ftz_f64_ninf(double %a, double %b) #0 #1 {
+  %t1 = tail call ninf double @llvm.sqrt.f64(double %a)
+  %t2 = fdiv double %t1, %b
+  ret double %t2
+}
+
 ; CHECK-LABEL: rsqrt(
 ; CHECK-NOT: rsqrt.approx
 ; CHECK: sqrt.rn.f32
diff --git a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
--- a/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
+++ b/llvm/test/CodeGen/NVPTX/sqrt-approx.ll
@@ -45,35 +45,63 @@
 ; CHECK-LABEL test_sqrt32
 define float @test_sqrt32(float %a) #0 {
-; CHECK: sqrt.approx.f32
+; CHECK: sqrt.rn.f32
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
+; CHECK-LABEL test_sqrt32_ninf
+define float @test_sqrt32_ninf(float %a) #0 {
+; CHECK: sqrt.approx.f32
+  %ret = tail call ninf float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
 ; CHECK-LABEL test_sqrt_ftz
 define float @test_sqrt_ftz(float %a) #0 #1 {
-; CHECK: sqrt.approx.ftz.f32
+; CHECK: sqrt.rn.ftz.f32
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
+; CHECK-LABEL test_sqrt_ftz_ninf
+define float @test_sqrt_ftz_ninf(float %a) #0 #1 {
+; CHECK: sqrt.approx.ftz.f32
+  %ret = tail call ninf float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
 ; CHECK-LABEL test_sqrt64
 define double @test_sqrt64(double %a) #0 {
+; CHECK: sqrt.rn.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+; CHECK-LABEL test_sqrt64_ninf
+define double @test_sqrt64_ninf(double %a) #0 {
 ; There's no sqrt.approx.f64 instruction; we emit
 ; reciprocal(rsqrt.approx.f64(x)). There's no non-ftz approximate reciprocal,
 ; so we just use the ftz version.
 ; CHECK: rsqrt.approx.f64
 ; CHECK: rcp.approx.ftz.f64
-  %ret = tail call double @llvm.sqrt.f64(double %a)
+  %ret = tail call ninf double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
 ; CHECK-LABEL test_sqrt64_ftz
 define double @test_sqrt64_ftz(double %a) #0 #1 {
+; CHECK: sqrt.rn.f64
+  %ret = tail call double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
+; CHECK-LABEL test_sqrt64_ftz_ninf
+define double @test_sqrt64_ftz_ninf(double %a) #0 #1 {
 ; There's no sqrt.approx.ftz.f64 instruction; we just use the non-ftz version.
 ; CHECK: rsqrt.approx.f64
 ; CHECK: rcp.approx.ftz.f64
-  %ret = tail call double @llvm.sqrt.f64(double %a)
+  %ret = tail call ninf double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
@@ -92,11 +120,18 @@
 ; CHECK-LABEL: test_sqrt32_refined
 define float @test_sqrt32_refined(float %a) #0 #2 {
-; CHECK: rsqrt.approx.f32
+; CHECK: sqrt.rn.f32
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
+; CHECK-LABEL: test_sqrt32_refined_ninf
+define float @test_sqrt32_refined_ninf(float %a) #0 #2 {
+; CHECK: rsqrt.approx.f32
+  %ret = tail call ninf float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
 ; CHECK-LABEL: test_rsqrt64_refined
 define double @test_rsqrt64_refined(double %a) #0 #2 {
 ; CHECK: rsqrt.approx.f64
@@ -107,11 +142,18 @@
 ; CHECK-LABEL: test_sqrt64_refined
 define double @test_sqrt64_refined(double %a) #0 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK: sqrt.rn.f64
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
+; CHECK-LABEL: test_sqrt64_refined_ninf
+define double @test_sqrt64_refined_ninf(double %a) #0 #2 {
+; CHECK: rsqrt.approx.f64
+  %ret = tail call ninf double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
 ; -- refined sqrt and rsqrt with ftz enabled --
 
 ; CHECK-LABEL: test_rsqrt32_refined_ftz
@@ -124,11 +166,18 @@
 ; CHECK-LABEL: test_sqrt32_refined_ftz
 define float @test_sqrt32_refined_ftz(float %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.ftz.f32
+; CHECK: sqrt.rn.ftz.f32
   %ret = tail call float @llvm.sqrt.f32(float %a)
   ret float %ret
 }
 
+; CHECK-LABEL: test_sqrt32_refined_ftz_ninf
+define float @test_sqrt32_refined_ftz_ninf(float %a) #0 #1 #2 {
+; CHECK: rsqrt.approx.ftz.f32
+  %ret = tail call ninf float @llvm.sqrt.f32(float %a)
+  ret float %ret
+}
+
 ; CHECK-LABEL: test_rsqrt64_refined_ftz
 define double @test_rsqrt64_refined_ftz(double %a) #0 #1 #2 {
 ; There's no rsqrt.approx.ftz.f64, so we just use the non-ftz version.
@@ -140,11 +189,18 @@
 ; CHECK-LABEL: test_sqrt64_refined_ftz
 define double @test_sqrt64_refined_ftz(double %a) #0 #1 #2 {
-; CHECK: rsqrt.approx.f64
+; CHECK: sqrt.rn.f64
   %ret = tail call double @llvm.sqrt.f64(double %a)
   ret double %ret
 }
 
+; CHECK-LABEL: test_sqrt64_refined_ftz_ninf
+define double @test_sqrt64_refined_ftz_ninf(double %a) #0 #1 #2 {
+; CHECK: rsqrt.approx.f64
+  %ret = tail call ninf double @llvm.sqrt.f64(double %a)
+  ret double %ret
+}
+
 attributes #0 = { "unsafe-fp-math" = "true" }
 attributes #1 = { "denormal-fp-math-f32" = "preserve-sign,preserve-sign" }
 attributes #2 = { "reciprocal-estimates" = "rsqrtf:1,rsqrtd:1,sqrtf:1,sqrtd:1" }
diff --git a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
--- a/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
+++ b/llvm/test/CodeGen/PowerPC/fmf-propagation.ll
@@ -270,11 +270,11 @@
 ; Reduced precision for sqrt is allowed - should use estimate and NR iterations.
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_ieee:'
-; FMFDEBUG: fmul afn {{t[0-9]+}}
+; FMFDEBUG: fmul ninf afn {{t[0-9]+}}
 ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_ieee:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_ieee:'
-; GLOBALDEBUG: fmul afn {{t[0-9]+}}
+; GLOBALDEBUG: fmul ninf afn {{t[0-9]+}}
 ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_ieee:'
 
 define float @sqrt_afn_ieee(float %x) #0 {
@@ -321,17 +321,31 @@
 ; GLOBAL-NEXT: xsmulsp 0, 0, 2
 ; GLOBAL-NEXT: .LBB10_2:
 ; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: blr
+  %rt = call afn ninf float @llvm.sqrt.f32(float %x)
+  ret float %rt
+}
+
+define float @sqrt_afn_ieee_inf(float %x) #0 {
+; FMF-LABEL: sqrt_afn_ieee_inf:
+; FMF: # %bb.0:
+; FMF-NEXT: xssqrtsp 1, 1
+; FMF-NEXT: blr
+;
+; GLOBAL-LABEL: sqrt_afn_ieee_inf:
+; GLOBAL: # %bb.0:
+; GLOBAL-NEXT: xssqrtsp 1, 1
 ; GLOBAL-NEXT: blr
   %rt = call afn float @llvm.sqrt.f32(float %x)
   ret float %rt
 }
 
 ; FMFDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_preserve_sign:'
-; FMFDEBUG: fmul afn {{t[0-9]+}}
+; FMFDEBUG: fmul ninf afn {{t[0-9]+}}
 ; FMFDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_preserve_sign:'
 
 ; GLOBALDEBUG-LABEL: Optimized lowered selection DAG: %bb.0 'sqrt_afn_preserve_sign:'
-; GLOBALDEBUG: fmul afn {{t[0-9]+}}
+; GLOBALDEBUG: fmul ninf afn {{t[0-9]+}}
 ; GLOBALDEBUG: Type-legalized selection DAG: %bb.0 'sqrt_afn_preserve_sign:'
 
 define float @sqrt_afn_preserve_sign(float %x) #1 {
@@ -339,19 +353,19 @@
 ; FMF: # %bb.0:
 ; FMF-NEXT: xxlxor 0, 0, 0
 ; FMF-NEXT: fcmpu 0, 1, 0
-; FMF-NEXT: beq 0, .LBB11_2
+; FMF-NEXT: beq 0, .LBB12_2
 ; FMF-NEXT: # %bb.1:
 ; FMF-NEXT: xsrsqrtesp 0, 1
-; FMF-NEXT: addis 3, 2, .LCPI11_0@toc@ha
-; FMF-NEXT: addis 4, 2, .LCPI11_1@toc@ha
-; FMF-NEXT: lfs 2, .LCPI11_0@toc@l(3)
-; FMF-NEXT: lfs 3, .LCPI11_1@toc@l(4)
+; FMF-NEXT: addis 3, 2, .LCPI12_0@toc@ha
+; FMF-NEXT: addis 4, 2, .LCPI12_1@toc@ha
+; FMF-NEXT: lfs 2, .LCPI12_0@toc@l(3)
+; FMF-NEXT: lfs 3, .LCPI12_1@toc@l(4)
 ; FMF-NEXT: xsmulsp 1, 1, 0
 ; FMF-NEXT: xsmulsp 0, 1, 0
 ; FMF-NEXT: xsmulsp 1, 1, 2
 ; FMF-NEXT: xsaddsp 0, 0, 3
 ; FMF-NEXT: xsmulsp 0, 1, 0
-; FMF-NEXT: .LBB11_2:
+; FMF-NEXT: .LBB12_2:
 ; FMF-NEXT: fmr 1, 0
 ; FMF-NEXT: blr
 ;
@@ -359,19 +373,33 @@
 ; GLOBAL: # %bb.0:
 ; GLOBAL-NEXT: xxlxor 0, 0, 0
 ; GLOBAL-NEXT: fcmpu 0, 1, 0
-; GLOBAL-NEXT: beq 0, .LBB11_2
+; GLOBAL-NEXT: beq 0, .LBB12_2
 ; GLOBAL-NEXT: # %bb.1:
 ; GLOBAL-NEXT: xsrsqrtesp 0, 1
-; GLOBAL-NEXT: addis 3, 2, .LCPI11_0@toc@ha
-; GLOBAL-NEXT: addis 4, 2, .LCPI11_1@toc@ha
-; GLOBAL-NEXT: lfs 2, .LCPI11_0@toc@l(3)
-; GLOBAL-NEXT: lfs 3, .LCPI11_1@toc@l(4)
+; GLOBAL-NEXT: addis 3, 2, .LCPI12_0@toc@ha
+; GLOBAL-NEXT: addis 4, 2, .LCPI12_1@toc@ha
+; GLOBAL-NEXT: lfs 2, .LCPI12_0@toc@l(3)
+; GLOBAL-NEXT: lfs 3, .LCPI12_1@toc@l(4)
 ; GLOBAL-NEXT: xsmulsp 1, 1, 0
 ; GLOBAL-NEXT: xsmaddasp 2, 1, 0
 ; GLOBAL-NEXT: xsmulsp 0, 1, 3
 ; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB11_2:
+; GLOBAL-NEXT: .LBB12_2:
 ; GLOBAL-NEXT: fmr 1, 0
+; GLOBAL-NEXT: blr
+  %rt = call afn ninf float @llvm.sqrt.f32(float %x)
+  ret float %rt
+}
+
+define float @sqrt_afn_preserve_sign_inf(float %x) #1 {
+; FMF-LABEL: sqrt_afn_preserve_sign_inf:
+; FMF: # %bb.0:
+; FMF-NEXT: xssqrtsp 1, 1
+; FMF-NEXT: blr
+;
+; GLOBAL-LABEL: sqrt_afn_preserve_sign_inf:
+; GLOBAL: # %bb.0:
+; GLOBAL-NEXT: xssqrtsp 1, 1
 ; GLOBAL-NEXT: blr
   %rt = call afn float @llvm.sqrt.f32(float %x)
   ret float %rt
@@ -390,45 +418,45 @@
 define float @sqrt_fast_ieee(float %x) #0 {
 ; FMF-LABEL: sqrt_fast_ieee:
 ; FMF: # %bb.0:
-; FMF-NEXT: addis 3, 2, .LCPI12_2@toc@ha
+; FMF-NEXT: addis 3, 2, .LCPI14_2@toc@ha
 ; FMF-NEXT: fabs 0, 1
-; FMF-NEXT: lfs 2, .LCPI12_2@toc@l(3)
+; FMF-NEXT: lfs 2, .LCPI14_2@toc@l(3)
 ; FMF-NEXT: fcmpu 0, 0, 2
 ; FMF-NEXT: xxlxor 0, 0, 0
-; FMF-NEXT: blt 0, .LBB12_2
+; FMF-NEXT: blt 0, .LBB14_2
 ; FMF-NEXT: # %bb.1:
 ; FMF-NEXT: xsrsqrtesp 0, 1
-; FMF-NEXT: addis 3, 2, .LCPI12_0@toc@ha
-; FMF-NEXT: addis 4, 2, .LCPI12_1@toc@ha
-; FMF-NEXT: lfs 2, .LCPI12_0@toc@l(3)
-; FMF-NEXT: lfs 3, .LCPI12_1@toc@l(4)
+; FMF-NEXT: addis 3, 2, .LCPI14_0@toc@ha
+; FMF-NEXT: addis 4, 2, .LCPI14_1@toc@ha
+; FMF-NEXT: lfs 2, .LCPI14_0@toc@l(3)
+; FMF-NEXT: lfs 3, .LCPI14_1@toc@l(4)
 ; FMF-NEXT: xsmulsp 1, 1, 0
 ; FMF-NEXT: xsmaddasp 2, 1, 0
 ; FMF-NEXT: xsmulsp 0, 1, 3
 ; FMF-NEXT: xsmulsp 0, 0, 2
-; FMF-NEXT: .LBB12_2:
+; FMF-NEXT: .LBB14_2:
 ; FMF-NEXT: fmr 1, 0
 ; FMF-NEXT: blr
 ;
 ; GLOBAL-LABEL: sqrt_fast_ieee:
 ; GLOBAL: # %bb.0:
-; GLOBAL-NEXT: addis 3, 2, .LCPI12_2@toc@ha
+; GLOBAL-NEXT: addis 3, 2, .LCPI14_2@toc@ha
 ; GLOBAL-NEXT: fabs 0, 1
-; GLOBAL-NEXT: lfs 2, .LCPI12_2@toc@l(3)
+; GLOBAL-NEXT: lfs 2, .LCPI14_2@toc@l(3)
 ; GLOBAL-NEXT: fcmpu 0, 0, 2
 ; GLOBAL-NEXT: xxlxor 0, 0, 0
-; GLOBAL-NEXT: blt 0, .LBB12_2
+; GLOBAL-NEXT: blt 0, .LBB14_2
 ; GLOBAL-NEXT: # %bb.1:
 ; GLOBAL-NEXT: xsrsqrtesp 0, 1
-; GLOBAL-NEXT: addis 3, 2, .LCPI12_0@toc@ha
-; GLOBAL-NEXT: addis 4, 2, .LCPI12_1@toc@ha
-; GLOBAL-NEXT: lfs 2, .LCPI12_0@toc@l(3)
-; GLOBAL-NEXT: lfs 3, .LCPI12_1@toc@l(4)
+; GLOBAL-NEXT: addis 3, 2, .LCPI14_0@toc@ha
+; GLOBAL-NEXT: addis 4, 2, .LCPI14_1@toc@ha
+; GLOBAL-NEXT: lfs 2, .LCPI14_0@toc@l(3)
+; GLOBAL-NEXT: lfs 3, .LCPI14_1@toc@l(4)
 ; GLOBAL-NEXT: xsmulsp 1, 1, 0
 ; GLOBAL-NEXT: xsmaddasp 2, 1, 0
 ; GLOBAL-NEXT: xsmulsp 0, 1, 3
 ; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB12_2:
+; GLOBAL-NEXT: .LBB14_2:
 ; GLOBAL-NEXT: fmr 1, 0
 ; GLOBAL-NEXT: blr
   %rt = call fast float @llvm.sqrt.f32(float %x)
@@ -450,18 +478,18 @@
 ; FMF: # %bb.0:
 ; FMF-NEXT: xxlxor 0, 0, 0
 ; FMF-NEXT: fcmpu 0, 1, 0
-; FMF-NEXT: beq 0, .LBB13_2
+; FMF-NEXT: beq 0, .LBB15_2
 ; FMF-NEXT: # %bb.1:
 ; FMF-NEXT: xsrsqrtesp 0, 1
-; FMF-NEXT: addis 3, 2, .LCPI13_0@toc@ha
-; FMF-NEXT: addis 4, 2, .LCPI13_1@toc@ha
-; FMF-NEXT: lfs 2, .LCPI13_0@toc@l(3)
-; FMF-NEXT: lfs 3, .LCPI13_1@toc@l(4)
+; FMF-NEXT: addis 3, 2, .LCPI15_0@toc@ha
+; FMF-NEXT: addis 4, 2, .LCPI15_1@toc@ha
+; FMF-NEXT: lfs 2, .LCPI15_0@toc@l(3)
+; FMF-NEXT: lfs 3, .LCPI15_1@toc@l(4)
 ; FMF-NEXT: xsmulsp 1, 1, 0
 ; FMF-NEXT: xsmaddasp 2, 1, 0
 ; FMF-NEXT: xsmulsp 0, 1, 3
 ; FMF-NEXT: xsmulsp 0, 0, 2
-; FMF-NEXT: .LBB13_2:
+; FMF-NEXT: .LBB15_2:
 ; FMF-NEXT: fmr 1, 0
 ; FMF-NEXT: blr
 ;
@@ -469,18 +497,18 @@
 ; GLOBAL: # %bb.0:
 ; GLOBAL-NEXT: xxlxor 0, 0, 0
 ; GLOBAL-NEXT: fcmpu 0, 1, 0
-; GLOBAL-NEXT: beq 0, .LBB13_2
+; GLOBAL-NEXT: beq 0, .LBB15_2
 ; GLOBAL-NEXT: # %bb.1:
 ; GLOBAL-NEXT: xsrsqrtesp 0, 1
-; GLOBAL-NEXT: addis 3, 2, .LCPI13_0@toc@ha
-; GLOBAL-NEXT: addis 4, 2, .LCPI13_1@toc@ha
-; GLOBAL-NEXT: lfs 2, .LCPI13_0@toc@l(3)
-; GLOBAL-NEXT: lfs 3, .LCPI13_1@toc@l(4)
+; GLOBAL-NEXT: addis 3, 2, .LCPI15_0@toc@ha
+; GLOBAL-NEXT: addis 4, 2, .LCPI15_1@toc@ha
+; GLOBAL-NEXT: lfs 2, .LCPI15_0@toc@l(3)
+; GLOBAL-NEXT: lfs 3, .LCPI15_1@toc@l(4)
 ; GLOBAL-NEXT: xsmulsp 1, 1, 0
 ; GLOBAL-NEXT: xsmaddasp 2, 1, 0
 ; GLOBAL-NEXT: xsmulsp 0, 1, 3
 ; GLOBAL-NEXT: xsmulsp 0, 0, 2
-; GLOBAL-NEXT: .LBB13_2:
+; GLOBAL-NEXT: .LBB15_2:
 ; GLOBAL-NEXT: fmr 1, 0
 ; GLOBAL-NEXT: blr
   %rt = call fast float @llvm.sqrt.f32(float %x)
@@ -502,10 +530,10 @@
 ; FMF: # %bb.0:
 ; FMF-NEXT: xxlxor 0, 0, 0
 ; FMF-NEXT: xscmpudp 0, 1, 0
-; FMF-NEXT: blt 0, .LBB14_2
+; FMF-NEXT: blt 0, .LBB16_2
 ; FMF-NEXT: # %bb.1:
 ; FMF-NEXT: fmr 3, 2
-; FMF-NEXT: .LBB14_2:
+; FMF-NEXT: .LBB16_2:
 ; FMF-NEXT: fmr 1, 3
 ; FMF-NEXT: blr
 ;
@@ -513,10 +541,10 @@
 ; GLOBAL: # %bb.0:
 ; GLOBAL-NEXT: xxlxor 0, 0, 0
 ; GLOBAL-NEXT: xscmpudp 0, 1, 0
-; GLOBAL-NEXT: blt 0, .LBB14_2
+; GLOBAL-NEXT: blt 0, .LBB16_2
 ; GLOBAL-NEXT: # %bb.1:
 ; GLOBAL-NEXT: fmr 3, 2
-; GLOBAL-NEXT: .LBB14_2:
+; GLOBAL-NEXT: .LBB16_2:
 ; GLOBAL-NEXT: fmr 1, 3
 ; GLOBAL-NEXT: blr
   %cmp = fcmp nnan ult double %a, 0.0
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
--- a/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath-mir.ll
@@ -9,17 +9,30 @@
   ; CHECK: liveins: $xmm0
   ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
   ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
+  ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr
+  ; CHECK: $xmm0 = COPY %1
+  ; CHECK: RET 0, $xmm0
+  %call = tail call float @llvm.sqrt.f32(float %f)
+  ret float %call
+}
+
+define float @sqrt_ieee_ninf(float %f) #0 {
+  ; CHECK-LABEL: name: sqrt_ieee_ninf
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK: liveins: $xmm0
+  ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
+  ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
   ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]]
-  ; CHECK: %3:fr32 = nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+  ; CHECK: %3:fr32 = ninf nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
   ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load 4 from constant-pool)
-  ; CHECK: %5:fr32 = nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK: %5:fr32 = ninf nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr
   ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load 4 from constant-pool)
-  ; CHECK: %7:fr32 = nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK: %8:fr32 = nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr
-  ; CHECK: %9:fr32 = nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr
-  ; CHECK: %10:fr32 = nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr
-  ; CHECK: %11:fr32 = nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK: %12:fr32 = nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr
+  ; CHECK: %7:fr32 = ninf nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK: %8:fr32 = ninf nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr
+  ; CHECK: %9:fr32 = ninf nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr
+  ; CHECK: %10:fr32 = ninf nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK: %11:fr32 = ninf nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK: %12:fr32 = ninf nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr
   ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12
   ; CHECK: [[COPY2:%[0-9]+]]:vr128 = COPY [[COPY]]
   ; CHECK: [[VPBROADCASTDrm:%[0-9]+]]:vr128 = VPBROADCASTDrm $rip, 1, $noreg, %const.2, $noreg :: (load 4 from constant-pool)
@@ -31,7 +44,7 @@
   ; CHECK: [[COPY5:%[0-9]+]]:fr32 = COPY [[VPANDNrr]]
   ; CHECK: $xmm0 = COPY [[COPY5]]
   ; CHECK: RET 0, $xmm0
-  %call = tail call float @llvm.sqrt.f32(float %f)
+  %call = tail call ninf float @llvm.sqrt.f32(float %f)
   ret float %call
 }
 
@@ -41,17 +54,30 @@
   ; CHECK: liveins: $xmm0
   ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
   ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
+  ; CHECK: %1:fr32 = nofpexcept VSQRTSSr killed [[DEF]], [[COPY]], implicit $mxcsr
+  ; CHECK: $xmm0 = COPY %1
+  ; CHECK: RET 0, $xmm0
+  %call = tail call float @llvm.sqrt.f32(float %f)
+  ret float %call
+}
+
+define float @sqrt_daz_ninf(float %f) #1 {
+  ; CHECK-LABEL: name: sqrt_daz_ninf
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK: liveins: $xmm0
+  ; CHECK: [[COPY:%[0-9]+]]:fr32 = COPY $xmm0
+  ; CHECK: [[DEF:%[0-9]+]]:fr32 = IMPLICIT_DEF
   ; CHECK: [[VRSQRTSSr:%[0-9]+]]:fr32 = VRSQRTSSr killed [[DEF]], [[COPY]]
-  ; CHECK: %3:fr32 = nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
+  ; CHECK: %3:fr32 = ninf nofpexcept VMULSSrr [[COPY]], [[VRSQRTSSr]], implicit $mxcsr
   ; CHECK: [[VMOVSSrm_alt:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.0, $noreg :: (load 4 from constant-pool)
-  ; CHECK: %5:fr32 = nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK: %5:fr32 = ninf nofpexcept VFMADD213SSr [[VRSQRTSSr]], killed %3, [[VMOVSSrm_alt]], implicit $mxcsr
   ; CHECK: [[VMOVSSrm_alt1:%[0-9]+]]:fr32 = VMOVSSrm_alt $rip, 1, $noreg, %const.1, $noreg :: (load 4 from constant-pool)
-  ; CHECK: %7:fr32 = nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK: %8:fr32 = nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr
-  ; CHECK: %9:fr32 = nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr
-  ; CHECK: %10:fr32 = nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr
-  ; CHECK: %11:fr32 = nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr
-  ; CHECK: %12:fr32 = nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr
+  ; CHECK: %7:fr32 = ninf nofpexcept VMULSSrr [[VRSQRTSSr]], [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK: %8:fr32 = ninf nofpexcept VMULSSrr killed %7, killed %5, implicit $mxcsr
+  ; CHECK: %9:fr32 = ninf nofpexcept VMULSSrr [[COPY]], %8, implicit $mxcsr
+  ; CHECK: %10:fr32 = ninf nofpexcept VFMADD213SSr %8, %9, [[VMOVSSrm_alt]], implicit $mxcsr
+  ; CHECK: %11:fr32 = ninf nofpexcept VMULSSrr %9, [[VMOVSSrm_alt1]], implicit $mxcsr
+  ; CHECK: %12:fr32 = ninf nofpexcept VMULSSrr killed %11, killed %10, implicit $mxcsr
   ; CHECK: [[COPY1:%[0-9]+]]:vr128 = COPY %12
   ; CHECK: [[FsFLD0SS:%[0-9]+]]:fr32 = FsFLD0SS
   ; CHECK: %15:fr32 = nofpexcept VCMPSSrr [[COPY]], killed [[FsFLD0SS]], 0, implicit $mxcsr
@@ -60,7 +86,7 @@
   ; CHECK: [[COPY3:%[0-9]+]]:fr32 = COPY [[VPANDNrr]]
   ; CHECK: $xmm0 = COPY [[COPY3]]
   ; CHECK: RET 0, $xmm0
-  %call = tail call float @llvm.sqrt.f32(float %f)
+  %call = tail call ninf float @llvm.sqrt.f32(float %f)
   ret float %call
 }
diff --git a/llvm/test/CodeGen/X86/sqrt-fastmath.ll b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
--- a/llvm/test/CodeGen/X86/sqrt-fastmath.ll
+++ b/llvm/test/CodeGen/X86/sqrt-fastmath.ll
@@ -59,6 +59,20 @@
 define float @finite_f32_estimate_ieee(float %f) #1 {
 ; SSE-LABEL: finite_f32_estimate_ieee:
 ; SSE: # %bb.0:
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: finite_f32_estimate_ieee:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %call = tail call float @__sqrtf_finite(float %f) #2
+  ret float %call
+}
+
+define float @finite_f32_estimate_ieee_ninf(float %f) #1 {
+; SSE-LABEL: finite_f32_estimate_ieee_ninf:
+; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
@@ -72,7 +86,7 @@
 ; SSE-NEXT: andnps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: finite_f32_estimate_ieee:
+; AVX1-LABEL: finite_f32_estimate_ieee_ninf:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -85,7 +99,7 @@
 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX512-LABEL: finite_f32_estimate_ieee:
+; AVX512-LABEL: finite_f32_estimate_ieee_ninf:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -99,13 +113,27 @@
 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT: vmovaps %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %call = tail call float @__sqrtf_finite(float %f) #2
+  %call = tail call ninf float @__sqrtf_finite(float %f) #2
   ret float %call
 }
 
 define float @finite_f32_estimate_daz(float %f) #4 {
 ; SSE-LABEL: finite_f32_estimate_daz:
 ; SSE: # %bb.0:
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: finite_f32_estimate_daz:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %call = tail call float @__sqrtf_finite(float %f) #2
+  ret float %call
+}
+
+define float @finite_f32_estimate_daz_ninf(float %f) #4 {
+; SSE-LABEL: finite_f32_estimate_daz_ninf:
+; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
@@ -119,7 +147,7 @@
 ; SSE-NEXT: andnps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: finite_f32_estimate_daz:
+; AVX1-LABEL: finite_f32_estimate_daz_ninf:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -132,7 +160,7 @@
 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX512-LABEL: finite_f32_estimate_daz:
+; AVX512-LABEL: finite_f32_estimate_daz_ninf:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -144,7 +172,7 @@
 ; AVX512-NEXT: vmovss %xmm2, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT: vmovaps %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %call = tail call float @__sqrtf_finite(float %f) #2
+  %call = tail call ninf float @__sqrtf_finite(float %f) #2
   ret float %call
 }
 
@@ -175,6 +203,20 @@
 define float @sqrtf_check_denorms(float %x) #3 {
 ; SSE-LABEL: sqrtf_check_denorms:
 ; SSE: # %bb.0:
+; SSE-NEXT: sqrtss %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrtf_check_denorms:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtss %xmm0, %xmm0, %xmm0
+; AVX-NEXT: retq
+  %call = tail call float @__sqrtf_finite(float %x) #2
+  ret float %call
+}
+
+define float @sqrtf_check_denorms_ninf(float %x) #3 {
+; SSE-LABEL: sqrtf_check_denorms_ninf:
+; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtss %xmm0, %xmm1
 ; SSE-NEXT: movaps %xmm0, %xmm2
 ; SSE-NEXT: mulss %xmm1, %xmm2
@@ -188,7 +230,7 @@
 ; SSE-NEXT: andnps %xmm2, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: sqrtf_check_denorms:
+; AVX1-LABEL: sqrtf_check_denorms_ninf:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX1-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -201,7 +243,7 @@
 ; AVX1-NEXT: vandnps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX512-LABEL: sqrtf_check_denorms:
+; AVX512-LABEL: sqrtf_check_denorms_ninf:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrsqrtss %xmm0, %xmm0, %xmm1
 ; AVX512-NEXT: vmulss %xmm1, %xmm0, %xmm2
@@ -215,13 +257,27 @@
 ; AVX512-NEXT: vmovss %xmm0, %xmm1, %xmm1 {%k1}
 ; AVX512-NEXT: vmovaps %xmm1, %xmm0
 ; AVX512-NEXT: retq
-  %call = tail call float @__sqrtf_finite(float %x) #2
+  %call = tail call ninf float @__sqrtf_finite(float %x) #2
   ret float %call
 }
 
 define <4 x float> @sqrt_v4f32_check_denorms(<4 x float> %x) #3 {
 ; SSE-LABEL: sqrt_v4f32_check_denorms:
 ; SSE: # %bb.0:
+; SSE-NEXT: sqrtps %xmm0, %xmm0
+; SSE-NEXT: retq
+;
+; AVX-LABEL: sqrt_v4f32_check_denorms:
+; AVX: # %bb.0:
+; AVX-NEXT: vsqrtps %xmm0, %xmm0
+; AVX-NEXT: retq
+  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
+  ret <4 x float> %call
+}
+
+define <4 x float> @sqrt_v4f32_check_denorms_ninf(<4 x float> %x) #3 {
+; SSE-LABEL: sqrt_v4f32_check_denorms_ninf:
+; SSE: # %bb.0:
 ; SSE-NEXT: rsqrtps %xmm0, %xmm2
 ; SSE-NEXT: movaps %xmm0, %xmm1
 ; SSE-NEXT: mulps %xmm2, %xmm1
@@ -237,7 +293,7 @@
 ; SSE-NEXT: movaps %xmm1, %xmm0
 ; SSE-NEXT: retq
 ;
-; AVX1-LABEL: sqrt_v4f32_check_denorms:
+; AVX1-LABEL: sqrt_v4f32_check_denorms_ninf:
 ; AVX1: # %bb.0:
 ; AVX1-NEXT: vrsqrtps %xmm0, %xmm1
 ; AVX1-NEXT: vmulps %xmm1, %xmm0, %xmm2
@@ -251,7 +307,7 @@
 ; AVX1-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX1-NEXT: retq
 ;
-; AVX512-LABEL: sqrt_v4f32_check_denorms:
+; AVX512-LABEL: sqrt_v4f32_check_denorms_ninf:
 ; AVX512: # %bb.0:
 ; AVX512-NEXT: vrsqrtps %xmm0, %xmm1
 ; AVX512-NEXT: vmulps %xmm1, %xmm0, %xmm2
@@ -266,7 +322,7 @@
 ; AVX512-NEXT: vcmpleps %xmm0, %xmm2, %xmm0
 ; AVX512-NEXT: vandps %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT: retq
-  %call = tail call <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
+  %call = tail call ninf <4 x float> @llvm.sqrt.v4f32(<4 x float> %x) #2
   ret <4 x float> %call
 }
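A note on the DAGCombiner guard at the top of this patch: the combine expands sqrt(x) into x * rsqrt(x) refined by Newton-Raphson steps, and that expansion returns NaN rather than +Inf when x is +Inf, because rsqrt(+Inf) is 0 and Inf * 0 is NaN. That is why the estimate path now also requires the 'ninf' flag on the call (or the NoInfsFPMath target option), as exercised by the _ninf test variants above. The following is a minimal standalone sketch of the failure mode; it is not LLVM code, rsqrtEstimate is a hypothetical stand-in for a hardware estimate instruction (x86 rsqrtss, NVPTX rsqrt.approx.f32, PPC xsrsqrtesp), and the exact refinement sequence the backends emit differs in detail.

  #include <cmath>
  #include <cstdio>
  #include <limits>

  // Hypothetical stand-in for a hardware reciprocal-square-root estimate;
  // exact here, which is enough to show the infinity problem.
  static float rsqrtEstimate(float x) { return 1.0f / std::sqrt(x); }

  // sqrt(x) approximated as x * rsqrt(x) with one Newton-Raphson refinement:
  //   e = rsqrt(x);  sqrt(x) ~= (x*e) * (1.5 - 0.5*(x*e)*e)
  static float sqrtViaRsqrt(float x) {
    float e = rsqrtEstimate(x);
    float xe = x * e;                    // for x == +Inf: +Inf * 0 -> NaN
    return xe * (1.5f - 0.5f * xe * e);
  }

  int main() {
    const float inf = std::numeric_limits<float>::infinity();
    std::printf("exact sqrt(+Inf)     = %f\n", std::sqrt(inf));     // inf
    std::printf("x*rsqrt(x) at x=+Inf = %f\n", sqrtViaRsqrt(inf));  // nan
    std::printf("x*rsqrt(x) at x=2.0  = %f\n", sqrtViaRsqrt(2.0f)); // ~1.414
  }

With 'ninf' on the call (or no-infs FP math globally), infinities are assumed not to occur, so losing +Inf is acceptable and the approximate sequence may still be used. That is exactly the split the updated tests check: plain fast/afn calls now lower to the correctly rounded sqrt instruction, while the ninf variants keep the rsqrt-based expansion.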