Index: lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp =================================================================== --- lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -76,7 +76,7 @@ return false; // Reciprocal f32 is handled separately without denormals. - return UnsafeDiv && CNum->isExactlyValue(+1.0); + return UnsafeDiv || CNum->isExactlyValue(+1.0); } // Insert an intrinsic for fast fdiv for safe math situations where we can Index: test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll =================================================================== --- test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll +++ test/CodeGen/AMDGPU/amdgpu-codegenprepare.ll @@ -45,6 +45,7 @@ ; CHECK-LABEL: @rcp_fdiv_fpmath( ; CHECK: %no.md = fdiv float 1.000000e+00, %x{{$}} +; CHECK: %md.25ulp = fdiv float 1.000000e+00, %x, !fpmath !0 ; CHECK: %md.half.ulp = fdiv float 1.000000e+00, %x, !fpmath !1 ; CHECK: %arcp.no.md = fdiv arcp float 1.000000e+00, %x{{$}} ; CHECK: %arcp.25ulp = fdiv arcp float 1.000000e+00, %x, !fpmath !0 @@ -54,6 +55,9 @@ %no.md = fdiv float 1.0, %x store volatile float %no.md, float addrspace(1)* %out + %md.25ulp = fdiv float 1.0, %x, !fpmath !0 + store volatile float %md.25ulp, float addrspace(1)* %out + %md.half.ulp = fdiv float 1.0, %x, !fpmath !1 store volatile float %md.half.ulp, float addrspace(1)* %out @@ -146,13 +150,13 @@ ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: fdiv arcp float 1.000000e+00, %[[X0]], !fpmath !0 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: call arcp float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0 +; CHECK: fdiv arcp float 2.000000e+00, %[[X1]], !fpmath !0 ; CHECK: store volatile <2 x float> %arcp.25ulp ; CHECK: %[[X0:[0-9]+]] = extractelement <2 x float> %x, i64 0 ; CHECK: fdiv fast float 1.000000e+00, %[[X0]], !fpmath !0 ; CHECK: %[[X1:[0-9]+]] = extractelement <2 x float> %x, i64 1 -; CHECK: call fast float @llvm.amdgcn.fdiv.fast(float 2.000000e+00, float %[[X1]]), !fpmath !0 +; CHECK: fdiv fast float 2.000000e+00, %[[X1]], !fpmath !0 ; CHECK: store volatile <2 x float> %fast.25ulp define void @rcp_fdiv_fpmath_vector_nonsplat(<2 x float> addrspace(1)* %out, <2 x float> %x) #1 { %no.md = fdiv <2 x float> , %x Index: test/CodeGen/AMDGPU/rcp-pattern.ll =================================================================== --- test/CodeGen/AMDGPU/rcp-pattern.ll +++ test/CodeGen/AMDGPU/rcp-pattern.ll @@ -1,11 +1,96 @@ -; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG-SAFE -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s ; RUN: llc -march=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefix=EG -check-prefix=FUNC %s -; FIXME: Evergreen only ever does unsafe fp math. ; FUNC-LABEL: {{^}}rcp_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + ; EG: RECIP_IEEE -define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) #0 { %rcp = fdiv float 1.0, %src store float %rcp, float addrspace(1)* %out, align 4 ret void } + +; FUNC-LABEL: {{^}}rcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv fast float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_arcp_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_arcp_ulp25_pat_f32(float addrspace(1)* %out, float %src) #0 { + %rcp = fdiv arcp float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_global_fast_ulp25_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e32 [[RCP:v[0-9]+]], [[SRC]] +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_global_fast_ulp25_pat_f32(float addrspace(1)* %out, float %src) #2 { + %rcp = fdiv float 1.0, %src, !fpmath !0 + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: {{^}}rcp_fabs_pat_f32: +; GCN: s_load_dword [[SRC:s[0-9]+]] +; GCN: v_rcp_f32_e64 [[RCP:v[0-9]+]], |[[SRC]]| +; GCN: buffer_store_dword [[RCP]] + +; EG: RECIP_IEEE +define void @rcp_fabs_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %rcp = fdiv float 1.0, %src.fabs + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FIXME: fneg folded into constant 1 +; FUNC-LABEL: {{^}}rcp_fabs_fneg_pat_f32: +define void @rcp_fabs_fneg_pat_f32(float addrspace(1)* %out, float %src) #0 { + %src.fabs = call float @llvm.fabs.f32(float %src) + %src.fabs.fneg = fsub float -0.0, %src.fabs + %rcp = fdiv float 1.0, %src.fabs.fneg + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + + +declare float @llvm.fabs.f32(float) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } +attributes #2 = { nounwind "unsafe-fp-math"="true" } + +!0 = !{float 2.500000e+00} Index: test/CodeGen/AMDGPU/reciprocal.ll =================================================================== --- test/CodeGen/AMDGPU/reciprocal.ll +++ /dev/null @@ -1,13 +0,0 @@ -;RUN: llc < %s -march=r600 -mcpu=redwood | FileCheck %s - -;CHECK: RECIP_IEEE * T{{[0-9]+\.[XYZW], T[0-9]+\.[XYZW]}} - -define amdgpu_ps void @test(<4 x float> inreg %reg0) { - %r0 = extractelement <4 x float> %reg0, i32 0 - %r1 = fdiv float 1.0, %r0 - %vec = insertelement <4 x float> undef, float %r1, i32 0 - call void @llvm.r600.store.swizzle(<4 x float> %vec, i32 0, i32 0) - ret void -} - -declare void @llvm.r600.store.swizzle(<4 x float>, i32, i32)