diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td --- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td +++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td @@ -50,6 +50,8 @@ def : RsqPat; +def : SqrtPat; + def : POW_Common ; defm DIV_cm : DIV_Common; @@ -70,8 +72,6 @@ -def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>; - class RAT_STORE_DWORD mask> : CF_MEM_RAT_CACHELESS <0x14, 0, mask, (ins rc:$rw_gpr, R600_TReg32_X:$index_gpr), diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td --- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td +++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td @@ -118,11 +118,12 @@ def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>; def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>; def : RsqPat; +def : SqrtPat; + def SIN_eg : SIN_Common<0x8D>; def COS_eg : COS_Common<0x8E>; def : POW_Common ; -def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>; } // End SubtargetPredicate = isEG //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td --- a/llvm/lib/Target/AMDGPU/R600Instructions.td +++ b/llvm/lib/Target/AMDGPU/R600Instructions.td @@ -1233,6 +1233,11 @@ def : RcpPat; } +class SqrtPat : R600Pat < + (fsqrt f32:$src), + (RecipInst (RsqInst $src)) +>; + //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1272,8 +1277,8 @@ defm DIV_r600 : DIV_Common; def : POW_Common ; - def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; def : RsqPat; + def : SqrtPat; def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT diff --git a/llvm/test/CodeGen/AMDGPU/fsqrt.ll b/llvm/test/CodeGen/AMDGPU/fsqrt.ll --- a/llvm/test/CodeGen/AMDGPU/fsqrt.ll +++ b/llvm/test/CodeGen/AMDGPU/fsqrt.ll @@ -27,8 +27,8 @@ ; FUNC-LABEL: {{^}}s_sqrt_f32: ; GCN: v_sqrt_f32_e32 -; R600: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].Z -; R600: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].Z, PS +; R600: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].Z +; R600: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS define amdgpu_kernel void @s_sqrt_f32(float addrspace(1)* %out, float %in) #1 { entry: %fdiv = call float @llvm.sqrt.f32(float %in) @@ -40,10 +40,10 @@ ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[2].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[2].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].X, PS +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[2].W +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].X +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}} define amdgpu_kernel void @s_sqrt_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #1 { entry: %fdiv = call <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) @@ -57,14 +57,14 @@ ; GCN: v_sqrt_f32_e32 ; GCN: v_sqrt_f32_e32 -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Y -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Y, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].Z -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].Z, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[3].W -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[3].W, PS -; R600-DAG: RECIPSQRT_CLAMPED * T{{[0-9]\.[XYZW]}}, KC0[4].X -; R600-DAG: MUL NON-IEEE T{{[0-9]\.[XYZW]}}, KC0[4].X, PS +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Y +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, PS +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].Z +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}} +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[3].W +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}} +; R600-DAG: RECIPSQRT_IEEE * T{{[0-9]\.[XYZW]}}, KC0[4].X +; R600-DAG: RECIP_IEEE * T{{[0-9]\.[XYZW]}}, T{{[0-9]\.[XYZW]}} define amdgpu_kernel void @s_sqrt_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #1 { entry: %fdiv = call <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) @@ -134,6 +134,16 @@ ret void } +; FUNC-LABEL: {{^}}recip_sqrt: +; R600: RECIPSQRT_IEEE +; R600-NOT: RECIP_IEEE +define amdgpu_kernel void @recip_sqrt(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) + %recipsqrt = fdiv fast float 1.0, %sqrt + store float %recipsqrt, float addrspace(1)* %out, align 4 + ret void +} + declare float @llvm.sqrt.f32(float %in) #0 declare <2 x float> @llvm.sqrt.v2f32(<2 x float> %in) #0 declare <4 x float> @llvm.sqrt.v4f32(<4 x float> %in) #0