Index: lib/Target/R600/AMDGPUISelLowering.h =================================================================== --- lib/Target/R600/AMDGPUISelLowering.h +++ lib/Target/R600/AMDGPUISelLowering.h @@ -196,6 +196,7 @@ DIV_FMAS, DIV_FIXUP, TRIG_PREOP, + RCP, DOT4, BFE_U32, // Extract range of bits with zero extension to 32-bits. BFE_I32, // Extract range of bits with sign extension to 32-bits. Index: lib/Target/R600/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/R600/AMDGPUISelLowering.cpp +++ lib/Target/R600/AMDGPUISelLowering.cpp @@ -817,6 +817,8 @@ return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT, Op.getOperand(1), Op.getOperand(2)); + case AMDGPUIntrinsic::AMDGPU_rcp: + return DAG.getNode(AMDGPUISD::RCP, DL, VT, Op.getOperand(1)); } } @@ -1715,6 +1717,7 @@ NODE_NAME_CASE(DIV_FMAS) NODE_NAME_CASE(DIV_FIXUP) NODE_NAME_CASE(TRIG_PREOP) + NODE_NAME_CASE(RCP) NODE_NAME_CASE(DOT4) NODE_NAME_CASE(BFE_U32) NODE_NAME_CASE(BFE_I32) Index: lib/Target/R600/AMDGPUInstrInfo.td =================================================================== --- lib/Target/R600/AMDGPUInstrInfo.td +++ lib/Target/R600/AMDGPUInstrInfo.td @@ -37,6 +37,9 @@ // out = a - floor(a) def AMDGPUfract : SDNode<"AMDGPUISD::FRACT", SDTFPUnaryOp>; +// out = 1.0 / a +def AMDGPUrcp : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>; + // out = max(a, b) a and b are floats def AMDGPUfmax : SDNode<"AMDGPUISD::FMAX", SDTFPBinOp, [SDNPCommutative, SDNPAssociative] Index: lib/Target/R600/AMDGPUInstructions.td =================================================================== --- lib/Target/R600/AMDGPUInstructions.td +++ lib/Target/R600/AMDGPUInstructions.td @@ -472,6 +472,16 @@ >; } +class RcpPat : Pat < + (fdiv FP_ONE, vt:$src), + (RcpInst $src) +>; + +class RsqPat : Pat < + (AMDGPUrcp (fsqrt vt:$src)), + (RsqInst $src) +>; + include "R600Instructions.td" include "R700Instructions.td" include "EvergreenInstructions.td" Index: lib/Target/R600/AMDGPUIntrinsics.td =================================================================== --- lib/Target/R600/AMDGPUIntrinsics.td +++ lib/Target/R600/AMDGPUIntrinsics.td @@ -28,7 +28,7 @@ def int_AMDGPU_lrp : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_mul : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_pow : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; - def int_AMDGPU_rcp : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; + def int_AMDGPU_rcp : Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty], [IntrNoMem]>; def int_AMDGPU_rsq : Intrinsic<[llvm_float_ty], [llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_seq : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; def int_AMDGPU_sgt : Intrinsic<[llvm_float_ty], [llvm_float_ty, llvm_float_ty], [IntrNoMem]>; Index: lib/Target/R600/AMDILIntrinsics.td =================================================================== --- lib/Target/R600/AMDILIntrinsics.td +++ lib/Target/R600/AMDILIntrinsics.td @@ -183,8 +183,6 @@ Intrinsic<[llvm_v2i64_ty], [llvm_double_ty], []>; def int_AMDIL_ldexp : GCCBuiltin<"__amdil_ldexp">, Intrinsic<[llvm_anyfloat_ty], [llvm_anyfloat_ty, llvm_anyint_ty], []>; - def int_AMDIL_drcp : GCCBuiltin<"__amdil_rcp">, - Intrinsic<[llvm_double_ty], [llvm_double_ty], []>; def int_AMDIL_convert_f16_f32 : GCCBuiltin<"__amdil_half_to_float">, ConvertIntITOF; def int_AMDIL_convert_f32_f16 : GCCBuiltin<"__amdil_float_to_half">, Index: lib/Target/R600/SIInstructions.td =================================================================== --- lib/Target/R600/SIInstructions.td +++ lib/Target/R600/SIInstructions.td @@ -1008,10 +1008,11 @@ defm V_LOG_F32 : VOP1_32 <0x00000027, "V_LOG_F32", [(set f32:$dst, (flog2 f32:$src0))] >; + defm V_RCP_CLAMP_F32 : VOP1_32 <0x00000028, "V_RCP_CLAMP_F32", []>; defm V_RCP_LEGACY_F32 : VOP1_32 <0x00000029, "V_RCP_LEGACY_F32", []>; defm V_RCP_F32 : VOP1_32 <0x0000002a, "V_RCP_F32", - [(set f32:$dst, (fdiv FP_ONE, f32:$src0))] + [(set f32:$dst, (AMDGPUrcp f32:$src0))] >; defm V_RCP_IFLAG_F32 : VOP1_32 <0x0000002b, "V_RCP_IFLAG_F32", []>; defm V_RSQ_CLAMP_F32 : VOP1_32 <0x0000002c, "V_RSQ_CLAMP_F32", []>; @@ -1023,7 +1024,7 @@ [(set f32:$dst, (fdiv FP_ONE, (fsqrt f32:$src0)))] >; defm V_RCP_F64 : VOP1_64 <0x0000002f, "V_RCP_F64", - [(set f64:$dst, (fdiv FP_ONE, f64:$src0))] + [(set f64:$dst, (AMDGPUrcp f64:$src0))] >; defm V_RCP_CLAMP_F64 : VOP1_64 <0x00000030, "V_RCP_CLAMP_F64", []>; defm V_RSQ_F64 : VOP1_64 <0x00000031, "V_RSQ_F64", @@ -2484,6 +2485,11 @@ (S_ADD_I32 $src0, $src1) >; +def : RcpPat; +def : RcpPat; +def : RsqPat; +def : RsqPat; + //============================================================================// // Miscellaneous Optimization Patterns //============================================================================// Index: test/CodeGen/R600/llvm.AMDGPU.rcp.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/llvm.AMDGPU.rcp.ll @@ -0,0 +1,58 @@ +; RUN: llc -march=r600 -mcpu=SI -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +declare float @llvm.AMDGPU.rcp.f32(float) nounwind readnone +declare double @llvm.AMDGPU.rcp.f64(double) nounwind readnone + + +declare float @llvm.sqrt.f32(float) nounwind readnone +declare double @llvm.sqrt.f64(double) nounwind readnone + +; FUNC-LABEL: @rcp_f32 +; SI: V_RCP_F32_e32 +define void @rcp_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = call float @llvm.AMDGPU.rcp.f32(float %src) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_f64 +; SI: V_RCP_F64_e32 +define void @rcp_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = call double @llvm.AMDGPU.rcp.f64(double %src) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rcp_pat_f32 +; SI: V_RCP_F32_e32 +define void @rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %rcp = fdiv float 1.0, %src + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rcp_pat_f64 +; SI: V_RCP_F64_e32 +define void @rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %rcp = fdiv double 1.0, %src + store double %rcp, double addrspace(1)* %out, align 8 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f32 +; SI: V_RSQ_F32_e32 +define void @rsq_rcp_pat_f32(float addrspace(1)* %out, float %src) nounwind { + %sqrt = call float @llvm.sqrt.f32(float %src) nounwind readnone + %rcp = call float @llvm.AMDGPU.rcp.f32(float %sqrt) nounwind readnone + store float %rcp, float addrspace(1)* %out, align 4 + ret void +} + +; FUNC-LABEL: @rsq_rcp_pat_f64 +; SI: V_RSQ_F64_e32 +define void @rsq_rcp_pat_f64(double addrspace(1)* %out, double %src) nounwind { + %sqrt = call double @llvm.sqrt.f64(double %src) nounwind readnone + %rcp = call double @llvm.AMDGPU.rcp.f64(double %sqrt) nounwind readnone + store double %rcp, double addrspace(1)* %out, align 8 + ret void +}