Index: lib/Target/R600/AMDGPUISelLowering.h =================================================================== --- lib/Target/R600/AMDGPUISelLowering.h +++ lib/Target/R600/AMDGPUISelLowering.h @@ -49,6 +49,10 @@ SDValue LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const; + + SDValue LowerFROUND32(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const; + SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const; SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINT_TO_FP64(SDValue Op, SelectionDAG &DAG, bool Signed) const; Index: lib/Target/R600/AMDGPUISelLowering.cpp =================================================================== --- lib/Target/R600/AMDGPUISelLowering.cpp +++ lib/Target/R600/AMDGPUISelLowering.cpp @@ -127,9 +127,11 @@ setOperationAction(ISD::FABS, MVT::f32, Legal); setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FRINT, MVT::f32, Legal); - setOperationAction(ISD::FROUND, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); + setOperationAction(ISD::FROUND, MVT::f32, Custom); + setOperationAction(ISD::FROUND, MVT::f64, Custom); + setOperationAction(ISD::FREM, MVT::f32, Custom); setOperationAction(ISD::FREM, MVT::f64, Custom); @@ -610,6 +612,7 @@ case ISD::FTRUNC: return LowerFTRUNC(Op, DAG); case ISD::FRINT: return LowerFRINT(Op, DAG); case ISD::FNEARBYINT: return LowerFNEARBYINT(Op, DAG); + case ISD::FROUND: return LowerFROUND(Op, DAG); case ISD::FFLOOR: return LowerFFLOOR(Op, DAG); case ISD::SINT_TO_FP: return LowerSINT_TO_FP(Op, DAG); case ISD::UINT_TO_FP: return LowerUINT_TO_FP(Op, DAG); @@ -1917,6 +1920,20 @@ return DAG.getNode(ISD::FADD, SL, MVT::f64, Trunc, Add); } +static SDValue extractF64Exponent(SDValue Hi, SDLoc SL, SelectionDAG &DAG) { + const unsigned FractBits = 52; + const unsigned ExpBits = 11; + + SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, + Hi, + DAG.getConstant(FractBits - 32, MVT::i32), + DAG.getConstant(ExpBits, MVT::i32)); + SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, + DAG.getConstant(1023, MVT::i32)); + + return Exp; +} + SDValue AMDGPUTargetLowering::LowerFTRUNC(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); @@ -1932,16 +1949,9 @@ // exponent. SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, VecSrc, One); - const unsigned FractBits = 52; - const unsigned ExpBits = 11; + SDValue Exp = extractF64Exponent(Hi, SL, DAG); - // Extract the exponent. - SDValue ExpPart = DAG.getNode(AMDGPUISD::BFE_U32, SL, MVT::i32, - Hi, - DAG.getConstant(FractBits - 32, MVT::i32), - DAG.getConstant(ExpBits, MVT::i32)); - SDValue Exp = DAG.getNode(ISD::SUB, SL, MVT::i32, ExpPart, - DAG.getConstant(1023, MVT::i32)); + const unsigned FractBits = 52; // Extract the sign bit. const SDValue SignBitMask = DAG.getConstant(UINT32_C(1) << 31, MVT::i32); @@ -2004,6 +2014,99 @@ return DAG.getNode(ISD::FRINT, SDLoc(Op), Op.getValueType(), Op.getOperand(0)); } +// XXX - May require not supporting f32 denormals? +SDValue AMDGPUTargetLowering::LowerFROUND32(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue T = DAG.getNode(ISD::FTRUNC, SL, MVT::f32, X); + + SDValue Diff = DAG.getNode(ISD::FSUB, SL, MVT::f32, X, T); + + SDValue AbsDiff = DAG.getNode(ISD::FABS, SL, MVT::f32, Diff); + + const SDValue Zero = DAG.getConstantFP(0.0, MVT::f32); + const SDValue One = DAG.getConstantFP(1.0, MVT::f32); + const SDValue Half = DAG.getConstantFP(0.5, MVT::f32); + + SDValue SignOne = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f32, One, X); + + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::f32); + + SDValue Cmp = DAG.getSetCC(SL, SetCCVT, AbsDiff, Half, ISD::SETOGE); + + SDValue Sel = DAG.getNode(ISD::SELECT, SL, MVT::f32, Cmp, SignOne, Zero); + + return DAG.getNode(ISD::FADD, SL, MVT::f32, T, Sel); +} + +SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const { + SDLoc SL(Op); + SDValue X = Op.getOperand(0); + + SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X); + + const SDValue Zero = DAG.getConstant(0, MVT::i32); + const SDValue One = DAG.getConstant(1, MVT::i32); + const SDValue NegOne = DAG.getConstant(-1, MVT::i32); + const SDValue FiftyOne = DAG.getConstant(51, MVT::i32); + EVT SetCCVT = getSetCCResultType(*DAG.getContext(), MVT::i32); + + + SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X); + + SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One); + + SDValue Exp = extractF64Exponent(Hi, SL, DAG); + + const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), MVT::i64); + + SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp); + SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64, + DAG.getConstant(INT64_C(0x0008000000000000), MVT::i64), + Exp); + + SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M); + SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT, + DAG.getConstant(0, MVT::i64), Tmp0, + ISD::SETNE); + + SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1, + D, DAG.getConstant(0, MVT::i64)); + SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2); + + K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64)); + K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K); + + SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT); + SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT); + SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ); + + SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64, + ExpEqNegOne, + DAG.getConstantFP(1.0, MVT::f64), + DAG.getConstantFP(0.0, MVT::f64)); + + SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X); + + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K); + K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K); + + return K; +} + +SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const { + EVT VT = Op.getValueType(); + + if (VT == MVT::f32) + return LowerFROUND32(Op, DAG); + + if (VT == MVT::f64) + return LowerFROUND64(Op, DAG); + + llvm_unreachable("unhandled type"); +} + SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const { SDLoc SL(Op); SDValue Src = Op.getOperand(0); Index: lib/Target/R600/EvergreenInstructions.td =================================================================== --- lib/Target/R600/EvergreenInstructions.td +++ lib/Target/R600/EvergreenInstructions.td @@ -590,8 +590,6 @@ // SHA-256 Patterns def : SHA256MaPattern ; -def : FROUNDPat ; - def EG_ExportSwz : ExportSwzInst { let Word1{19-16} = 0; // BURST_COUNT let Word1{20} = 0; // VALID_PIXEL_MODE Index: lib/Target/R600/R600Instructions.td =================================================================== --- lib/Target/R600/R600Instructions.td +++ lib/Target/R600/R600Instructions.td @@ -1142,16 +1142,6 @@ (exp_ieee (mul_lit (log_clamped (MAX $src_y, (f32 ZERO))), $src_w, $src_x)) >; -// FROUND pattern -class FROUNDPat : Pat < - (AMDGPUround f32:$x), - (CNDGE $x, - (CNDGE (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)), - (CNDGT (ADD (FNEG_R600 (f32 HALF)), (FRACT $x)), (CEIL $x), (FLOOR $x)) - ) ->; - - //===----------------------------------------------------------------------===// // R600 / R700 Instructions //===----------------------------------------------------------------------===// @@ -1195,8 +1185,6 @@ def : Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>; defm : RsqPat; - def : FROUNDPat ; - def R600_ExportSwz : ExportSwzInst { let Word1{20-17} = 0; // BURST_COUNT let Word1{21} = eop; Index: test/CodeGen/R600/llvm.round.f64.ll =================================================================== --- /dev/null +++ test/CodeGen/R600/llvm.round.f64.ll @@ -0,0 +1,74 @@ +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f64: +; SI: s_endpgm +define void @round_f64(double addrspace(1)* %out, double %x) #0 { + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out + ret void +} + +; This is a pretty large function, so just test a few of the +; instructions that are necessary. + +; FUNC-LABEL: {{^}}v_round_f64: +; SI: buffer_load_dwordx2 +; SI: v_bfe_u32 [[EXP:v[0-9]+]], v{{[0-9]+}}, 20, 11 + +; SI: v_not_b32_e32 +; SI: v_not_b32_e32 + +; SI: v_cmp_eq_i32 + +; SI: s_mov_b32 [[BFIMASK:s[0-9]+]], 0x7fffffff +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[BFIMASK]] + +; SI: v_cmp_lt_i32_e64 +; SI: v_cmp_gt_i32_e64 + + +; SI: buffer_store_dwordx2 +; SI: s_endpgm +define void @v_round_f64(double addrspace(1)* %out, double addrspace(1)* %in) #0 { + %tid = call i32 @llvm.r600.read.tidig.x() #1 + %gep = getelementptr double addrspace(1)* %in, i32 %tid + %out.gep = getelementptr double addrspace(1)* %out, i32 %tid + %x = load double addrspace(1)* %gep + %result = call double @llvm.round.f64(double %x) #1 + store double %result, double addrspace(1)* %out.gep + ret void +} + +; FUNC-LABEL: {{^}}round_v2f64: +; SI: s_endpgm +define void @round_v2f64(<2 x double> addrspace(1)* %out, <2 x double> %in) #0 { + %result = call <2 x double> @llvm.round.v2f64(<2 x double> %in) #1 + store <2 x double> %result, <2 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v4f64: +; SI: s_endpgm +define void @round_v4f64(<4 x double> addrspace(1)* %out, <4 x double> %in) #0 { + %result = call <4 x double> @llvm.round.v4f64(<4 x double> %in) #1 + store <4 x double> %result, <4 x double> addrspace(1)* %out + ret void +} + +; FUNC-LABEL: {{^}}round_v8f64: +; SI: s_endpgm +define void @round_v8f64(<8 x double> addrspace(1)* %out, <8 x double> %in) #0 { + %result = call <8 x double> @llvm.round.v8f64(<8 x double> %in) #1 + store <8 x double> %result, <8 x double> addrspace(1)* %out + ret void +} + +declare i32 @llvm.r600.read.tidig.x() #1 + +declare double @llvm.round.f64(double) #1 +declare <2 x double> @llvm.round.v2f64(<2 x double>) #1 +declare <4 x double> @llvm.round.v4f64(<4 x double>) #1 +declare <8 x double> @llvm.round.v8f64(<8 x double>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone } Index: test/CodeGen/R600/llvm.round.ll =================================================================== --- test/CodeGen/R600/llvm.round.ll +++ test/CodeGen/R600/llvm.round.ll @@ -1,17 +1,27 @@ -; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck %s --check-prefix=R600 --check-prefix=FUNC - -; FUNC-LABEL: {{^}}f32: -; R600: FRACT {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] -; R600-DAG: ADD {{.*}}, -0.5 -; R600-DAG: CEIL {{.*}} [[ARG]] -; R600-DAG: FLOOR {{.*}} [[ARG]] -; R600-DAG: CNDGE -; R600-DAG: CNDGT -; R600: CNDGE {{[^,]+}}, [[ARG]] -define void @f32(float addrspace(1)* %out, float %in) { -entry: - %0 = call float @llvm.round.f32(float %in) - store float %0, float addrspace(1)* %out +; RUN: llc -march=amdgcn -mcpu=SI < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; RUN: llc -march=r600 -mcpu=redwood < %s | FileCheck -check-prefix=R600 -check-prefix=FUNC %s + +; FUNC-LABEL: {{^}}round_f32: +; SI-DAG: s_load_dword [[SX:s[0-9]+]] +; SI-DAG: v_mov_b32_e32 [[VX:v[0-9]+]], [[SX]] +; SI-DAG: s_mov_b32 [[K:s[0-9]+]], 0x7fffffff +; SI: v_bfi_b32 [[COPYSIGN:v[0-9]+]], [[K]], 1.0, [[VX]] +; SI: v_trunc_f32_e32 [[TRUNC:v[0-9]+]], [[SX]] +; SI: v_sub_f32_e32 [[SUB:v[0-9]+]], [[SX]], [[TRUNC]] +; SI: v_cmp_ge_f32_e64 [[CMP:s\[[0-9]+:[0-9]+\]]], |[[SUB]]|, 0.5 +; SI: v_cndmask_b32_e64 [[SEL:v[0-9]+]], 0, [[VX]], [[CMP]] +; SI: v_add_f32_e32 [[RESULT:v[0-9]+]], [[SEL]], [[TRUNC]] +; SI: buffer_store_dword [[RESULT]] + +; R600: TRUNC {{.*}}, [[ARG:KC[0-9]\[[0-9]+\]\.[XYZW]]] +; R600-DAG: ADD {{.*}}, +; R600-DAG: BFI_INT +; R600-DAG: SETGE +; R600-DAG: CNDE +; R600-DAG: ADD +define void @round_f32(float addrspace(1)* %out, float %x) #0 { + %result = call float @llvm.round.f32(float %x) #1 + store float %result, float addrspace(1)* %out ret void } @@ -20,24 +30,37 @@ ; a test for the scalar case, so the vector tests just check that the ; compiler doesn't crash. -; FUNC-LABEL: v2f32 +; FUNC-LABEL: {{^}}round_v2f32: +; SI: s_endpgm ; R600: CF_END -define void @v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) { -entry: - %0 = call <2 x float> @llvm.round.v2f32(<2 x float> %in) - store <2 x float> %0, <2 x float> addrspace(1)* %out +define void @round_v2f32(<2 x float> addrspace(1)* %out, <2 x float> %in) #0 { + %result = call <2 x float> @llvm.round.v2f32(<2 x float> %in) #1 + store <2 x float> %result, <2 x float> addrspace(1)* %out ret void } -; FUNC-LABEL: v4f32 +; FUNC-LABEL: {{^}}round_v4f32: +; SI: s_endpgm ; R600: CF_END -define void @v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) { -entry: - %0 = call <4 x float> @llvm.round.v4f32(<4 x float> %in) - store <4 x float> %0, <4 x float> addrspace(1)* %out +define void @round_v4f32(<4 x float> addrspace(1)* %out, <4 x float> %in) #0 { + %result = call <4 x float> @llvm.round.v4f32(<4 x float> %in) #1 + store <4 x float> %result, <4 x float> addrspace(1)* %out ret void } -declare float @llvm.round.f32(float) -declare <2 x float> @llvm.round.v2f32(<2 x float>) -declare <4 x float> @llvm.round.v4f32(<4 x float>) +; FUNC-LABEL: {{^}}round_v8f32: +; SI: s_endpgm +; R600: CF_END +define void @round_v8f32(<8 x float> addrspace(1)* %out, <8 x float> %in) #0 { + %result = call <8 x float> @llvm.round.v8f32(<8 x float> %in) #1 + store <8 x float> %result, <8 x float> addrspace(1)* %out + ret void +} + +declare float @llvm.round.f32(float) #1 +declare <2 x float> @llvm.round.v2f32(<2 x float>) #1 +declare <4 x float> @llvm.round.v4f32(<4 x float>) #1 +declare <8 x float> @llvm.round.v8f32(<8 x float>) #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }