Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -167,6 +167,8 @@
   SDValue performUCharToFloatCombine(SDNode *N, DAGCombinerInfo &DCI) const;
 
+  SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
   SDValue performSHLPtrCombine(SDNode *N, unsigned AS, EVT MemVT,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -769,7 +769,8 @@
                        ISD::ZERO_EXTEND,
                        ISD::SIGN_EXTEND_INREG,
                        ISD::EXTRACT_VECTOR_ELT,
-                       ISD::INSERT_VECTOR_ELT});
+                       ISD::INSERT_VECTOR_ELT,
+                       ISD::FCOPYSIGN});
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -9428,6 +9429,29 @@
   return SDValue();
 }
 
+SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue SignOp = N->getOperand(1);
+  if (SignOp.getValueType() != MVT::f64)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Reduce width of sign operand, we only need the highest bit.
+  //
+  // fcopysign f64:x, f64:y ->
+  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
+  // TODO: In some cases it might make sense to go all the way to f16.
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
+  SDValue SignAsF32 =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+                  DAG.getConstant(1, DL, MVT::i32));
+
+  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
+                     SignAsF32);
+}
+
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
 
 // This is a variant of
@@ -11666,6 +11690,8 @@
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return performUCharToFloatCombine(N, DCI);
+  case ISD::FCOPYSIGN:
+    return performFCopySignCombine(N, DCI);
   case AMDGPUISD::CVT_F32_UBYTE0:
   case AMDGPUISD::CVT_F32_UBYTE1:
   case AMDGPUISD::CVT_F32_UBYTE2:
Index: llvm/test/CodeGen/AMDGPU/fnearbyint.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fnearbyint.ll
+++ llvm/test/CodeGen/AMDGPU/fnearbyint.ll
@@ -159,9 +159,8 @@
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
 ; SI-NEXT: s_mov_b32 s4, s0
 ; SI-NEXT: s_mov_b32 s5, s1
-; SI-NEXT: v_mov_b32_e32 v4, s3
-; SI-NEXT: v_bfi_b32 v1, s8, v1, v4
 ; SI-NEXT: v_mov_b32_e32 v6, s3
+; SI-NEXT: v_bfi_b32 v1, s8, v1, v6
 ; SI-NEXT: v_mov_b32_e32 v7, s2
 ; SI-NEXT: v_add_f64 v[4:5], s[2:3], v[0:1]
 ; SI-NEXT: v_add_f64 v[0:1], v[4:5], -v[0:1]
@@ -210,13 +209,11 @@
 ; SI-NEXT: v_mov_b32_e32 v4, s8
 ; SI-NEXT: v_mov_b32_e32 v5, s9
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v1, s3
-; SI-NEXT: v_bfi_b32 v1, s10, v6, v1
 ; SI-NEXT: v_mov_b32_e32 v7, s3
+; SI-NEXT: v_bfi_b32 v1, s10, v6, v7
 ; SI-NEXT: v_mov_b32_e32 v8, s2
 ; SI-NEXT: v_mov_b32_e32 v9, s1
-; SI-NEXT: v_mov_b32_e32 v10, s1
-; SI-NEXT: v_mov_b32_e32 v11, s0
+; SI-NEXT: v_mov_b32_e32 v10, s0
 ; SI-NEXT: v_add_f64 v[2:3], s[2:3], v[0:1]
 ; SI-NEXT: v_add_f64 v[2:3], v[2:3], -v[0:1]
 ; SI-NEXT: v_bfi_b32 v1, s10, v6, v9
@@ -226,8 +223,8 @@
 ; SI-NEXT: v_add_f64 v[6:7], s[0:1], v[0:1]
 ; SI-NEXT: v_add_f64 v[0:1], v[6:7], -v[0:1]
 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[0:1]|, v[4:5]
-; SI-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc
-; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
+; SI-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc
+; SI-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0
 ; SI-NEXT: s_endpgm
 ;
@@ -275,22 +272,18 @@
 ; SI-NEXT: v_mov_b32_e32 v8, s12
 ; SI-NEXT: v_mov_b32_e32 v9, s13
 ; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_mov_b32_e32 v0, s3
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v0
 ; SI-NEXT: v_mov_b32_e32 v2, s3
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v2
 ; SI-NEXT: v_mov_b32_e32 v6, s2
-; SI-NEXT: v_mov_b32_e32 v3, s1
 ; SI-NEXT: v_mov_b32_e32 v7, s1
 ; SI-NEXT: v_mov_b32_e32 v11, s0
 ; SI-NEXT: v_mov_b32_e32 v12, s7
-; SI-NEXT: v_mov_b32_e32 v13, s7
-; SI-NEXT: v_mov_b32_e32 v14, s6
-; SI-NEXT: v_mov_b32_e32 v15, s5
-; SI-NEXT: v_mov_b32_e32 v16, s5
-; SI-NEXT: v_mov_b32_e32 v17, s4
+; SI-NEXT: v_mov_b32_e32 v13, s6
+; SI-NEXT: v_mov_b32_e32 v14, s5
+; SI-NEXT: v_mov_b32_e32 v15, s4
 ; SI-NEXT: v_add_f64 v[0:1], s[2:3], v[4:5]
 ; SI-NEXT: v_add_f64 v[0:1], v[0:1], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v3
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v7
 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[2:3]|, v[8:9]
 ; SI-NEXT: v_cndmask_b32_e32 v3, v1, v2, vcc
 ; SI-NEXT: v_cndmask_b32_e32 v2, v0, v6, vcc
@@ -302,15 +295,15 @@
 ; SI-NEXT: v_cndmask_b32_e32 v0, v0, v11, vcc
 ; SI-NEXT: v_add_f64 v[6:7], s[6:7], v[4:5]
 ; SI-NEXT: v_add_f64 v[6:7], v[6:7], -v[4:5]
-; SI-NEXT: v_bfi_b32 v5, s14, v10, v15
+; SI-NEXT: v_bfi_b32 v5, s14, v10, v14
 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[6:7]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v7, v7, v13, vcc
-; SI-NEXT: v_cndmask_b32_e32 v6, v6, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v7, v7, v12, vcc
+; SI-NEXT: v_cndmask_b32_e32 v6, v6, v13, vcc
 ; SI-NEXT: v_add_f64 v[10:11], s[4:5], v[4:5]
 ; SI-NEXT: v_add_f64 v[4:5], v[10:11], -v[4:5]
 ; SI-NEXT: v_cmp_gt_f64_e64 vcc, |s[4:5]|, v[8:9]
-; SI-NEXT: v_cndmask_b32_e32 v5, v5, v16, vcc
-; SI-NEXT: v_cndmask_b32_e32 v4, v4, v17, vcc
+; SI-NEXT: v_cndmask_b32_e32 v5, v5, v14, vcc
+; SI-NEXT: v_cndmask_b32_e32 v4, v4, v15, vcc
 ; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[8:11], 0 offset:16
 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0
 ; SI-NEXT: s_endpgm
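
Note for reviewers: the bit-level reasoning the combine relies on (copysign on an f64 only consults the sign bit of the sign operand, which lives in its high 32 bits, so that operand can be narrowed to the f32 holding that half) can be sanity-checked on the host with the small standalone C++20 sketch below. This is illustrative only and not part of the patch; the helper name copysignNarrowed is made up here.

// Illustrative sketch (not part of the patch): checks that copysign on f64
// depends only on the high 32 bits of the sign operand, which is what the
// DAG combine relies on when it narrows the sign operand to f32.
#include <bit>
#include <cassert>
#include <cmath>
#include <cstdint>

// Hypothetical helper: take the sign from the high dword of 'Sign',
// reinterpreted as an f32, mirroring the extract_vector_elt in the combine.
static double copysignNarrowed(double Mag, double Sign) {
  uint64_t SignBits = std::bit_cast<uint64_t>(Sign);
  float HighHalf = std::bit_cast<float>(static_cast<uint32_t>(SignBits >> 32));
  return std::copysign(Mag, static_cast<double>(HighHalf));
}

int main() {
  const double Mags[] = {0.0, -0.0, 1.5, -2.25, 1e308};
  const double Signs[] = {0.0, -0.0, 3.0, -4.5, -1e-308};
  for (double M : Mags)
    for (double S : Signs)
      assert(std::bit_cast<uint64_t>(std::copysign(M, S)) ==
             std::bit_cast<uint64_t>(copysignNarrowed(M, S)));
  return 0;
}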