Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -167,6 +167,8 @@
 
   SDValue performUCharToFloatCombine(SDNode *N,
                                      DAGCombinerInfo &DCI) const;
+  SDValue performFCopySignCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+
   SDValue performSHLPtrCombine(SDNode *N,
                                unsigned AS,
                                EVT MemVT,
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -769,7 +769,8 @@
                        ISD::ZERO_EXTEND,
                        ISD::SIGN_EXTEND_INREG,
                        ISD::EXTRACT_VECTOR_ELT,
-                       ISD::INSERT_VECTOR_ELT});
+                       ISD::INSERT_VECTOR_ELT,
+                       ISD::FCOPYSIGN});
 
   // All memory operations. Some folding on the pointer operand is done to help
   // matching the constant offsets in the addressing modes.
@@ -9428,6 +9429,32 @@
   return SDValue();
 }
 
+// Shrink an f64 sign operand of FCOPYSIGN to f32. Only the sign bit is read,
+// and it lives in the high dword. Returns an empty SDValue (no replacement)
+// when the sign operand is not f64.
+SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
+                                                  DAGCombinerInfo &DCI) const {
+  SDValue SignOp = N->getOperand(1);
+  if (SignOp.getValueType() != MVT::f64)
+    return SDValue();
+
+  SelectionDAG &DAG = DCI.DAG;
+  SDLoc DL(N);
+
+  // Reduce width of sign operand, we only need the highest bit.
+  //
+  // fcopysign f64:x, f64:y ->
+  //   fcopysign f64:x, (extract_vector_elt (bitcast f64:y to v2f32), 1)
+  // TODO: In some cases it might make sense to go all the way to f16.
+  SDValue SignAsVector = DAG.getNode(ISD::BITCAST, DL, MVT::v2f32, SignOp);
+  SDValue SignAsF32 =
+      DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, SignAsVector,
+                  DAG.getConstant(1, DL, MVT::i32));
+
+  return DAG.getNode(ISD::FCOPYSIGN, DL, N->getValueType(0), N->getOperand(0),
+                     SignAsF32);
+}
+
 // (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
 
 // This is a variant of
@@ -11666,6 +11693,8 @@
   case ISD::SINT_TO_FP:
   case ISD::UINT_TO_FP:
     return performUCharToFloatCombine(N, DCI);
+  case ISD::FCOPYSIGN:
+    return performFCopySignCombine(N, DCI);
   case AMDGPUISD::CVT_F32_UBYTE0:
   case AMDGPUISD::CVT_F32_UBYTE1:
   case AMDGPUISD::CVT_F32_UBYTE2:
Index: llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
+++ llvm/test/CodeGen/AMDGPU/fcopysign.f16.ll
@@ -63,7 +63,7 @@
 
 ; GCN-LABEL: {{^}}test_copysign_out_f64_mag_f16_sign_f64:
 ; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]]
+; GCN-DAG: {{buffer|flat|global}}_load_dword v[[SIGN_HI:[0-9]+]]
 ; GCN-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; GCN-DAG: v_cvt_f32_f16_e32 v[[MAG_EXT:[0-9]+]], v[[MAG]]
 ; GCN-DAG: v_cvt_f64_f32_e32 v[[[MAG_EXT_LO:[0-9]+]]:[[MAG_EXT_HI:[0-9]+]]], v[[MAG_EXT]]
@@ -168,7 +168,7 @@
 
 ; GCN-LABEL: {{^}}test_copysign_out_f16_mag_f16_sign_f64:
 ; GCN-DAG: {{buffer|flat|global}}_load_ushort v[[MAG:[0-9]+]]
-; GCN-DAG: {{buffer|flat|global}}_load_dwordx2 v[[[SIGN_LO:[0-9]+]]:[[SIGN_HI:[0-9]+]]]
+; GCN-DAG: {{buffer|flat|global}}_load_dword v[[SIGN_HI:[0-9]+]]
 ; SI-DAG: s_brev_b32 s[[CONST:[0-9]+]], -2
 ; SI-DAG: v_cvt_f32_f16_e32 v[[MAG_F32:[0-9]+]], v[[MAG]]
 ; SI: v_bfi_b32 v[[OUT_F32:[0-9]+]], s[[CONST]], v[[MAG_F32]], v[[SIGN_HI]]
Index: llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
+++ llvm/test/CodeGen/AMDGPU/fcopysign.f64.ll
@@ -7,9 +7,9 @@
 
 ; FUNC-LABEL: {{^}}test_copysign_f64:
 ; SI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x13
-; SI-DAG: s_load_dwordx2 s[[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x1d
+; SI-DAG: s_load_dword s[[SSIGN_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x1e
 ; VI-DAG: s_load_dwordx2 s[[[SMAG_LO:[0-9]+]]:[[SMAG_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x4c
-; VI-DAG: s_load_dwordx2 s[[[SSIGN_LO:[0-9]+]]:[[SSIGN_HI:[0-9]+]]], s{{\[[0-9]+:[0-9]+\]}}, 0x74
+; VI-DAG: s_load_dword s[[SSIGN_HI:[0-9]+]], s{{\[[0-9]+:[0-9]+\]}}, 0x78
 ; GCN-DAG: v_mov_b32_e32 v[[VSIGN_HI:[0-9]+]], s[[SSIGN_HI]]
 ; GCN-DAG: v_mov_b32_e32 v[[VMAG_HI:[0-9]+]], s[[SMAG_HI]]
 ; GCN-DAG: s_brev_b32 [[SCONST:s[0-9]+]], -2