Index: llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -14148,6 +14148,16 @@
     return SDValue();
   }
 
+  // This needs to be the inverse of logic in foldSignChangeInBitcast.
+  // FIXME: I don't think looking for bitcast intrinsically makes sense, but
+  // removing this would require more changes.
+  auto IsBitCastOrFree = [&TLI, FPOpcode](SDValue Op, EVT VT) {
+    if (Op.getOpcode() == ISD::BITCAST && Op.getOperand(0).getValueType() == VT)
+      return true;
+
+    return FPOpcode == ISD::FABS ? TLI.isFAbsFree(VT) : TLI.isFNegFree(VT);
+  };
+
   // Fold (bitcast int (and (bitcast fp X to int), 0x7fff...) to fp) -> fabs X
   // Fold (bitcast int (xor (bitcast fp X to int), 0x8000...) to fp) -> fneg X
   // Fold (bitcast int (or (bitcast fp X to int), 0x8000...) to fp) ->
@@ -14155,9 +14165,9 @@
   SDValue LogicOp0 = N0.getOperand(0);
   ConstantSDNode *LogicOp1 = isConstOrConstSplat(N0.getOperand(1), true);
   if (LogicOp1 && LogicOp1->getAPIntValue() == SignMask &&
-      LogicOp0.getOpcode() == ISD::BITCAST &&
-      LogicOp0.getOperand(0).getValueType() == VT) {
-    SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, LogicOp0.getOperand(0));
+      IsBitCastOrFree(LogicOp0, VT)) {
+    SDValue CastOp0 = DAG.getNode(ISD::BITCAST, SDLoc(N), VT, LogicOp0);
+    SDValue FPOp = DAG.getNode(FPOpcode, SDLoc(N), VT, CastOp0);
     NumFPLogicOpsConv++;
     if (N0.getOpcode() == ISD::OR)
       return DAG.getNode(ISD::FNEG, SDLoc(N), VT, FPOp);
Index: llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
+++ llvm/test/CodeGen/AMDGPU/fneg-modifier-casting.ll
@@ -485,10 +485,9 @@
 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0
 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
-; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v1, vcc
-; GCN-NEXT: v_cndmask_b32_e32 v1, v4, v2, vcc
-; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 2.0
+; GCN-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GCN-NEXT: v_add_f64 v[0:1], -v[1:2], 2.0
 ; GCN-NEXT: s_setpc_b64 s[30:31]
 ;
 ; GFX11-LABEL: fneg_xor_select_i64_user_with_srcmods:
@@ -498,10 +497,8 @@
 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
-; GFX11-NEXT: v_dual_cndmask_b32 v0, v3, v1 :: v_dual_cndmask_b32 v1, v4, v2
-; GFX11-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT: v_add_f64 v[0:1], v[0:1], 2.0
+; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v1 :: v_dual_cndmask_b32 v2, v4, v2
+; GFX11-NEXT: v_add_f64 v[0:1], -v[1:2], 2.0
 ; GFX11-NEXT: s_setpc_b64 s[30:31]
   %select = select i1 %cond, i64 %arg0, i64 %arg1
   %fneg = xor i64 %select, 9223372036854775808
Index: llvm/test/CodeGen/AMDGPU/fneg.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/fneg.ll
+++ llvm/test/CodeGen/AMDGPU/fneg.ll
@@ -120,8 +120,7 @@
 
 ; FUNC-LABEL: {{^}}s_fneg_i32_fp_use:
 ; GCN: s_load_dword [[IN:s[0-9]+]]
-; GCN: s_xor_b32 [[FNEG:s[0-9]+]], [[IN]], 0x80000000
-; GCN: v_add_f32_e64 v{{[0-9]+}}, [[FNEG]], 2.0
+; GCN: v_sub_f32_e64 v{{[0-9]+}}, 2.0, [[IN]]
 define amdgpu_kernel void @s_fneg_i32_fp_use(ptr addrspace(1) %out, i32 %in) {
   %fneg = xor i32 %in, -2147483648
   %bitcast = bitcast i32 %fneg to float
@@ -132,8 +131,7 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_i32_fp_use:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
-; GCN-NEXT: v_add_f32_e32 v0, 2.0, v0
+; GCN-NEXT: v_sub_f32_e32 v0, 2.0, v0
 ; GCN-NEXT: s_setpc_b64
 define float @v_fneg_i32_fp_use(i32 %in) {
   %fneg = xor i32 %in, -2147483648
@@ -160,8 +158,7 @@
 }
 
 ; FUNC-LABEL: {{^}}s_fneg_i64_fp_use:
-; GCN: s_xor_b32 s[[NEG_HI:[0-9]+]], s{{[0-9]+}}, 0x80000000
-; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 2.0
+; GCN: v_add_f64 v{{\[[0-9]+:[0-9]+\]}}, -s{{\[[0-9]+:[0-9]+\]}}, 2.0
 define amdgpu_kernel void @s_fneg_i64_fp_use(ptr addrspace(1) %out, i64 %in) {
   %fneg = xor i64 %in, -9223372036854775808
   %bitcast = bitcast i64 %fneg to double
@@ -172,8 +169,7 @@
 
 ; FUNC-LABEL: {{^}}v_fneg_i64_fp_use:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v1
-; GCN-NEXT: v_add_f64 v[0:1], v[0:1], 2.0
+; GCN-NEXT: v_add_f64 v[0:1], -v[0:1], 2.0
 ; GCN-NEXT: s_setpc_b64
 define double @v_fneg_i64_fp_use(i64 %in) {
   %fneg = xor i64 %in, -9223372036854775808
@@ -197,9 +193,7 @@
 ; SI: v_cvt_f16_f32_e32 [[CVT1:v[0-9]+]], [[ADD]]
 
 ; VI: s_load_dword [[IN:s[0-9]+]]
-; VI: v_mov_b32_e32 [[K:v[0-9]+]], 0xffff8000
-; VI: v_xor_b32_e32 [[NEG:v[0-9]+]], [[IN]], [[K]]
-; VI: v_add_f16_e32 v{{[0-9]+}}, 2.0, [[NEG]]
+; VI: v_sub_f16_e64 v{{[0-9]+}}, 2.0, [[IN]]
 define amdgpu_kernel void @s_fneg_i16_fp_use(ptr addrspace(1) %out, i16 %in) {
   %fneg = xor i16 %in, -32768
   %bitcast = bitcast i16 %fneg to half
@@ -215,8 +209,7 @@
 ; SI-NEXT: s_setpc_b64
 
 ; VI: s_waitcnt
-; VI-NEXT: v_xor_b32_e32 v0, 0xffff8000, v0
-; VI-NEXT: v_add_f16_e32 v0, 2.0, v0
+; VI-NEXT: v_sub_f16_e32 v0, 2.0, v0
 ; VI-NEXT: s_setpc_b64
 define half @v_fneg_i16_fp_use(i16 %in) {
   %fneg = xor i16 %in, -32768
@@ -291,10 +284,9 @@
 ; SI: v_add_f32_e32 v1, 2.0, v1
 
 ; VI: s_waitcnt
-; VI: v_xor_b32_e32 v0, 0x80008000, v0
 ; VI: v_mov_b32_e32 v1, 0x4000
-; VI: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; VI: v_add_f16_e32 v0, 2.0, v0
+; VI: v_sub_f16_sdwa v1, v1, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; VI: v_sub_f16_e32 v0, 2.0, v0
 ; VI: v_or_b32_e32 v0, v0, v1
 ; VI: s_setpc_b64
 define <2 x half> @v_fneg_v2i16_fp_use(i32 %arg) {
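
Note (illustration, not part of the patch): the combine now forms fneg/fabs from integer sign-bit arithmetic whenever the target reports the FP operation as free (TLI.isFNegFree / TLI.isFAbsFree), rather than only when the logic operand is itself a bitcast from the FP type. A minimal IR sketch of the shape being matched, adapted from the s_fneg_i32_fp_use/v_fneg_i32_fp_use tests above (the function name here is hypothetical):

define float @fneg_via_int_xor(i32 %in) {
  ; Flip the IEEE-754 sign bit in the integer domain (xor with 0x80000000).
  %fneg = xor i32 %in, -2147483648
  ; The xor result is only ever used as a float.
  %bitcast = bitcast i32 %fneg to float
  %add = fadd float %bitcast, 2.0
  ret float %add
}

DAGCombiner folds the (xor X, SignMask) into (fneg (bitcast X)) even though %in is not produced by a bitcast. Because AMDGPU reports fneg as free (it folds into a source modifier), the former v_xor_b32/v_add_f32 pair collapses into a single v_sub_f32_e32 v0, 2.0, v0, as the updated checks show.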