Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h @@ -692,7 +692,7 @@ void visitAdd(const User &I) { visitBinary(I, ISD::ADD); } void visitFAdd(const User &I) { visitBinary(I, ISD::FADD); } void visitSub(const User &I) { visitBinary(I, ISD::SUB); } - void visitFSub(const User &I); + void visitFSub(const User &I) { visitBinary(I, ISD::FSUB); } void visitMul(const User &I) { visitBinary(I, ISD::MUL); } void visitFMul(const User &I) { visitBinary(I, ISD::FMUL); } void visitURem(const User &I) { visitBinary(I, ISD::UREM); } Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3025,20 +3025,6 @@ DAG.setRoot(DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, DAG.getRoot())); } -void SelectionDAGBuilder::visitFSub(const User &I) { - // -0.0 - X --> fneg - Type *Ty = I.getType(); - if (isa<Constant>(I.getOperand(0)) && - I.getOperand(0) == ConstantFP::getZeroValueForNegation(Ty)) { - SDValue Op2 = getValue(I.getOperand(1)); - setValue(&I, DAG.getNode(ISD::FNEG, getCurSDLoc(), - Op2.getValueType(), Op2)); - return; - } - - visitBinary(I, ISD::FSUB); -} - void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) { SDNodeFlags Flags; Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp =================================================================== --- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -5707,6 +5707,18 @@ return Op.getOperand(0); } + // Treat fsub(-0.0,x) the same as fneg. 
+ if (Op.getOpcode() == ISD::FSUB) { + ConstantFPSDNode *N0C = isConstOrConstSplatFP(Op.getOperand(0), true); + if (N0C && N0C->isZero()) { + // TODO: Handle NSZ cases. + if (N0C->isNegative()) { + Cost = NegatibleCost::Cheaper; + return Op.getOperand(1); + } + } + } + // Don't recurse exponentially. if (Depth > SelectionDAG::MaxRecursionDepth) return SDValue(); Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -9241,7 +9241,6 @@ switch (Opcode) { // These will flush denorms if required. case ISD::FADD: - case ISD::FSUB: case ISD::FMUL: case ISD::FCEIL: case ISD::FFLOOR: @@ -9271,6 +9270,16 @@ case AMDGPUISD::CVT_F32_UBYTE3: return true; + case ISD::FSUB: { + // FSUB(-0.0,X) can be lowered or combined as a bit operation. + // Need to check its input recursively to handle. + ConstantFPSDNode *N0C = isConstOrConstSplatFP(Op.getOperand(0), true); + // TODO: Handle NSZ. + if (N0C && N0C->isZero() && N0C->isNegative()) + return isCanonicalized(DAG, Op.getOperand(1), MaxDepth - 1); + + return true; + } // It can/will be lowered or combined as a bit operation. // Need to check their input recursively to handle. case ISD::FNEG: Index: llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll +++ llvm/test/CodeGen/AMDGPU/fdiv32-to-rcp-folding.ll @@ -273,11 +273,14 @@ ret void } +; FIXME: These changes look suspicious, but they seem ok. It looks like non-determinism in +; DAGCombine is picking a different lowering, but it's the same instructions and counts. +; Although, there is a small scheduling difference with a move towards the end. 
; GCN-LABEL: {{^}}div_v4_c_by_minus_x_25ulp: -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_scale_f32 {{.*}}, 2.0{{$}} ; GCN-DENORM-DAG: v_rcp_f32_e32 ; GCN-DENORM-DAG: v_rcp_f32_e32 @@ -298,8 +301,8 @@ ; GCN-DENORM-DAG: v_div_fmas_f32 ; GCN-DENORM-DAG: v_div_fmas_f32 -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} -; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, -2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}} +; GCN-DENORM-DAG: v_div_fixup_f32 {{.*}}, 2.0{{$}} ; GCN-FLUSH-DAG: v_rcp_f32_e32 ; GCN-FLUSH-DAG: v_rcp_f32_e64 Index: llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll +++ llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fmed3.ll @@ -30,9 +30,10 @@ ret void } +; NOTE: Seems equivalent, but I'm not sure. The instruction changed. ; GCN-LABEL: {{^}}test_fneg_fmed3_multi_use: ; GCN: v_med3_f32 [[MED3:v[0-9]+]], -s{{[0-9]+}}, -v{{[0-9]+}}, -v{{[0-9]+}} -; GCN: v_mul_f32_e32 v{{[0-9]+}}, -4.0, [[MED3]] +; GCN: v_mul_f32_e64 v{{[0-9]+}}, -[[MED3]], 4.0 define amdgpu_kernel void @test_fneg_fmed3_multi_use(float addrspace(1)* %out, float %src0, float %src1, float %src2) #1 { %med3 = call float @llvm.amdgcn.fmed3.f32(float %src0, float %src1, float %src2) %neg.med3 = fsub float -0.0, %med3 Index: llvm/test/CodeGen/AMDGPU/selectcc-opt.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/selectcc-opt.ll +++ llvm/test/CodeGen/AMDGPU/selectcc-opt.ll @@ -7,11 +7,14 @@ ; EG-NOT: CND ; EG: SET{{[NEQGTL]+}}_DX10 +; NOTE: Don't think we can make FSUB(-0,X) work here. 
The FSUB(-0,X) is +; being combined with the SELECT. Can't prevent that xform without +; affecting other targets. define amdgpu_kernel void @test_a(i32 addrspace(1)* %out, float %in) { entry: %0 = fcmp olt float %in, 0.000000e+00 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 %4 = bitcast i32 %3 to float %5 = bitcast float %4 to i32 @@ -39,7 +42,7 @@ entry: %0 = fcmp olt float %in, 0.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 %4 = bitcast i32 %3 to float %5 = bitcast float %4 to i32 Index: llvm/test/CodeGen/AMDGPU/set-dx10.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/set-dx10.ll +++ llvm/test/CodeGen/AMDGPU/set-dx10.ll @@ -12,7 +12,7 @@ entry: %0 = fcmp une float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -38,7 +38,7 @@ entry: %0 = fcmp oeq float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -64,7 +64,7 @@ entry: %0 = fcmp ogt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -90,7 +90,7 @@ entry: %0 = fcmp oge float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -116,7 +116,7 @@ entry: %0 = fcmp ole float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, 
%1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void @@ -142,7 +142,7 @@ entry: %0 = fcmp olt float %in, 5.0 %1 = select i1 %0, float 1.000000e+00, float 0.000000e+00 - %2 = fsub float -0.000000e+00, %1 + %2 = fneg float %1 %3 = fptosi float %2 to i32 store i32 %3, i32 addrspace(1)* %out ret void