Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9751,6 +9751,59 @@
     }
   }
 
+  // fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
+  // fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
+  // nnan lets ordered and unordered predicates be handled alike; nsz is needed
+  // because the sign of a zero result can differ from the original multiply.
+  if (Flags.hasNoNaNs() && Flags.hasNoSignedZeros() &&
+      (N0.getOpcode() == ISD::SELECT || N1.getOpcode() == ISD::SELECT) &&
+      TLI.isOperationLegal(ISD::FABS, VT)) {
+    // fmul is commutative: the select may be either operand.
+    SDValue Select = N0, X = N1;
+    if (Select.getOpcode() != ISD::SELECT)
+      std::swap(Select, X);
+
+    SDValue Cond = Select.getOperand(0);
+    auto TrueOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(1));
+    auto FalseOpnd = dyn_cast<ConstantFPSDNode>(Select.getOperand(2));
+
+    // Match a select between two FP constants whose condition compares X
+    // itself against 0.0.
+    if (TrueOpnd && FalseOpnd &&
+        Cond.getOpcode() == ISD::SETCC && Cond.getOperand(0) == X &&
+        isa<ConstantFPSDNode>(Cond.getOperand(1)) &&
+        cast<ConstantFPSDNode>(Cond.getOperand(1))->isExactlyValue(0.0)) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
+      switch (CC) {
+      default: break;
+      case ISD::SETOLT:
+      case ISD::SETULT:
+      case ISD::SETOLE:
+      case ISD::SETULE:
+      case ISD::SETLT:
+      case ISD::SETLE:
+        // "X < 0" mirrors "X > 0": exchange the select arms and share the
+        // greater-than handling below.
+        std::swap(TrueOpnd, FalseOpnd);
+        // Fall through
+      case ISD::SETOGT:
+      case ISD::SETUGT:
+      case ISD::SETOGE:
+      case ISD::SETUGE:
+      case ISD::SETGT:
+      case ISD::SETGE:
+        if (TrueOpnd->isExactlyValue(-1.0) && FalseOpnd->isExactlyValue(1.0) &&
+            TLI.isOperationLegal(ISD::FNEG, VT))
+          return DAG.getNode(ISD::FNEG, DL, VT,
+                             DAG.getNode(ISD::FABS, DL, VT, X));
+        if (TrueOpnd->isExactlyValue(1.0) && FalseOpnd->isExactlyValue(-1.0))
+          return DAG.getNode(ISD::FABS, DL, VT, X);
+
+        break;
+      }
+    }
+  }
+
   // FMUL -> FMA combines:
   if (SDValue Fused = visitFMULForFMADistributiveCombine(N)) {
     AddToWorklist(Fused.getNode());
Index: test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
===================================================================
--- /dev/null
+++ test/CodeGen/AMDGPU/fold-fmul-to-neg-abs.ll
@@ -0,0 +1,39 @@
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s
+
+; x * (x > 0 ? -1.0 : 1.0) -> -|x|, selected as an OR that sets the sign bit.
+; GCN-LABEL: {{^}}fold_mul_neg:
+; GCN: load_dword [[V:v[0-9]+]]
+; GCN: v_or_b32_e32 [[NEG:v[0-9]+]], 0x80000000, [[V]]
+; GCN: store_dword [[NEG]]
+
+define amdgpu_kernel void @fold_mul_neg(float addrspace(1)* %arg) {
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
+  %v = load float, float addrspace(1)* %gep, align 4
+  %cmp = fcmp fast ogt float %v, 0.000000e+00
+  %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00
+  %mul = fmul fast float %v, %sel
+  store float %mul, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+; x * (x < 0 ? -1.0 : 1.0) -> |x|, selected as an AND that clears the sign bit.
+; GCN-LABEL: {{^}}fold_mul_abs:
+; GCN: load_dword [[V:v[0-9]+]]
+; GCN: v_and_b32_e32 [[ABS:v[0-9]+]], 0x7fffffff, [[V]]
+; GCN: store_dword [[ABS]]
+
+define amdgpu_kernel void @fold_mul_abs(float addrspace(1)* %arg) {
+  %tid = tail call i32 @llvm.amdgcn.workitem.id.x()
+  %gep = getelementptr inbounds float, float addrspace(1)* %arg, i32 %tid
+  %v = load float, float addrspace(1)* %gep, align 4
+  %cmp = fcmp fast olt float %v, 0.000000e+00
+  %sel = select i1 %cmp, float -1.000000e+00, float 1.000000e+00
+  %mul = fmul fast float %v, %sel
+  store float %mul, float addrspace(1)* %gep, align 4
+  ret void
+}
+
+declare i32 @llvm.amdgcn.workitem.id.x() #0
+
+attributes #0 = { nounwind readnone speculatable }