Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -9678,29 +9678,67 @@ &DAG.getTarget().Options)) return GetNegatedExpression(N0, DAG, LegalOperations); - // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading - // constant pool values. - if (!TLI.isFNegFree(VT) && - N0.getOpcode() == ISD::BITCAST && - N0.getNode()->hasOneUse()) { - SDValue Int = N0.getOperand(0); - EVT IntVT = Int.getValueType(); - if (IntVT.isInteger() && !IntVT.isVector()) { - APInt SignMask; - if (N0.getValueType().isVector()) { - // For a vector, get a mask such as 0x80... per scalar element - // and splat it. - SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits()); - SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); - } else { - // For a scalar, just generate 0x80... - SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + if (TLI.isFNegFree(VT)) { + SDLoc SL(N); + unsigned Opc = N0.getOpcode(); + + // If the input is an fadd/fmul/fma with multiple uses, give up: we do not + // yet check whether the other users could absorb a negate. This both + // prevents unprofitable transformations and infinite loops: we won't + // repeatedly try to fold around a negate that has no 'good' form. + // TODO: Check if users are foldable. 
+ if ((Opc == ISD::FADD || Opc == ISD::FMUL || Opc == ISD::FMA) && + !N0.hasOneUse()) + return SDValue(); + + switch (Opc) { + case ISD::FADD: { + // (fneg (fadd x, y)) -> (fadd (fneg x), (fneg y)) + SDValue LHS = N0.getOperand(0); + SDValue RHS = N0.getOperand(1); + + if (LHS.getOpcode() != ISD::FNEG) + LHS = DAG.getNode(ISD::FNEG, SL, VT, LHS); + else + LHS = LHS.getOperand(0); + + if (RHS.getOpcode() != ISD::FNEG) + RHS = DAG.getNode(ISD::FNEG, SL, VT, RHS); + else + RHS = RHS.getOperand(0); + + SDValue Res = DAG.getNode(ISD::FADD, SL, VT, LHS, RHS); + if (!N0.hasOneUse()) + DAG.ReplaceAllUsesWith(N0, DAG.getNode(ISD::FNEG, SL, VT, Res)); + return Res; + } + default: + break; + } + } else { + // Transform fneg(bitconvert(x)) -> bitconvert(x ^ sign) to avoid loading + // constant pool values. + if (N0.getOpcode() == ISD::BITCAST && + N0.getNode()->hasOneUse()) { + SDValue Int = N0.getOperand(0); + EVT IntVT = Int.getValueType(); + if (IntVT.isInteger() && !IntVT.isVector()) { + APInt SignMask; + if (N0.getValueType().isVector()) { + // For a vector, get a mask such as 0x80... per scalar element + // and splat it. + SignMask = APInt::getSignBit(N0.getScalarValueSizeInBits()); + SignMask = APInt::getSplat(IntVT.getSizeInBits(), SignMask); + } else { + // For a scalar, just generate 0x80... 
+ SignMask = APInt::getSignBit(IntVT.getSizeInBits()); + } + SDLoc DL0(N0); + Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, + DAG.getConstant(SignMask, DL0, IntVT)); + AddToWorklist(Int.getNode()); + return DAG.getBitcast(VT, Int); } - SDLoc DL0(N0); - Int = DAG.getNode(ISD::XOR, DL0, IntVT, Int, - DAG.getConstant(SignMask, DL0, IntVT)); - AddToWorklist(Int.getNode()); - return DAG.getBitcast(VT, Int); } } Index: test/CodeGen/AMDGPU/fneg-combines.ll =================================================================== --- /dev/null +++ test/CodeGen/AMDGPU/fneg-combines.ll @@ -0,0 +1,179 @@ +; RUN: llc -march=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI -check-prefix=FUNC %s + +; GCN-LABEL: {{^}}v_fneg_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_sub_f32_e64 [[RESULT:v[0-9]+]], -[[A]], [[B]] +; GCN-NEXT: buffer_store_dword [[RESULT]] +define void @v_fneg_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, %add + store float %fneg, float addrspace(1)* %out.gep + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_store_use_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword 
[[ADD]] +define void @v_fneg_add_store_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + store volatile float %add, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_multi_use_add_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_xor_b32_e32 [[NEG_ADD:v[0-9]+]], 0x80000000, [[ADD]] +; GCN: v_mul_f32_e32 [[MUL:v[0-9]+]], 4.0, [[ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_add_multi_use_add_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %add = fadd float %a, %b + %fneg = fsub float -0.000000e+00, %add + %use1 = fmul float %add, 4.0 + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: 
{{^}}v_fneg_add_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_x_fneg_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_subrev_f32_e32 [[ADD:v[0-9]+]], [[A]], [[B]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_x_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.b = fsub float -0.000000e+00, %b + %add = fadd float %a, %fneg.b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_fneg_fneg_f32: +; GCN: 
{{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN: v_add_f32_e32 [[ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[ADD]] +define void @v_fneg_add_fneg_fneg_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %fneg.b = fsub float -0.000000e+00, %b + %add = fadd float %fneg.a, %fneg.b + %fneg = fsub float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_store_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_xor_b32_e32 [[NEG_A:v[0-9]+]], 0x80000000, [[A]] +; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[NEG_A]] +define void @v_fneg_add_store_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub 
float -0.000000e+00, %add + store volatile float %fneg, float addrspace(1)* %out + store volatile float %fneg.a, float addrspace(1)* %out + ret void +} + +; GCN-LABEL: {{^}}v_fneg_add_multi_use_fneg_x_f32: +; GCN: {{buffer|flat}}_load_dword [[A:v[0-9]+]] +; GCN: {{buffer|flat}}_load_dword [[B:v[0-9]+]] +; GCN-DAG: v_subrev_f32_e32 [[NEG_ADD:v[0-9]+]], [[B]], [[A]] +; GCN-DAG: v_mul_f32_e64 [[MUL:v[0-9]+]], -[[A]], s{{[0-9]+}} +; GCN-NEXT: buffer_store_dword [[NEG_ADD]] +; GCN-NEXT: buffer_store_dword [[MUL]] +define void @v_fneg_add_multi_use_fneg_x_f32(float addrspace(1)* %out, float addrspace(1)* %a.ptr, float addrspace(1)* %b.ptr, float %c) #0 { + %tid = call i32 @llvm.amdgcn.workitem.id.x() + %tid.ext = sext i32 %tid to i64 + %a.gep = getelementptr inbounds float, float addrspace(1)* %a.ptr, i64 %tid.ext + %b.gep = getelementptr inbounds float, float addrspace(1)* %b.ptr, i64 %tid.ext + %out.gep = getelementptr inbounds float, float addrspace(1)* %out, i64 %tid.ext + %a = load volatile float, float addrspace(1)* %a.gep + %b = load volatile float, float addrspace(1)* %b.gep + %fneg.a = fsub float -0.000000e+00, %a + %add = fadd float %fneg.a, %b + %fneg = fsub float -0.000000e+00, %add + %use1 = fmul float %fneg.a, %c + store volatile float %fneg, float addrspace(1)* %out + store volatile float %use1, float addrspace(1)* %out + ret void +} + +declare i32 @llvm.amdgcn.workitem.id.x() #1 + +attributes #0 = { nounwind } +attributes #1 = { nounwind readnone }