Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -527,6 +527,31 @@
     return false;
   }
 
+  /// This pair is an opcode and value type.
+  typedef std::pair<unsigned, MVT> AssocType;
+
+  /// This map type may be used by a target to count reassociations.
+  typedef std::map<AssocType, unsigned> AssocMap;
+
+  /// Return true if a reassociation optimization that exposes more
+  /// instruction-level-parallelism should be attempted for the specified
+  /// opcode and type. Targets may override this function based on the
+  /// amount of software-visible parallelism that is possible balanced against
+  /// the number of registers that are needed to support that number of
+  /// independent instructions.
+  /// \param Type The opcode and MVT of the instruction to reassociate.
+  /// \param Map A map for keeping track of reassociated operations.
+  virtual bool shouldReassociate(AssocType Type, AssocMap &Map) const {
+    return false;
+  }
+
+  /// This hook is called if a reassociation of the specified type was
+  /// completed. Targets may override this function to keep track of
+  /// the number of times a reassociation of this type has occurred.
+  /// \param Type The opcode and MVT of the instruction that was reassociated.
+  /// \param Map A map for keeping track of reassociated operations.
+  virtual void didReassociate(AssocType Type, AssocMap &Map) const {}
+
   /// Return how this operation should be treated: either it is legal, needs to
   /// be promoted to a larger size, needs to be expanded to some other code
   /// sequence, or the target has a custom expander for it.
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -406,6 +406,11 @@
     /// requirements are checked by the function (e.g. that trunc is
     /// single-use) and if missed an empty SDValue is returned.
     SDValue distributeTruncateThroughAnd(SDNode *N);
+
+    /// Given a dependent sequence of associative math/logic operations,
+    /// reassociate the operands to increase the instruction-level-parallelism.
+    SDValue reassociateForILP(SDNode *);
+    TargetLoweringBase::AssocMap ReassociateMap;
 
   public:
     DAGCombiner(SelectionDAG &D, AliasAnalysis &A, CodeGenOpt::Level OL)
@@ -7786,6 +7791,58 @@
   return SDValue();
 }
 
+SDValue DAGCombiner::reassociateForILP(SDNode *N) {
+  assert(N->getNumOperands() == 2 && "Invalid node for binop reassociation");
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+  unsigned Op = N->getOpcode();
+
+  // Only try this after type legalization. Reassociating earlier could have
+  // unknown consequences on register usage. Also, a new combiner is created
+  // post-type-legalization, so the map that is tracking this transform does
+  // not live long enough to support earlier passes.
+  if (!LegalTypes)
+    return SDValue();
+
+  TargetLoweringBase::AssocType Type = std::make_pair(Op, VT.getSimpleVT());
+
+  // Tell the target how many times we have already completed a reassociation
+  // of this type and ask the target if it's ok to try another.
+  if (!TLI.shouldReassociate(Type, ReassociateMap))
+    return SDValue();
+
+  // Swap chains of this operation to LHS to avoid duplicating predicate logic
+  // for each commutation of the pattern.
+  if (SelectionDAG::isCommutativeBinOp(Op) && N1.getOpcode() == Op &&
+      N1.hasOneUse() && N0.getOpcode() != Op)
+    std::swap(N1, N0);
+
+  // Convert a chain of 3 dependent operations into 2 independent operations
+  // and 1 dependent operation.
+  if (N0.getOpcode() == Op && N0.hasOneUse() && N1.getOpcode() != Op) {
+    SDValue N00 = N0.getOperand(0);
+    SDValue N01 = N0.getOperand(1);
+
+    // Swap second set of operands if needed.
+    if (SelectionDAG::isCommutativeBinOp(Op) && N01.getOpcode() == Op &&
+        N01.hasOneUse() && N00.getOpcode() != Op)
+      std::swap(N01, N00);
+
+    // (op N0: (op N00: (op z, w), N01: y), N1: x) ->
+    // (op N00: (op z, w), (op N1: x, N01: y))
+    // TODO: The innermost op - (op z, w) - does not need to match the others.
+    if (N00.getOpcode() == Op) {
+      // Tell the target that a reassociation of this type was completed.
+      TLI.didReassociate(Type, ReassociateMap);
+      SDValue NewOp = DAG.getNode(Op, DL, VT, N1, N01);
+      return DAG.getNode(Op, DL, VT, N00, NewOp);
+    }
+  }
+  return SDValue();
+}
+
 SDValue DAGCombiner::visitFADD(SDNode *N) {
   SDValue N0 = N->getOperand(0);
   SDValue N1 = N->getOperand(1);
@@ -7920,6 +7977,10 @@
                            N0.getOperand(0), DAG.getConstantFP(4.0, DL, VT));
       }
     }
+
+    if (SDValue Reassociated = reassociateForILP(N))
+      return Reassociated;
+
   } // enable-unsafe-fp-math
 
   // FADD -> FMA combines:
Index: lib/Target/X86/X86ISelLowering.h
===================================================================
--- lib/Target/X86/X86ISelLowering.h
+++ lib/Target/X86/X86ISelLowering.h
@@ -1084,6 +1084,10 @@
     /// Reassociate floating point divisions into multiply by reciprocal.
     bool combineRepeatedFPDivisors(unsigned NumUsers) const override;
+
+    /// Increase instruction-level-parallelism for associative operations.
+    bool shouldReassociate(AssocType Type, AssocMap &Map) const override;
+    void didReassociate(AssocType Type, AssocMap &Map) const override;
   };
 
 namespace X86 {
Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -12929,6 +12929,42 @@
   return NumUsers > 1;
 }
 
+bool X86TargetLowering::shouldReassociate(AssocType Type, AssocMap &Map) const {
+  // These are the opcodes that we allow to be reassociated and the number
+  // of times each opcode may be reassociated. The limit is a conservative
+  // heuristic based on the number of architected general-purpose or FP/vector
+  // registers. We must avoid spilling for the optimization to have a benefit.
+  static const std::map<unsigned, unsigned> ReassociationLimits = {
+    { ISD::FADD, 1 }
+  };
+
+  // TODO: Add the rest of the associative opcodes.
+  // TODO: Programmatically base the limit on number of registers.
+  // TODO: Limits should be based on MVT or register class. For example,
+  // a *vector* integer add does not use the same registers as a *scalar*
+  // integer add.
+
+  // If the input opcode type is not a reassociation candidate, we're done.
+  const auto &LimitIter = ReassociationLimits.find(Type.first);
+  if (LimitIter == ReassociationLimits.end())
+    return false;
+
+  // If we hit the reassociation limit for this type of instruction, we're done.
+  unsigned ReassociationMax = LimitIter->second;
+  if (Map[Type] >= ReassociationMax)
+    return false;
+
+  return true;
+}
+
+void X86TargetLowering::didReassociate(AssocType Type, AssocMap &Map) const {
+  // TODO: The map should be updated based on register type rather than
+  // instruction type. For example, any floating point reassociation
+  // should affect all other floating point operations because they all
+  // require registers from the same register class.
+  Map[Type]++;
+}
+
 static bool isAllOnes(SDValue V) {
   ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
   return C && C->isAllOnesValue();
Index: test/CodeGen/X86/fp-fast.ll
===================================================================
--- test/CodeGen/X86/fp-fast.ll
+++ test/CodeGen/X86/fp-fast.ll
@@ -114,3 +114,70 @@
   ret float %t2
 }
 
+; Verify that the first two adds are independent; the destination registers
+; are used as source registers for the third add.
+
+define float @reassociate_adds1(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds1:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds2(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds2:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %x2, %t0
+  %t2 = fadd float %t1, %x3
+  ret float %t2
+}
+
+define float @reassociate_adds3(float %x0, float %x1, float %x2, float %x3) {
+; CHECK-LABEL: reassociate_adds3:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm3, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %x3, %t1
+  ret float %t2
+}
+
+; FIXME: We should be able to do more reassociations for this test case, but the limit is set
+; conservatively low to avoid spilling disasters.
+
+define float @reassociate_adds4(float %x0, float %x1, float %x2, float %x3, float %x4, float %x5, float %x6, float %x7) {
+; CHECK-LABEL: reassociate_adds4:
+; CHECK:       # BB#0:
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm2, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm3, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm4, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm5, %xmm0, %xmm0
+; CHECK-NEXT:    vaddss %xmm6, %xmm7, %xmm1
+; CHECK-NEXT:    vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NEXT:    retq
+  %t0 = fadd float %x0, %x1
+  %t1 = fadd float %t0, %x2
+  %t2 = fadd float %t1, %x3
+  %t3 = fadd float %t2, %x4
+  %t4 = fadd float %t3, %x5
+  %t5 = fadd float %t4, %x6
+  %t6 = fadd float %t5, %x7
+  ret float %t6
+}
+