Index: include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfo.h
+++ include/llvm/Analysis/TargetTransformInfo.h
@@ -326,6 +331,11 @@
   /// target.
   bool shouldBuildLookupTables() const;
 
+  /// \brief Return true if the target always benefits from combining into FMA
+  /// for a given value type. This must typically return false on targets where
+  /// FMA takes more cycles to execute than FADD.
+  bool enableAggressiveFMAFusion(Type *Ty) const;
+
   /// \brief Return hardware support for population count.
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) const;
 
@@ -347,6 +352,7 @@
                           Type *Ty) const;
   unsigned getIntImmCost(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
                          Type *Ty) const;
+
   /// @}
 
   /// \name Vector Target Information
@@ -525,6 +531,7 @@
   virtual unsigned getJumpBufAlignment() = 0;
   virtual unsigned getJumpBufSize() = 0;
   virtual bool shouldBuildLookupTables() = 0;
+  virtual bool enableAggressiveFMAFusion(Type *Ty) = 0;
   virtual PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) = 0;
   virtual bool haveFastSqrt(Type *Ty) = 0;
   virtual unsigned getFPOpCost(Type *Ty) = 0;
@@ -639,6 +646,9 @@
   bool shouldBuildLookupTables() override {
     return Impl.shouldBuildLookupTables();
   }
+  bool enableAggressiveFMAFusion(Type *Ty) override {
+    return Impl.enableAggressiveFMAFusion(Ty);
+  }
   PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) override {
     return Impl.getPopcntSupport(IntTyWidthInBit);
   }
Index: include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- include/llvm/Analysis/TargetTransformInfoImpl.h
+++ include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -233,6 +233,8 @@
   bool shouldBuildLookupTables() { return true; }
 
+  bool enableAggressiveFMAFusion(Type *Ty) { return false; }
+
   TTI::PopcntSupportKind getPopcntSupport(unsigned IntTyWidthInBit) {
     return TTI::PSK_Software;
   }
Index: include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- include/llvm/CodeGen/BasicTTIImpl.h
+++ include/llvm/CodeGen/BasicTTIImpl.h
@@ -182,6 +182,10 @@
            TLI->isOperationLegalOrCustom(ISD::BRIND, MVT::Other);
   }
 
+  bool enableAggressiveFMAFusion(Type *Ty) {
+    return getTLI()->enableAggressiveFMAFusion(Ty);
+  }
+
   bool haveFastSqrt(Type *Ty) {
     const TargetLoweringBase *TLI = getTLI();
     EVT VT = TLI->getValueType(Ty);
Index: include/llvm/Target/TargetLowering.h
===================================================================
--- include/llvm/Target/TargetLowering.h
+++ include/llvm/Target/TargetLowering.h
@@ -298,7 +298,7 @@
   /// Return true if target always beneficiates from combining into FMA for a
   /// given value type. This must typically return false on targets where FMA
   /// takes more cycles to execute than FADD.
-  virtual bool enableAggressiveFMAFusion(EVT VT) const {
+  virtual bool enableAggressiveFMAFusion(Type *Ty) const {
     return false;
   }
Index: lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- lib/Analysis/TargetTransformInfo.cpp
+++ lib/Analysis/TargetTransformInfo.cpp
@@ -139,6 +139,10 @@
   return TTIImpl->shouldBuildLookupTables();
 }
 
+bool TargetTransformInfo::enableAggressiveFMAFusion(Type *Ty) const {
+  return TTIImpl->enableAggressiveFMAFusion(Ty);
+}
+
 TargetTransformInfo::PopcntSupportKind
 TargetTransformInfo::getPopcntSupport(unsigned IntTyWidthInBit) const {
   return TTIImpl->getPopcntSupport(IntTyWidthInBit);
Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -6944,6 +6944,7 @@
   ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
   ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
   EVT VT = N->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   const TargetOptions &Options = DAG.getTarget().Options;
 
   // fold vector ops
@@ -7084,14 +7085,14 @@
   // fold (fadd (fmul x, y), z) -> (fma x, y, z)
   if (N0.getOpcode() == ISD::FMUL &&
-      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                        N0.getOperand(0), N0.getOperand(1), N1);
 
   // fold (fadd x, (fmul y, z)) -> (fma y, z, x)
   // Note: Commutes FADD operands.
   if (N1.getOpcode() == ISD::FMUL &&
-      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, SDLoc(N), VT,
                        N1.getOperand(0), N1.getOperand(1), N0);
@@ -7124,7 +7125,7 @@
   }
 
   // More folding opportunities when target permits.
-  if (TLI.enableAggressiveFMAFusion(VT)) {
+  if (TLI.enableAggressiveFMAFusion(Ty)) {
 
     // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
     if (N0.getOpcode() == ISD::FMA &&
@@ -7157,6 +7158,7 @@
   ConstantFPSDNode *N0CFP = isConstOrConstSplatFP(N0);
   ConstantFPSDNode *N1CFP = isConstOrConstSplatFP(N1);
   EVT VT = N->getValueType(0);
+  Type *Ty = VT.getTypeForEVT(*DAG.getContext());
   SDLoc dl(N);
   const TargetOptions &Options = DAG.getTarget().Options;
 
@@ -7214,7 +7216,7 @@
   // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
   if (N0.getOpcode() == ISD::FMUL &&
-      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N0->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, dl, VT,
                        N0.getOperand(0), N0.getOperand(1),
                        DAG.getNode(ISD::FNEG, dl, VT, N1));
@@ -7222,7 +7224,7 @@
   // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
   // Note: Commutes FSUB operands.
   if (N1.getOpcode() == ISD::FMUL &&
-      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(VT)))
+      (N1->hasOneUse() || TLI.enableAggressiveFMAFusion(Ty)))
     return DAG.getNode(ISD::FMA, dl, VT,
                        DAG.getNode(ISD::FNEG, dl, VT,
                                    N1.getOperand(0)),
@@ -7232,7 +7234,7 @@
   if (N0.getOpcode() == ISD::FNEG &&
       N0.getOperand(0).getOpcode() == ISD::FMUL &&
       ((N0->hasOneUse() && N0.getOperand(0).hasOneUse()) ||
-       TLI.enableAggressiveFMAFusion(VT))) {
+       TLI.enableAggressiveFMAFusion(Ty))) {
     SDValue N00 = N0.getOperand(0).getOperand(0);
     SDValue N01 = N0.getOperand(0).getOperand(1);
     return DAG.getNode(ISD::FMA, dl, VT,
@@ -7310,7 +7312,7 @@
   }
 
   // More folding opportunities when target permits.
-  if (TLI.enableAggressiveFMAFusion(VT)) {
+  if (TLI.enableAggressiveFMAFusion(Ty)) {
 
     // fold (fsub (fma x, y, (fmul u, v)), z)
     //   -> (fma x, y (fma u, v, (fneg z)))
Index: lib/Target/NVPTX/NVPTXISelLowering.h
===================================================================
--- lib/Target/NVPTX/NVPTXISelLowering.h
+++ lib/Target/NVPTX/NVPTXISelLowering.h
@@ -508,7 +508,7 @@
   bool isFMAFasterThanFMulAndFAdd(EVT) const override { return true; }
 
-  bool enableAggressiveFMAFusion(EVT VT) const override { return true; }
+  bool enableAggressiveFMAFusion(Type *Ty) const override { return true; }
 
 private:
   const NVPTXSubtarget &STI; // cache the subtarget here
Index: lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.h
+++ lib/Target/PowerPC/PPCISelLowering.h
@@ -410,7 +410,7 @@
     /// Return true if target always beneficiates from combining into FMA for a
     /// given value type. This must typically return false on targets where FMA
     /// takes more cycles to execute than FADD.
-    bool enableAggressiveFMAFusion(EVT VT) const override;
+    bool enableAggressiveFMAFusion(Type *Ty) const override;
 
     /// getPreIndexedAddressParts - returns true by value, base pointer and
     /// offset pointer and addressing mode by reference if the node's address
Index: lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- lib/Target/PowerPC/PPCISelLowering.cpp
+++ lib/Target/PowerPC/PPCISelLowering.cpp
@@ -860,8 +860,10 @@
   return VT.changeVectorElementTypeToInteger();
 }
 
-bool PPCTargetLowering::enableAggressiveFMAFusion(EVT VT) const {
-  assert(VT.isFloatingPoint() && "Non-floating-point FMA?");
+bool PPCTargetLowering::enableAggressiveFMAFusion(Type *Ty) const {
+  assert((Ty->isFloatingPointTy() ||
+          (Ty->isVectorTy() && Ty->getVectorElementType()->isFloatingPointTy()))
+         && "Non-floating-point FMA?");
   return true;
 }
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -4537,6 +4537,86 @@
     return SmallUF;
   }
 
+  // Unroll if this is a large loop (small loops are already dealt with by this
+  // point) and there is a scalar reduction that could benefit from unrolling.
+  if (!UnrollingRequiresRuntimePointerCheck &&
+      Legal->getReductionVars()->size() &&
+      TheLoop->getLoopLatch()) {
+    // Typically, an inner reduction loop might have been fully unrolled at
+    // this point, which makes the outer loop large. We want to interleave the
+    // reductions in the outer loop to expose ILP opportunities. However, we
+    // need to be careful since the loop is large and we want to avoid spills.
+
+    // The heuristic used here is to divide the unroll factor by the average
+    // distance between reductions. Indeed, greater distance means it is likely
+    // that some ILP opportunities are already exposed in the loop.
+
+    if (UF > 1) {
+      // Find critical path for integer and floating-point instructions.
+      unsigned ICriticalPathLength = 0, FPCriticalPathLength = 0;
+      for (auto Redx : *Legal->getReductionVars()) {
+        unsigned PathLength = 0;
+        User *U = Redx.getFirst();
+        while (U != Redx.getSecond().LoopExitInstr) {
+          PathLength++;
+          auto I = U->users().begin();
+          U = *I++;
+          assert((I.atEnd()) &&
+                 "Expected exactly one use of reduction variable.");
+        }
+        Type *Ty = U->getType();
+        if (Ty->isIntegerTy() && ICriticalPathLength < PathLength)
+          ICriticalPathLength = PathLength;
+        else if (Ty->isFloatingPointTy() && FPCriticalPathLength < PathLength)
+          FPCriticalPathLength = PathLength;
+      }
+
+      // Retrieve number of integer and floating-point instructions.
+      unsigned ILoopLength = 0, FPLoopLength = 0;
+      for (BasicBlock::iterator I : *TheLoop->getLoopLatch()) {
+        if (I->isBinaryOp()) {
+          if (I->getType()->isIntegerTy())
+            ILoopLength++;
+          else if (I->getType()->isFloatingPointTy())
+            FPLoopLength++;
+        }
+      }
+
+      // Measure average distance between reductions in the loop. Distance for
+      // integer reductions is multiplied by 2 as latency is generally lower
+      // than for floating-point reductions.
+      unsigned IDistance = 0, FPDistance = 0;
+      if (ICriticalPathLength)
+        IDistance = (ILoopLength / ICriticalPathLength) * 2;
+      if (FPCriticalPathLength)
+        FPDistance = FPLoopLength / FPCriticalPathLength;
+
+      // If the target supports aggressive FMA fusion, it is likely that the
+      // distance will be lower than it is now as some nodes will be combined.
+      LLVMContext &Context = TheLoop->getHeader()->getContext();
+      Type *FloatTy = Type::getFloatTy(Context);
+      if (TTI.enableAggressiveFMAFusion(FloatTy) && FPDistance > 1)
+        FPDistance--;
+
+      // We are interested in the minimum distance between reductions, so that
+      // we can hide the latency accordingly.
+      unsigned MinDistance = 0;
+      if (IDistance && FPDistance)
+        MinDistance = std::min(IDistance, FPDistance);
+      else if (IDistance)
+        MinDistance = IDistance;
+      else
+        MinDistance = FPDistance;
+
+      // Guard against a zero distance (e.g., when the latch block contains
+      // fewer binary ops than the reduction's critical path), which would
+      // otherwise divide by zero below.
+      if (!MinDistance)
+        MinDistance = 1;
+
+      // Reduce unroll factor to a reasonable number.
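+      // For instance (illustrative numbers only): with UF = 8, no integer
+      // reductions, and a floating-point reduction whose critical path is 3
+      // instructions in a latch containing 9 floating-point binary ops,
+      // FPDistance = 9 / 3 = 3 (2 on targets with aggressive FMA fusion), so
+      // the unroll factor becomes PowerOf2Floor(8 / 2) = 4.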
+      UF = PowerOf2Floor(UF / MinDistance);
+      if (!UF) ++UF;
+    }
+
+    DEBUG(dbgs() << "LV: Unrolling because of reductions.\n");
+    return UF;
+  }
+
   DEBUG(dbgs() << "LV: Not Unrolling.\n");
   return 1;
 }
Index: test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
===================================================================
--- test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
+++ test/Transforms/LoopVectorize/PowerPC/large-loop-rdx.ll
@@ -0,0 +1,68 @@
+; RUN: opt < %s -loop-vectorize -S | FileCheck %s
+
+; CHECK: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NEXT: fadd
+; CHECK-NOT: fadd
+
+target datalayout = "e-m:e-i64:64-n32:64"
+target triple = "powerpc64le-ibm-linux-gnu"
+
+define void @QLA_F3_r_veq_norm2_V(float* noalias nocapture %r, [3 x { float, float }]* noalias nocapture readonly %a, i32 signext %n) #0 {
+entry:
+  %cmp24 = icmp sgt i32 %n, 0
+  br i1 %cmp24, label %for.cond1.preheader.lr.ph, label %for.end13
+
+for.cond1.preheader.lr.ph:                        ; preds = %entry
+  %0 = add i32 %n, -1
+  br label %for.cond1.preheader
+
+for.cond1.preheader:                              ; preds = %for.cond1.preheader.lr.ph, %for.body3
+  %indvars.iv = phi i64 [ 0, %for.cond1.preheader.lr.ph ], [ %indvars.iv.next, %for.body3 ]
+  %sum.026 = phi double [ 0.000000e+00, %for.cond1.preheader.lr.ph ], [ %add10.2, %for.body3 ]
+  br label %for.body3
+
+for.body3:                                        ; preds = %for.cond1.preheader
+  %arrayidx5.realp = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 0
+  %arrayidx5.real = load float* %arrayidx5.realp, align 8
+  %arrayidx5.imagp = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 0, i32 1
+  %arrayidx5.imag = load float* %arrayidx5.imagp, align 8
+  %mul = fmul fast float %arrayidx5.real, %arrayidx5.real
+  %mul9 = fmul fast float %arrayidx5.imag, %arrayidx5.imag
+  %add = fadd fast float %mul9, %mul
+  %conv = fpext float %add to double
+  %add10 = fadd fast double %conv, %sum.026
+  %arrayidx5.realp.1 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 0
+  %arrayidx5.real.1 = load float* %arrayidx5.realp.1, align 8
+  %arrayidx5.imagp.1 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 1, i32 1
+  %arrayidx5.imag.1 = load float* %arrayidx5.imagp.1, align 8
+  %mul.1 = fmul fast float %arrayidx5.real.1, %arrayidx5.real.1
+  %mul9.1 = fmul fast float %arrayidx5.imag.1, %arrayidx5.imag.1
+  %add.1 = fadd fast float %mul9.1, %mul.1
+  %conv.1 = fpext float %add.1 to double
+  %add10.1 = fadd fast double %conv.1, %add10
+  %arrayidx5.realp.2 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 0
+  %arrayidx5.real.2 = load float* %arrayidx5.realp.2, align 8
+  %arrayidx5.imagp.2 = getelementptr inbounds [3 x { float, float }]* %a, i64 %indvars.iv, i64 2, i32 1
+  %arrayidx5.imag.2 = load float* %arrayidx5.imagp.2, align 8
+  %mul.2 = fmul fast float %arrayidx5.real.2, %arrayidx5.real.2
+  %mul9.2 = fmul fast float %arrayidx5.imag.2, %arrayidx5.imag.2
+  %add.2 = fadd fast float %mul9.2, %mul.2
+  %conv.2 = fpext float %add.2 to double
+  %add10.2 = fadd fast double %conv.2, %add10.1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %lftr.wideiv = trunc i64 %indvars.iv to i32
+  %exitcond = icmp ne i32 %lftr.wideiv, %0
+  br i1 %exitcond, label %for.cond1.preheader, label %for.cond.for.end13_crit_edge
+
+for.cond.for.end13_crit_edge:                     ; preds = %for.body3
+  %add10.lcssa.lcssa = phi double [ %add10.2, %for.body3 ]
+  %phitmp = fptrunc double %add10.lcssa.lcssa to float
+  br label %for.end13
+
+for.end13:                                        ; preds = %for.cond.for.end13_crit_edge, %entry
+  %sum.0.lcssa = phi float [ %phitmp, %for.cond.for.end13_crit_edge ], [ 0.000000e+00, %entry ]
+  store float %sum.0.lcssa, float* %r, align 4
+  ret void
+}
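
For reference, a minimal sketch (not part of the patch) of how a hypothetical
out-of-tree target, called FooTargetLowering here purely for illustration,
could opt into the new Type*-based hook; it simply mirrors the NVPTX and
PowerPC overrides above:

  // FooISelLowering.h (hypothetical target, illustration only)
  bool enableAggressiveFMAFusion(Type *Ty) const override {
    // Fuse whenever the scalar or vector element type is floating point;
    // the FMA combines are only queried with floating-point types anyway.
    return Ty->getScalarType()->isFloatingPointTy();
  }

IR-level passes reach the same hook through TargetTransformInfo, e.g.
TTI.enableAggressiveFMAFusion(Type::getFloatTy(Ctx)), which is how the
LoopVectorize change above consults it.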