Index: llvm/include/llvm/Analysis/TargetTransformInfo.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfo.h
+++ llvm/include/llvm/Analysis/TargetTransformInfo.h
@@ -1194,6 +1194,18 @@
       unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
 
+  /// Calculate the cost of performing a strict (in-order) vector reduction.
+  ///
+  /// This is the cost of reducing the vector value of type \p Ty to a scalar
+  /// value using the operation denoted by \p Opcode. The reduction is
+  /// performed in lane order using an initial scalar value and a vector whose
+  /// elements have the same type as the scalar, i.e.
+  ///
+  ///   result = ((((init + v0) + v1) + v2) + v3)
+  InstructionCost getArithmeticStrictReductionCost(
+      unsigned Opcode, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
+
   InstructionCost getMinMaxReductionCost(
       VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput) const;
@@ -1658,6 +1670,9 @@
                              bool IsPairwiseForm,
                              TTI::TargetCostKind CostKind) = 0;
   virtual InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) = 0;
+  virtual InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                          bool IsPairwiseForm, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) = 0;
@@ -2163,6 +2178,11 @@
                                      CostKind);
   }
   InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) override {
+    return Impl.getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+  }
+  InstructionCost
   getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
                          bool IsPairwiseForm, bool IsUnsigned,
                          TTI::TargetCostKind CostKind) override {
Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
===================================================================
--- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
+++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h
@@ -626,6 +626,11 @@
     return 1;
   }
 
+  InstructionCost getArithmeticStrictReductionCost(unsigned, VectorType *,
+                                                   TTI::TargetCostKind) const {
+    return 1;
+  }
+
   InstructionCost getMinMaxReductionCost(VectorType *, VectorType *, bool,
                                          bool, TTI::TargetCostKind) const {
     return 1;
Index: llvm/include/llvm/CodeGen/BasicTTIImpl.h
===================================================================
--- llvm/include/llvm/CodeGen/BasicTTIImpl.h
+++ llvm/include/llvm/CodeGen/BasicTTIImpl.h
@@ -2065,6 +2065,31 @@
         thisT()->getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
   }
 
+  /// Try to calculate the cost of performing strict (in-order) reductions,
+  /// which involves doing a sequence of floating point additions in lane
+  /// order, starting with an initial value. For example, consider a scalar
+  /// initial value 'InitVal' of type float and a vector of type <4 x float>:
+  ///
+  ///   Vector = <float %v0, float %v1, float %v2, float %v3>
+  ///
+  ///   %add1 = %InitVal + %v0
+  ///   %add2 = %add1 + %v1
+  ///   %add3 = %add2 + %v2
+  ///   %add4 = %add3 + %v3
+  ///
+  /// As a simple estimate we can say the cost of such a reduction is 4 times
+  /// the cost of a scalar FP addition. We can only estimate the costs for
+  /// fixed-width vectors here because for scalable vectors we do not know the
+  /// runtime number of operations.
+  InstructionCost
+  getArithmeticStrictReductionCost(unsigned Opcode, VectorType *Ty,
+                                   TTI::TargetCostKind CostKind) {
+    auto *VTy = cast<FixedVectorType>(Ty);
+    InstructionCost Cost =
+        getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind);
+    return Cost * VTy->getNumElements();
+  }
+
   /// Try to calculate op costs for min/max reduction operations.
   /// \param CondTy Conditional type for the Select instruction.
   InstructionCost getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
Index: llvm/lib/Analysis/TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Analysis/TargetTransformInfo.cpp
+++ llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -902,6 +902,14 @@
   return Cost;
 }
 
+InstructionCost TargetTransformInfo::getArithmeticStrictReductionCost(
+    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) const {
+  InstructionCost Cost =
+      TTIImpl->getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+  assert(Cost >= 0 && "TTI should not produce negative costs!");
+  return Cost;
+}
+
 InstructionCost TargetTransformInfo::getMinMaxReductionCost(
     VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
     TTI::TargetCostKind CostKind) const {
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -131,6 +131,18 @@
     return BaseT::getMaxVScale();
   }
 
+  /// Try to return an estimated cost factor that can be used as a multiplier
+  /// when scalarizing an operation for a vector with ElementCount \p VF.
+  /// For scalable vectors this currently takes the most pessimistic view based
+  /// upon the maximum possible value for vscale.
+  unsigned getScalarizationCostFactor(ElementCount VF) const {
+    if (!VF.isScalable())
+      return VF.getKnownMinValue();
+    Optional<unsigned> MaxNumVScale = getMaxVScale();
+    assert(MaxNumVScale && "Expected valid max vscale value");
+    return *MaxNumVScale * VF.getKnownMinValue();
+  }
+
   unsigned getMaxInterleaveFactor(unsigned VF);
 
   InstructionCost getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
@@ -305,6 +317,10 @@
       unsigned Opcode, VectorType *Ty, bool IsPairwiseForm,
       TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
 
+  InstructionCost getArithmeticStrictReductionCost(
+      unsigned Opcode, VectorType *Ty,
+      TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+
   InstructionCost getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
                                  ArrayRef<int> Mask, int Index,
                                  VectorType *SubTp);
Index: llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1404,14 +1404,9 @@
     return InstructionCost::getInvalid();
 
   ElementCount LegalVF = LT.second.getVectorElementCount();
-  Optional<unsigned> MaxNumVScale = getMaxVScale();
-  assert(MaxNumVScale && "Expected valid max vscale value");
-
   InstructionCost MemOpCost =
       getMemoryOpCost(Opcode, VT->getElementType(), Alignment, 0, CostKind, I);
-  unsigned MaxNumElementsPerGather =
-      MaxNumVScale.getValue() * LegalVF.getKnownMinValue();
-  return LT.first * MaxNumElementsPerGather * MemOpCost;
+  return LT.first * MemOpCost * getScalarizationCostFactor(LegalVF);
 }
 
 bool AArch64TTIImpl::useNeonVector(const Type *Ty) const {
@@ -1819,6 +1814,18 @@
   }
 }
 
+InstructionCost AArch64TTIImpl::getArithmeticStrictReductionCost(
+    unsigned Opcode, VectorType *Ty, TTI::TargetCostKind CostKind) {
+  if (!isa<ScalableVectorType>(Ty))
+    return BaseT::getArithmeticStrictReductionCost(Opcode, Ty, CostKind);
+
+  auto *VTy = cast<ScalableVectorType>(Ty);
+  InstructionCost Cost =
+      getArithmeticInstrCost(Opcode, VTy->getScalarType(), CostKind);
+  Cost *= getScalarizationCostFactor(VTy->getElementCount());
+  return Cost;
+}
+
 InstructionCost
 AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
                                            bool IsPairwiseForm,
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -7168,6 +7168,11 @@
     const RecurrenceDescriptor &RdxDesc =
         Legal->getReductionVars()[cast<PHINode>(ReductionPhi)];
+
+    if (useOrderedReductions(RdxDesc))
+      return TTI.getArithmeticStrictReductionCost(RdxDesc.getOpcode(), VectorTy,
+                                                  CostKind);
+
     InstructionCost BaseCost = TTI.getArithmeticReductionCost(
         RdxDesc.getOpcode(), VectorTy, false, CostKind);
Index: llvm/test/Analysis/CostModel/AArch64/strict-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/strict-fadd.ll
@@ -0,0 +1,49 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=4 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -force-vector-width=8 -force-vector-interleave=1 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 8 for VF 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 16 for VF 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 8 for VF 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 16 for VF 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body
+
+for.end:
+  ret double %add
+}
Index: llvm/test/Analysis/CostModel/AArch64/sve-strict-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Analysis/CostModel/AArch64/sve-strict-fadd.ll
@@ -0,0 +1,55 @@
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=4 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF4
+; RUN: opt < %s -loop-vectorize -debug -disable-output -enable-strict-reductions=true -hints-allow-reordering=false \
+; RUN:   -scalable-vectorization=on -force-vector-width=8 -S 2>&1 | FileCheck %s --check-prefix=CHECK-VF8
+
+target triple="aarch64-unknown-linux-gnu"
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd float %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd float %0, %sum.07
+
+define float @fadd_strict32(float* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+
+; CHECK-VF4: Found an estimated cost of 128 for VF vscale x 4 For instruction: %add = fadd double %0, %sum.07
+; CHECK-VF8: Found an estimated cost of 256 for VF vscale x 8 For instruction: %add = fadd double %0, %sum.07
+
+define double @fadd_strict64(double* noalias nocapture readonly %a, i64 %n) #0 {
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi double [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds double, double* %a, i64 %iv
+  %0 = load double, double* %arrayidx, align 4
+  %add = fadd double %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret double %add
+}
+
+attributes #0 = { "target-features"="+sve" }
+
+!0 = distinct !{!0, !1, !2}
+!1 = !{!"llvm.loop.interleave.count", i32 1}
+!2 = !{!"llvm.loop.vectorize.scalable.enable", i1 true}
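
A note on the expected costs in the tests above: they follow directly from the model. The fixed-width numbers are the scalar fadd cost multiplied by the lane count, and the scalable numbers additionally multiply by the maximum possible vscale. The sketch below is a minimal standalone illustration of that arithmetic, not part of the patch itself: the helper name strictReductionCost is hypothetical, and it assumes a scalar fadd cost of 2 (BasicTTIImpl prices floating-point ops at twice the integer cost) and a maximum vscale of 16 (the 2048-bit SVE architectural limit divided by the 128-bit granule), which is what the expected values above imply for AArch64.

// Standalone illustration (hypothetical helper, not part of the patch).
#include <cassert>

// Mirrors the strict-reduction cost arithmetic: fixed-width vectors pay one
// in-order FP add per lane; scalable vectors pessimistically assume the
// largest possible vscale.
unsigned strictReductionCost(unsigned ScalarFAddCost, unsigned MinLanes,
                             bool Scalable, unsigned MaxVScale) {
  unsigned NumOps = Scalable ? MaxVScale * MinLanes : MinLanes;
  return ScalarFAddCost * NumOps;
}

int main() {
  assert(strictReductionCost(2, 4, /*Scalable=*/false, 16) == 8);   // VF 4
  assert(strictReductionCost(2, 8, /*Scalable=*/false, 16) == 16);  // VF 8
  assert(strictReductionCost(2, 4, /*Scalable=*/true, 16) == 128);  // VF vscale x 4
  assert(strictReductionCost(2, 8, /*Scalable=*/true, 16) == 256);  // VF vscale x 8
  return 0;
}

Pricing scalable reductions at the maximum possible vscale is deliberately pessimistic: until the cost model can reason about the runtime value of vscale, an in-order SVE reduction is costed as if every architecturally possible lane had to be added in sequence.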