Index: llvm/include/llvm/Analysis/IVDescriptors.h
===================================================================
--- llvm/include/llvm/Analysis/IVDescriptors.h
+++ llvm/include/llvm/Analysis/IVDescriptors.h
@@ -69,9 +69,11 @@
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurKind K,
                        FastMathFlags FMF, Instruction *ExactFP, Type *RT,
-                       bool Signed, SmallPtrSetImpl<Instruction *> &CI)
+                       bool Signed, bool Ordered,
+                       SmallPtrSetImpl<Instruction *> &CI)
       : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
-        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed) {
+        ExactFPMathInst(ExactFP), RecurrenceType(RT), IsSigned(Signed),
+        IsOrdered(Ordered) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -227,6 +229,9 @@
   /// Returns true if all source operands of the recurrence are SExtInsts.
   bool isSigned() const { return IsSigned; }
 
+  /// Expose an ordered FP reduction to the instance users.
+  bool isOrdered() const { return IsOrdered; }
+
   /// Attempts to find a chain of operations from Phi to LoopExitInst that can
   /// be treated as a set of reductions instructions for in-loop reductions.
   SmallVector<Instruction *, 4> getReductionOpChain(PHINode *Phi,
@@ -249,6 +254,8 @@
   Type *RecurrenceType = nullptr;
   // True if all source operands of the recurrence are SExtInsts.
   bool IsSigned = false;
+  // If this is an ordered reduction.
+  bool IsOrdered = false;
   // Instructions used for type-promoting the recurrence.
   SmallPtrSet<Instruction *, 8> CastInsts;
 };
Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -389,6 +389,11 @@
 Value *createTargetReduction(IRBuilderBase &B, const TargetTransformInfo *TTI,
                              RecurrenceDescriptor &Desc, Value *Src);
 
+/// Create an ordered reduction intrinsic using the given recurrence
+/// descriptor \p Desc.
+Value *createOrderedReduction(IRBuilderBase &B, RecurrenceDescriptor &Desc,
+                              Value *Src, Value *Start, Value *Predicate);
+
 /// Get the intersection (logical and) of all of the potential IR flags
 /// of each scalar operation (VL) that will be converted into a vector (I).
 /// If OpValue is non-null, we only consider operations similar to OpValue
Index: llvm/lib/Analysis/IVDescriptors.cpp
===================================================================
--- llvm/lib/Analysis/IVDescriptors.cpp
+++ llvm/lib/Analysis/IVDescriptors.cpp
@@ -189,6 +189,48 @@
   }
 }
 
+// Check if a given Phi node can be recognized as an ordered reduction for
+// vectorizing floating point operations without unsafe math.
+static bool checkOrderedReduction(RecurKind Kind, Instruction *Exit,
+                                  PHINode *Phi) {
+  // Currently only FAdd is supported.
+  if (Kind != RecurKind::FAdd)
+    return false;
+
+  bool IsOrdered = Exit->getOpcode() == Instruction::FAdd &&
+                   !cast<Instruction>(Exit)->hasAllowReassoc();
+
+  // If this comes from a Phi node, look through it.
+  if (auto *EIP = dyn_cast<PHINode>(Exit)) {
+    if (EIP->getNumIncomingValues() != 2)
+      return false;
+
+    auto *ChainVal = EIP->getIncomingValue(0) == Phi ? EIP->getIncomingValue(1)
+                                                     : EIP->getIncomingValue(0);
+
+    if (!isa<Instruction>(ChainVal))
+      return false;
+
+    Exit = cast<Instruction>(ChainVal);
+    IsOrdered = Exit->getOpcode() == Instruction::FAdd &&
+                !cast<Instruction>(Exit)->hasAllowReassoc();
+  }
+
+  // The only pattern accepted is the one in which the reduction PHI
+  // is used as one of the operands of the exit instruction.
+  auto *LHS = Exit->getOperand(0);
+  auto *RHS = Exit->getOperand(1);
+  IsOrdered = IsOrdered && ((LHS == Phi) || (RHS == Phi));
+
+  if (!IsOrdered)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "LV: Found an ordered reduction: Phi: " << *Phi
+                    << ", ExitInst: " << *Exit << "\n");
+
+  return true;
+}
+
 bool RecurrenceDescriptor::AddReductionVar(PHINode *Phi, RecurKind Kind,
                                            Loop *TheLoop, FastMathFlags FuncFMF,
                                            RecurrenceDescriptor &RedDes,
@@ -416,6 +458,8 @@
   if (!FoundStartPHI || !FoundReduxOp || !ExitInstruction)
     return false;
 
+  const bool IsOrdered = checkOrderedReduction(Kind, ExitInstruction, Phi);
+
   if (Start != Phi) {
     // If the starting value is not the same as the phi node, we speculatively
     // looked through an 'and' instruction when evaluating a potential
@@ -470,7 +514,7 @@
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(RdxStart, ExitInstruction, Kind, FMF,
                           ReduxDesc.getExactFPMathInst(), RecurrenceType,
-                          IsSigned, CastInsts);
+                          IsSigned, IsOrdered, CastInsts);
   RedDes = RD;
 
   return true;
Index: llvm/lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -1070,6 +1070,22 @@
   return createSimpleTargetReduction(B, TTI, Src, Desc.getRecurrenceKind());
 }
 
+Value *llvm::createOrderedReduction(IRBuilderBase &B,
+                                    RecurrenceDescriptor &Desc, Value *Src,
+                                    Value *Start, Value *Predicate) {
+  auto Kind = Desc.getRecurrenceKind();
+  assert(Kind == RecurKind::FAdd && "Unexpected reduction kind");
+  assert(Src->getType()->isVectorTy() && "Expected a vector type");
+  assert(!Start->getType()->isVectorTy() && "Expected a scalar type");
+
+  // Predication is done by masking out the inactive elements of the vector
+  // source with a safe identity value.
+  auto *MaskedVec = B.CreateSelect(
+      Predicate, Src, Desc.getRecurrenceIdentity(Kind, Src->getType()));
+  return B.CreateFAddReduce(Start, MaskedVec);
+}
+
 void llvm::propagateIRFlags(Value *I, ArrayRef<Value *> VL, Value *OpValue) {
   auto *VecOp = dyn_cast<Instruction>(I);
   if (!VecOp)
Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -325,6 +325,11 @@
     cl::desc("Prefer in-loop vector reductions, "
              "overriding the targets preference."));
 
+cl::opt<bool> EnableStrictReductions(
+    "enable-strict-reducs", cl::init(false), cl::Hidden,
+    cl::desc("Enable the vectorisation of loops with in-order (strict) "
+             "FP reductions"));
+
 static cl::opt<bool> PreferPredicatedReductionSelect(
     "prefer-predicated-reduction-select", cl::init(false), cl::Hidden,
     cl::desc(
@@ -4228,6 +4233,10 @@
     LCSSAPhi.addIncoming(ExtractForPhiUsedOutsideLoop, LoopMiddleBlock);
   }
 }
 
+static bool useOrderedReductions(RecurrenceDescriptor &RdxDesc) {
+  return EnableStrictReductions && RdxDesc.isOrdered();
+}
+
 void InnerLoopVectorizer::fixReduction(PHINode *Phi, VPTransformState &State) {
   // Get it's reduction variable descriptor.
   assert(Legal->isReductionVariable(Phi) &&
@@ -4349,16 +4358,20 @@
   // accidentally cause an extra step back into the loop while debugging.
   setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator());
   {
-    // Floating-point operations should have some FMF to enable the reduction.
-    IRBuilderBase::FastMathFlagGuard FMFG(Builder);
-    Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
-    for (unsigned Part = 1; Part < UF; ++Part) {
-      Value *RdxPart = State.get(LoopExitInstDef, Part);
-      if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
-        ReducedPartRdx = Builder.CreateBinOp(
-            (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
-      } else {
-        ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+    if (Cost->isInLoopReduction(Phi) && useOrderedReductions(RdxDesc))
+      ReducedPartRdx = State.get(LoopExitInstDef, UF - 1);
+    else {
+      // Floating-point operations should have some FMF to enable the reduction.
+      IRBuilderBase::FastMathFlagGuard FMFG(Builder);
+      Builder.setFastMathFlags(RdxDesc.getFastMathFlags());
+      for (unsigned Part = 1; Part < UF; ++Part) {
+        Value *RdxPart = State.get(LoopExitInstDef, Part);
+        if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
+          ReducedPartRdx = Builder.CreateBinOp(
+              (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx");
+        } else {
+          ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart);
+        }
       }
     }
   }
@@ -6045,7 +6058,7 @@
     if (!Legal->isReductionVariable(PN))
       continue;
     RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
-    if (PreferInLoopReductions ||
+    if (PreferInLoopReductions || useOrderedReductions(RdxDesc) ||
         TTI.preferInLoopReduction(RdxDesc.getOpcode(),
                                   RdxDesc.getRecurrenceType(),
                                   TargetTransformInfo::ReductionFlags()))
@@ -7614,7 +7627,7 @@
     // If the target would prefer this reduction to happen "in-loop", then we
     // want to record it as such.
     unsigned Opcode = RdxDesc.getOpcode();
-    if (!PreferInLoopReductions &&
+    if (!PreferInLoopReductions && !useOrderedReductions(RdxDesc) &&
         !TTI.preferInLoopReduction(Opcode, Phi->getType(),
                                    TargetTransformInfo::ReductionFlags()))
       continue;
@@ -9101,8 +9114,10 @@
 void VPReductionRecipe::execute(VPTransformState &State) {
   assert(!State.Instance && "Reduction being replicated.");
+  Value *PrevInChain = State.get(getChainOp(), 0);
   for (unsigned Part = 0; Part < State.UF; ++Part) {
     RecurKind Kind = RdxDesc->getRecurrenceKind();
+    bool IsOrdered = useOrderedReductions(*RdxDesc);
     Value *NewVecOp = State.get(getVecOp(), Part);
     if (VPValue *Cond = getCondOp()) {
       Value *NewCond = State.get(Cond, Part);
@@ -9114,18 +9129,31 @@
       Value *Select = State.Builder.CreateSelect(NewCond, NewVecOp, IdenVec);
       NewVecOp = Select;
     }
-    Value *NewRed =
-        createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
-    Value *PrevInChain = State.get(getChainOp(), Part);
+    Value *NewRed;
     Value *NextInChain;
+    if (IsOrdered) {
+      Value *C = ConstantInt::get(
+          IntegerType::getInt1Ty(State.Builder.getContext()), 1);
+      Value *MaskOfOnes =
+          State.Builder.CreateVectorSplat(State.VF, C, "broadcast");
+      NewRed = createOrderedReduction(State.Builder, *RdxDesc, NewVecOp,
+                                      PrevInChain, MaskOfOnes);
+      PrevInChain = NewRed;
+    } else {
+      PrevInChain = State.get(getChainOp(), Part);
+      NewRed = createTargetReduction(State.Builder, TTI, *RdxDesc, NewVecOp);
+    }
     if (RecurrenceDescriptor::isMinMaxRecurrenceKind(Kind)) {
       NextInChain = createMinMaxOp(State.Builder, RdxDesc->getRecurrenceKind(),
                                    NewRed, PrevInChain);
     } else {
-      NextInChain = State.Builder.CreateBinOp(
-          (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
-          PrevInChain);
+      if (IsOrdered)
+        NextInChain = NewRed;
+      else
+        NextInChain = State.Builder.CreateBinOp(
+            (Instruction::BinaryOps)getUnderlyingInstr()->getOpcode(), NewRed,
+            PrevInChain);
     }
     State.set(this, NextInChain, Part);
   }
Index: llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/LoopVectorize/AArch64/strict-fadd.ll
@@ -0,0 +1,163 @@
+; RUN: opt < %s -loop-vectorize -mtriple aarch64-unknown-linux-gnu -mattr=+sve -enable-strict-reducs -S | FileCheck %s -check-prefix=CHECK
+
+define float @fadd_strict(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[MASKED_VEC:.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> %[[LOAD]], <8 x float> zeroinitializer
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI]], <8 x float> %[[MASKED_VEC]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+for.end:
+  ret float %add
+}
+
+define float @fadd_strict_unroll(float* noalias nocapture readonly %a, i64 %n) {
+; CHECK-LABEL: @fadd_strict_unroll
+; CHECK: vector.body:
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[LOAD2:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[MASKED_VEC1:.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> %[[LOAD1]], <8 x float> zeroinitializer
+; CHECK: %[[RDX1:.*]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[VEC_PHI1]], <8 x float> %[[MASKED_VEC1]])
+; CHECK: %[[MASKED_VEC2:.*]] = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x float> %[[LOAD2]], <8 x float> zeroinitializer
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v8f32(float %[[RDX1]], <8 x float> %[[MASKED_VEC2]])
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX2]], %middle.block ]
+; CHECK: ret float %[[PHI]]
+entry:
+  br label %for.body
+
+for.body:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %sum.07 = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+  %arrayidx = getelementptr inbounds float, float* %a, i64 %iv
+  %0 = load float, float* %arrayidx, align 4
+  %add = fadd float %0, %sum.07
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !1
+
+for.end:
+  ret float %add
+}
+
+define void @fadd_strict_interleave(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_strict_interleave
+; CHECK: entry
+; CHECK: %[[ARRAYIDX:.*]] = getelementptr inbounds float, float* %a, i64 1
+; CHECK: %[[LOAD1:.*]] = load float, float* %a
+; CHECK: %[[LOAD2:.*]] = load float, float* %[[ARRAYIDX]]
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ %[[LOAD2]], %vector.ph ], [ %[[RDX2:.*]], %vector.body ]
+; CHECK: %[[VEC_PHI2:.*]] = phi float [ %[[LOAD1]], %vector.ph ], [ %[[RDX1:.*]], %vector.body ]
+; CHECK: %[[WIDE_LOAD:.*]] = load <8 x float>, <8 x float>*
+; CHECK: %[[STRIDED1:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK: %[[STRIDED2:.*]] = shufflevector <8 x float> %[[WIDE_LOAD]], <8 x float> poison, <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK: %[[MASKED_VEC1:.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %[[STRIDED1]], <4 x float> zeroinitializer
+; CHECK: %[[RDX1]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI2]], <4 x float> %[[MASKED_VEC1]])
+; CHECK: %[[MASKED_VEC2:.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %[[STRIDED2]], <4 x float> zeroinitializer
+; CHECK: %[[RDX2]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[MASKED_VEC2]])
+; CHECK: for.end
+; CHECK: ret void
+entry:
+  %arrayidxa = getelementptr inbounds float, float* %a, i64 1
+  %a1 = load float, float* %a, align 4
+  %a2 = load float, float* %arrayidxa, align 4
+  br label %for.body
+
+for.body:
+  %add.phi1 = phi float [ %a2, %entry ], [ %add2, %for.body ]
+  %add.phi2 = phi float [ %a1, %entry ], [ %add1, %for.body ]
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %arrayidxb1 = getelementptr inbounds float, float* %b, i64 %iv
+  %0 = load float, float* %arrayidxb1, align 4
+  %add1 = fadd float %0, %add.phi2
+  %or = or i64 %iv, 1
+  %arrayidxb2 = getelementptr inbounds float, float* %b, i64 %or
+  %1 = load float, float* %arrayidxb2, align 4
+  %add2 = fadd float %1, %add.phi1
+  %iv.next = add nuw nsw i64 %iv, 2
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:
+  store float %add1, float* %a, align 4
+  store float %add2, float* %arrayidxa, align 4
+  ret void
+}
+
+; Check that we can vectorise conditional ordered reductions. Note that the
+; following loop contains a mixture of normal and reducing adds, which checks
+; that only the reducing add is matched as an ordered instruction.
+;
+; float func(float* restrict a, float* restrict b, int count) {
+;   float res = 0.0f;
+;   for (int i = 0; i < count; ++i)
+;     if (a[1] > 0.5f)
+;       res += a[i] + b[i];
+;   return res;
+; }
+
+define float @fadd_conditional_rdx(float* noalias nocapture readonly %a, float* noalias nocapture readonly %b, i64 %n) {
+; CHECK-LABEL: @fadd_conditional_rdx
+; CHECK: vector.body
+; CHECK: %[[VEC_PHI1:.*]] = phi float [ 0.000000e+00, %vector.ph ], [ %[[RDX:.*]], %vector.body ]
+; CHECK: %[[LOAD1:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[LOAD2:.*]] = load <4 x float>, <4 x float>*
+; CHECK: %[[ADD:.*]] = fadd <4 x float> %[[LOAD1]], %[[LOAD2]]
+; CHECK: %[[MASKED_VEC:.*]] = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %[[ADD]], <4 x float> zeroinitializer
+; CHECK: %[[RDX]] = call float @llvm.vector.reduce.fadd.v4f32(float %[[VEC_PHI1]], <4 x float> %[[MASKED_VEC]])
+; CHECK: for.end.loopexit
+; CHECK: %[[EXIT_PHI:.*]] = phi float [ %[[SCALAR:.*]], %for.body ], [ %[[RDX]], %middle.block ]
+; CHECK: for.end
+; CHECK: %[[PHI:.*]] = phi float [ 0.000000e+00, %entry ], [ %[[EXIT_PHI]], %for.end.loopexit ]
+; CHECK: ret float %[[PHI]]
+entry:
+  %arrayidx = getelementptr inbounds float, float* %a, i64 1
+  %0 = load float, float* %arrayidx, align 4
+  %cmp1 = fcmp ogt float %0, 5.000000e-01
+  br i1 %cmp1, label %for.body, label %for.end
+
+for.body:                                         ; preds = %entry, %for.body
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ]
+  %res.014 = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  %arrayidx2 = getelementptr inbounds float, float* %a, i64 %iv
+  %1 = load float, float* %arrayidx2, align 4
+  %arrayidx4 = getelementptr inbounds float, float* %b, i64 %iv
+  %2 = load float, float* %arrayidx4, align 4
+  %add = fadd float %1, %2
+  %rdx = fadd float %res.014, %add
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, %n
+  br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !2
+
+for.end:                                          ; preds = %for.body, %entry
+  %res = phi float [ 0.000000e+00, %entry ], [ %rdx, %for.body ]
+  ret float %res
+}
+
+!0 = distinct !{!0, !3, !5, !7}
+!1 = distinct !{!1, !3, !6, !7}
+!2 = distinct !{!2, !4, !5, !7}
+!3 = !{!"llvm.loop.vectorize.width", i32 8}
+!4 = !{!"llvm.loop.vectorize.width", i32 4}
+!5 = !{!"llvm.loop.interleave.count", i32 1}
+!6 = !{!"llvm.loop.interleave.count", i32 2}
+!7 = !{!"llvm.loop.vectorize.enable", i1 true}
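
For context, here is a minimal hand-written sketch of the pattern checkOrderedReduction accepts and of the shape of vector body the new code path is expected to produce. It is not taken from the patch or its tests; the function and value names are illustrative only. The key point is that the accumulator stays scalar and each vector iteration is folded with an llvm.vector.reduce.fadd call that carries no 'reassoc' flag, so the same left-to-right association as the scalar loop is used.

; A strict FAdd recurrence (no fast-math flags). The reduction PHI %sum is an
; operand of the fadd that produces the loop-exit value, which is the only
; pattern checkOrderedReduction accepts.
define float @sum(float* %a, i64 %n) {
entry:
  br label %loop

loop:
  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.next, %loop ]
  %gep = getelementptr inbounds float, float* %a, i64 %iv
  %x = load float, float* %gep, align 4
  %sum.next = fadd float %sum, %x
  %iv.next = add nuw nsw i64 %iv, 1
  %done = icmp eq i64 %iv.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret float %sum.next
}

; Expected shape of the vector body for this loop (VF=4, UF=1): the accumulator
; remains a scalar PHI and each iteration performs an in-order fold.
;   %acc       = phi float [ 0.000000e+00, %vector.ph ], [ %acc.next, %vector.body ]
;   %wide.load = load <4 x float>, <4 x float>* %cast, align 4
;   %acc.next  = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %wide.load)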
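
The predicated form follows the same idea. A sketch of what createOrderedReduction emits for an assumed mask %mask (again illustrative, mirroring the zeroinitializer selects visible in the CHECK lines above): inactive lanes are replaced with the value returned by getRecurrenceIdentity for FAdd before the in-order fold, so only the active lanes contribute loaded data to the running sum.

;   %masked   = select <4 x i1> %mask, <4 x float> %wide.load, <4 x float> zeroinitializer
;   %acc.next = call float @llvm.vector.reduce.fadd.v4f32(float %acc, <4 x float> %masked)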