Index: include/llvm/Analysis/IVDescriptors.h
===================================================================
--- include/llvm/Analysis/IVDescriptors.h
+++ include/llvm/Analysis/IVDescriptors.h
@@ -89,10 +89,12 @@
   RecurrenceDescriptor() = default;
 
   RecurrenceDescriptor(Value *Start, Instruction *Exit, RecurrenceKind K,
-                       MinMaxRecurrenceKind MK, Instruction *UAI, Type *RT,
-                       bool Signed, SmallPtrSetImpl<Instruction *> &CI)
-      : StartValue(Start), LoopExitInstr(Exit), Kind(K), MinMaxKind(MK),
-        UnsafeAlgebraInst(UAI), RecurrenceType(RT), IsSigned(Signed) {
+                       FastMathFlags FMF, MinMaxRecurrenceKind MK,
+                       Instruction *UAI, Type *RT, bool Signed,
+                       SmallPtrSetImpl<Instruction *> &CI)
+      : StartValue(Start), LoopExitInstr(Exit), Kind(K), FMF(FMF),
+        MinMaxKind(MK), UnsafeAlgebraInst(UAI), RecurrenceType(RT),
+        IsSigned(Signed) {
     CastInsts.insert(CI.begin(), CI.end());
   }
 
@@ -198,6 +200,8 @@
 
   MinMaxRecurrenceKind getMinMaxRecurrenceKind() { return MinMaxKind; }
 
+  FastMathFlags getFastMathFlags() { return FMF; }
+
   TrackingVH<Value> getRecurrenceStartValue() { return StartValue; }
 
   Instruction *getLoopExitInstr() { return LoopExitInstr; }
@@ -237,6 +241,9 @@
   Instruction *LoopExitInstr = nullptr;
   // The kind of the recurrence.
   RecurrenceKind Kind = RK_NoRecurrence;
+  // The fast-math flags on the recurrent instructions. We propagate these
+  // fast-math flags into the vectorized FP instructions we generate.
+  FastMathFlags FMF;
   // If this a min/max recurrence the kind of recurrence.
   MinMaxRecurrenceKind MinMaxKind = MRK_Invalid;
   // First occurrence of unasfe algebra in the PHI's use-chain.
Index: include/llvm/IR/Operator.h
===================================================================
--- include/llvm/IR/Operator.h
+++ include/llvm/IR/Operator.h
@@ -187,6 +187,12 @@
 
   FastMathFlags() = default;
 
+  static FastMathFlags getFast() {
+    FastMathFlags FMF;
+    FMF.setFast();
+    return FMF;
+  }
+
   bool any() const { return Flags != 0; }
   bool none() const { return Flags == 0; }
   bool all() const { return Flags == ~0U; }
Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -296,6 +296,7 @@
 Value *getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                            RecurrenceDescriptor::MinMaxRecurrenceKind
                                MinMaxKind = RecurrenceDescriptor::MRK_Invalid,
+                           FastMathFlags FMF = FastMathFlags(),
                            ArrayRef<Value *> RedOps = None);
 
 /// Create a target reduction of the given vector. The reduction operation
@@ -308,6 +309,7 @@
                                    unsigned Opcode, Value *Src,
                                    TargetTransformInfo::ReductionFlags Flags =
                                        TargetTransformInfo::ReductionFlags(),
+                                   FastMathFlags FMF = FastMathFlags(),
                                    ArrayRef<Value *> RedOps = None);
 
 /// Create a generic target reduction using a recurrence descriptor \p Desc
Index: lib/Analysis/IVDescriptors.cpp
===================================================================
--- lib/Analysis/IVDescriptors.cpp
+++ lib/Analysis/IVDescriptors.cpp
@@ -251,6 +251,9 @@
   Worklist.push_back(Start);
   VisitedInsts.insert(Start);
 
+  FastMathFlags FMF;
+  FMF.setFast();
+
   // A value in the reduction can be used:
   //  - By the reduction:
   //      - Reduction operation:
@@ -296,6 +299,9 @@
       ReduxDesc = isRecurrenceInstr(Cur, Kind, ReduxDesc, HasFunNoNaNAttr);
       if (!ReduxDesc.isRecurrence())
         return false;
+      if (isa<FPMathOperator>(ReduxDesc.getPatternInst())) {
+        FMF &= ReduxDesc.getPatternInst()->getFastMathFlags();
+      }
     }
 
     bool IsASelect = isa<SelectInst>(Cur);
@@ -441,7 +447,7 @@
 
   // Save the description of this reduction variable.
   RecurrenceDescriptor RD(
-      RdxStart, ExitInstruction, Kind, ReduxDesc.getMinMaxKind(),
+      RdxStart, ExitInstruction, Kind, FMF, ReduxDesc.getMinMaxKind(),
       ReduxDesc.getUnsafeAlgebraInst(), RecurrenceType, IsSigned, CastInsts);
   RedDes = RD;
 
@@ -546,12 +552,16 @@
   return InstDesc(false, I);
 }
 
+static bool CanVectorizeReduction(Instruction *I) {
+  return !isa<FPMathOperator>(I) || I->hasAllowReassoc();
+}
+
 RecurrenceDescriptor::InstDesc
 RecurrenceDescriptor::isRecurrenceInstr(Instruction *I, RecurrenceKind Kind,
                                         InstDesc &Prev, bool HasFunNoNaNAttr) {
   bool FP = I->getType()->isFloatingPointTy();
   Instruction *UAI = Prev.getUnsafeAlgebraInst();
-  if (!UAI && FP && !I->isFast())
+  if (!UAI && FP && !CanVectorizeReduction(I))
     UAI = I; // Found an unsafe (unvectorizable) algebra instruction.
 
   switch (I->getOpcode()) {
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- lib/CodeGen/ExpandReductions.cpp
+++ lib/CodeGen/ExpandReductions.cpp
@@ -120,7 +120,8 @@
       continue;
     Value *Rdx =
         IsOrdered ? getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK)
-                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
+                  : getShuffleReduction(Builder, Vec, getOpcode(ID), MRK,
+                                        FastMathFlags::getFast());
     II->replaceAllUsesWith(Rdx);
     II->eraseFromParent();
     Changed = true;
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -671,12 +671,9 @@
   return true;
 }
 
-/// Adds a 'fast' flag to floating point operations.
-static Value *addFastMathFlag(Value *V) {
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
   if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setFast();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
+    cast<Instruction>(V)->setFastMathFlags(FMF);
   }
   return V;
 }
@@ -761,7 +758,7 @@
 Value *
 llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
                           RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
-                          ArrayRef<Value *> RedOps) {
+                          FastMathFlags FMF, ArrayRef<Value *> RedOps) {
   unsigned VF = Src->getType()->getVectorNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
@@ -786,7 +783,8 @@
     if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
       // Floating point operations had to be 'fast' to enable the reduction.
       TmpVec = addFastMathFlag(Builder.CreateBinOp((Instruction::BinaryOps)Op,
-                                                   TmpVec, Shuf, "bin.rdx"));
+                                                   TmpVec, Shuf, "bin.rdx"),
+                               FMF);
     } else {
       assert(MinMaxKind != RecurrenceDescriptor::MRK_Invalid &&
              "Invalid min/max");
@@ -803,7 +801,7 @@
 /// flags (if generating min/max reductions).
 Value *llvm::createSimpleTargetReduction(
     IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
-    Value *Src, TargetTransformInfo::ReductionFlags Flags,
+    Value *Src, TargetTransformInfo::ReductionFlags Flags, FastMathFlags FMF,
     ArrayRef<Value *> RedOps) {
   assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
 
@@ -873,7 +871,7 @@
   }
   if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
     return BuildFunc();
-  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
+  return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, FMF, RedOps);
 }
 
 /// Create a vector reduction using a given recurrence descriptor.
@@ -888,28 +886,37 @@
   Flags.NoNaN = NoNaN;
   switch (RecKind) {
   case RD::RK_FloatAdd:
-    return createSimpleTargetReduction(B, TTI, Instruction::FAdd, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::FAdd, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_FloatMult:
-    return createSimpleTargetReduction(B, TTI, Instruction::FMul, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::FMul, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerAdd:
-    return createSimpleTargetReduction(B, TTI, Instruction::Add, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::Add, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerMult:
-    return createSimpleTargetReduction(B, TTI, Instruction::Mul, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::Mul, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerAnd:
-    return createSimpleTargetReduction(B, TTI, Instruction::And, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::And, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerOr:
-    return createSimpleTargetReduction(B, TTI, Instruction::Or, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::Or, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerXor:
-    return createSimpleTargetReduction(B, TTI, Instruction::Xor, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::Xor, Src, Flags,
+                                       Desc.getFastMathFlags());
   case RD::RK_IntegerMinMax: {
     RD::MinMaxRecurrenceKind MMKind = Desc.getMinMaxRecurrenceKind();
     Flags.IsMaxOp = (MMKind == RD::MRK_SIntMax || MMKind == RD::MRK_UIntMax);
     Flags.IsSigned = (MMKind == RD::MRK_SIntMax || MMKind == RD::MRK_SIntMin);
-    return createSimpleTargetReduction(B, TTI, Instruction::ICmp, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::ICmp, Src, Flags,
+                                       Desc.getFastMathFlags());
   }
   case RD::RK_FloatMinMax: {
     Flags.IsMaxOp = Desc.getMinMaxRecurrenceKind() == RD::MRK_FloatMax;
-    return createSimpleTargetReduction(B, TTI, Instruction::FCmp, Src, Flags);
+    return createSimpleTargetReduction(B, TTI, Instruction::FCmp, Src, Flags,
+                                       Desc.getFastMathFlags());
   }
   default:
     llvm_unreachable("Unhandled RecKind");
Index: lib/Transforms/Vectorize/LoopVectorize.cpp
===================================================================
--- lib/Transforms/Vectorize/LoopVectorize.cpp
+++ lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -320,9 +320,14 @@
 /// A helper function that adds a 'fast' flag to floating-point operations.
 static Value *addFastMathFlag(Value *V) {
   if (isa<FPMathOperator>(V)) {
-    FastMathFlags Flags;
-    Flags.setFast();
-    cast<Instruction>(V)->setFastMathFlags(Flags);
+    cast<Instruction>(V)->setFastMathFlags(FastMathFlags::getFast());
+  }
+  return V;
+}
+
+static Value *addFastMathFlag(Value *V, FastMathFlags FMF) {
+  if (isa<FPMathOperator>(V)) {
+    cast<Instruction>(V)->setFastMathFlags(FMF);
   }
   return V;
 }
@@ -3612,7 +3617,8 @@
         // Floating point operations had to be 'fast' to enable the reduction.
         ReducedPartRdx = addFastMathFlag(
             Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart,
-                                ReducedPartRdx, "bin.rdx"));
+                                ReducedPartRdx, "bin.rdx"),
+            RdxDesc.getFastMathFlags());
       else
         ReducedPartRdx = createMinMaxOp(Builder, MinMaxKind, ReducedPartRdx,
                                         RdxPart);
Index: lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -5929,7 +5929,8 @@
     if (!IsPairwiseReduction)
       return createSimpleTargetReduction(
           Builder, TTI, ReductionData.getOpcode(), VectorizedValue,
-          ReductionData.getFlags(), ReductionOps.back());
+          ReductionData.getFlags(), FastMathFlags::getFast(),
+          ReductionOps.back());
 
     Value *TmpVec = VectorizedValue;
     for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
Index: test/Transforms/LoopVectorize/reduction-fastmath.ll
===================================================================
--- /dev/null
+++ test/Transforms/LoopVectorize/reduction-fastmath.ll
@@ -0,0 +1,108 @@
+; RUN: opt -S -loop-vectorize < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+define float @reduction_sum_float_ieee(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_ieee(
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK-NOT: %wide.load = load <4 x float>, <4 x float>*
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_fastmath(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_fastmath(
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd fast float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: %wide.load = load <4 x float>, <4 x float>*
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+; CHECK: fadd reassoc <4 x float>
+
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd reassoc float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}
+
+define float @reduction_sum_float_only_reassoc_and_contract(i32 %n, float* %array) {
+; CHECK-LABEL: define float @reduction_sum_float_only_reassoc_and_contract(
+; CHECK-NOT: fadd fast
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+; CHECK: fadd reassoc contract <4 x float>
+
+entry:
+  %entry.cond = icmp ne i32 0, 4096
+  br i1 %entry.cond, label %loop, label %loop.exit
+
+loop:
+  %idx = phi i32 [ 0, %entry ], [ %idx.inc, %loop ]
+  %sum = phi float [ 0.000000e+00, %entry ], [ %sum.inc, %loop ]
+  %address = getelementptr float, float* %array, i32 %idx
+  %value = load float, float* %address
+  %sum.inc = fadd reassoc contract float %sum, %value
+  %idx.inc = add i32 %idx, 1
+  %be.cond = icmp ne i32 %idx.inc, 4096
+  br i1 %be.cond, label %loop, label %loop.exit
+
+loop.exit:
+  %sum.lcssa = phi float [ %sum.inc, %loop ], [ 0.000000e+00, %entry ]
+; CHECK: ret float %sum.lcssa
+  ret float %sum.lcssa
+}