Index: llvm/include/llvm/IR/Operator.h =================================================================== --- llvm/include/llvm/IR/Operator.h +++ llvm/include/llvm/IR/Operator.h @@ -239,6 +239,9 @@ void operator&=(const FastMathFlags &OtherFlags) { Flags &= OtherFlags.Flags; } + void operator|=(const FastMathFlags &OtherFlags) { + Flags |= OtherFlags.Flags; + } }; /// Utility class for floating point operations which can have Index: llvm/include/llvm/Transforms/Utils/LoopUtils.h =================================================================== --- llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -356,6 +356,7 @@ OptimizationRemarkEmitter *ORE = nullptr); /// Returns a Min/Max operation corresponding to MinMaxRecurrenceKind. +/// The Builder's fast-math-flags must be set to propagate the expected values. Value *createMinMaxOp(IRBuilderBase &Builder, RecurKind RK, Value *Left, Value *Right); Index: llvm/lib/Analysis/IVDescriptors.cpp =================================================================== --- llvm/lib/Analysis/IVDescriptors.cpp +++ llvm/lib/Analysis/IVDescriptors.cpp @@ -302,8 +302,18 @@ if (!ReduxDesc.isRecurrence()) return false; // FIXME: FMF is allowed on phi, but propagation is not handled correctly. - if (isa(ReduxDesc.getPatternInst()) && !IsAPhi) - FMF &= ReduxDesc.getPatternInst()->getFastMathFlags(); + if (isa(ReduxDesc.getPatternInst()) && !IsAPhi) { + FastMathFlags CurFMF = ReduxDesc.getPatternInst()->getFastMathFlags(); + if (auto *Sel = dyn_cast(ReduxDesc.getPatternInst())) { + // Accept FMF on either fcmp or select of a min/max idiom. + // TODO: This is a hack to work-around the fact that FMF may not be + // assigned/propagated correctly. If that problem is fixed or we + // standardize on fmin/fmax via intrinsics, this can be removed. + assert(isa(Sel->getCondition()) && "Expected fcmp min/max"); + CurFMF |= cast(Sel->getCondition())->getFastMathFlags(); + } + FMF &= CurFMF; + } // Update this reduction kind if we matched a new instruction. // TODO: Can we eliminate the need for a 2nd InstDesc by keeping 'Kind' // state accurate while processing the worklist? Index: llvm/lib/Transforms/Utils/LoopUtils.cpp =================================================================== --- llvm/lib/Transforms/Utils/LoopUtils.cpp +++ llvm/lib/Transforms/Utils/LoopUtils.cpp @@ -944,12 +944,6 @@ break; } - // We only match FP sequences that are 'fast', so we can unconditionally - // set it on any generated instructions. - IRBuilderBase::FastMathFlagGuard FMFG(Builder); - FastMathFlags FMF; - FMF.setFast(); - Builder.setFastMathFlags(FMF); Value *Cmp = Builder.CreateCmp(Pred, Left, Right, "rdx.minmax.cmp"); Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select"); return Select; Index: llvm/lib/Transforms/Vectorize/LoopVectorize.cpp =================================================================== --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -4301,16 +4301,19 @@ // terminate on this line. This is the easiest way to ensure we don't // accidentally cause an extra step back into the loop while debugging. setDebugLocFromInst(Builder, LoopMiddleBlock->getTerminator()); - for (unsigned Part = 1; Part < UF; ++Part) { - Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); - if (Op != Instruction::ICmp && Op != Instruction::FCmp) - // Floating point operations had to be 'fast' to enable the reduction. - ReducedPartRdx = addFastMathFlag( - Builder.CreateBinOp((Instruction::BinaryOps)Op, RdxPart, - ReducedPartRdx, "bin.rdx"), - RdxDesc.getFastMathFlags()); - else - ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + { + // Floating-point operations should have some FMF to enable the reduction. + IRBuilderBase::FastMathFlagGuard FMFG(Builder); + Builder.setFastMathFlags(RdxDesc.getFastMathFlags()); + for (unsigned Part = 1; Part < UF; ++Part) { + Value *RdxPart = VectorLoopValueMap.getVectorValue(LoopExitInst, Part); + if (Op != Instruction::ICmp && Op != Instruction::FCmp) { + ReducedPartRdx = Builder.CreateBinOp( + (Instruction::BinaryOps)Op, RdxPart, ReducedPartRdx, "bin.rdx"); + } else { + ReducedPartRdx = createMinMaxOp(Builder, RK, ReducedPartRdx, RdxPart); + } + } } // Create the reduction after the loop. Note that inloop reductions create the Index: llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll +++ llvm/test/Transforms/LoopVectorize/X86/reduction-fastmath.ll @@ -262,7 +262,8 @@ ret float %sum.lcssa } -; FIXME: Some fcmp are 'nnan ninf', some are 'fast', but the reduction is sequential? +; New instructions should have the same FMF as the original code. +; Note that the select inherits FMF from its fcmp condition. define float @PR35538(float* nocapture readonly %a, i32 %N) #0 { ; CHECK-LABEL: @PR35538( @@ -299,9 +300,9 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP8:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = call float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] +; CHECK-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -349,6 +350,8 @@ br i1 %exitcond, label %for.cond.cleanup, label %for.body } +; Same as above, but this time the select already has matching FMF with its condition. + define float @PR35538_more_FMF(float* nocapture readonly %a, i32 %N) #0 { ; CHECK-LABEL: @PR35538_more_FMF( ; CHECK-NEXT: entry: @@ -384,8 +387,8 @@ ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP12]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP10:!llvm.loop !.*]] ; CHECK: middle.block: -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast ogt <4 x float> [[TMP10]], [[TMP11]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp nnan ninf ogt <4 x float> [[TMP10]], [[TMP11]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select nnan ninf <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP10]], <4 x float> [[TMP11]] ; CHECK-NEXT: [[TMP13:%.*]] = call nnan ninf float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[RDX_MINMAX_SELECT]]) ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[WIDE_TRIP_COUNT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[SCALAR_PH]] Index: llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll =================================================================== --- llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll +++ llvm/test/Transforms/LoopVectorize/float-minmax-instruction-flag.ll @@ -69,11 +69,11 @@ ; CHECK-NEXT: br i1 [[TMP6]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], [[LOOP0:!llvm.loop !.*]] ; CHECK: middle.block: ; CHECK-NEXT: [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP5]], [[RDX_SHUF]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_CMP:%.*]] = fcmp olt <4 x float> [[TMP5]], [[RDX_SHUF]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP5]], <4 x float> [[RDX_SHUF]] ; CHECK-NEXT: [[RDX_SHUF1:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> -; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] -; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_MINMAX_CMP2:%.*]] = fcmp olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF1]] +; CHECK-NEXT: [[RDX_MINMAX_SELECT3:%.*]] = select <4 x i1> [[RDX_MINMAX_CMP2]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF1]] ; CHECK-NEXT: [[TMP7:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT3]], i32 0 ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 65536, 65536 ; CHECK-NEXT: br i1 [[CMP_N]], label [[OUT:%.*]], label [[SCALAR_PH]]