diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6455,6 +6455,10 @@
     case RecurKind::FMul:
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
+    case RecurKind::FMax:
+      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+    case RecurKind::FMin:
+      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
     case RecurKind::SMax: {
       Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
@@ -6568,6 +6572,15 @@
     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
       return true;
+    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+      // FP min/max are associative except for NaN and -0.0. We do not
+      // have to rule out -0.0 here because the intrinsic semantics do not
+      // specify a fixed result for it.
+      // TODO: This is artificially restricted to fast because the code that
+      // creates reductions assumes/produces fast ops.
+      return I->getFastMathFlags().isFast();
+    }
+
     return I->isAssociative();
   }
@@ -6677,6 +6690,11 @@
     if (match(I, m_FMul(m_Value(), m_Value())))
       return OperationData(RecurKind::FMul);
+    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMax);
+    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMin);
+
     if (match(I, m_SMax(m_Value(), m_Value())))
       return OperationData(RecurKind::SMax);
     if (match(I, m_SMin(m_Value(), m_Value())))
@@ -7076,6 +7094,18 @@
     ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
     break;
   }
+  case RecurKind::FMax:
+  case RecurKind::FMin: {
+    auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VectorTy));
+    VectorCost =
+        TTI->getMinMaxReductionCost(VectorTy, VecCondTy,
+                                    /*pairwise=*/false, /*unsigned=*/false);
+    ScalarCost =
+        TTI->getCmpSelInstrCost(Instruction::FCmp, ScalarTy) +
+        TTI->getCmpSelInstrCost(Instruction::Select, ScalarTy,
+                                CmpInst::makeCmpResultType(ScalarTy));
+    break;
+  }
   case RecurKind::SMax:
   case RecurKind::SMin:
   case RecurKind::UMax:
@@ -7307,6 +7337,16 @@
   return nullptr;
 }
 
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  return false;
+}
+
 /// Attempt to reduce a horizontal reduction.
 /// If it is legal to match a horizontal reduction feeding the phi node \a P
 /// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7347,7 +7387,7 @@
     unsigned Level;
     std::tie(Inst, Level) = Stack.pop_back_val();
     Value *B0, *B1;
-    bool IsBinop = match(Inst, m_BinOp(m_Value(B0), m_Value(B1)));
+    bool IsBinop = matchRdxBop(Inst, B0, B1);
    bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
    if (IsBinop || IsSelect) {
      HorizontalReduction HorRdx;
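For context only (not part of the patch): a minimal sketch of the transform that the new intrinsic matching and cost-model entries enable, assuming a fully fast fmax reduction over four adjacent loads. The function and value names below are invented for illustration; the regenerated tests later in this diff are the authoritative before/after CHECK lines.

; Before SLP: a scalar chain of fast llvm.maxnum calls over adjacent loads.
define float @fmax_chain_example(float* %p) {
  %g1 = getelementptr inbounds float, float* %p, i64 1
  %g2 = getelementptr inbounds float, float* %p, i64 2
  %g3 = getelementptr inbounds float, float* %p, i64 3
  %t0 = load float, float* %p, align 4
  %t1 = load float, float* %g1, align 4
  %t2 = load float, float* %g2, align 4
  %t3 = load float, float* %g3, align 4
  %m1 = call fast float @llvm.maxnum.f32(float %t1, float %t0)
  %m2 = call fast float @llvm.maxnum.f32(float %t2, float %m1)
  %m3 = call fast float @llvm.maxnum.f32(float %t3, float %m2)
  ret float %m3
}
declare float @llvm.maxnum.f32(float, float)

; After SLP (roughly): one vector load feeding a reduce intrinsic.
;   %v  = bitcast float* %p to <4 x float>*
;   %ld = load <4 x float>, <4 x float>* %v, align 4
;   %r  = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> %ld)
;   ret float %r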
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
--- a/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -330,17 +330,16 @@
 define float @fmin_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmin_v4i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]])
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]])
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP4]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
 ;
 entry:
   br label %for.cond
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -343,14 +343,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@
   ret float %m3
 }
 
+; TODO: This should become a reduce intrinsic.
+
 define float @reduction_v4f32_nnan(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_nnan(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@
   ret float %m3
 }
 
+; Negative test - must have nnan.
+
 define float @reduction_v4f32_not_fast(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_not_fast(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@
   ret double %m3
 }
 
+; Negative test - must have nnan.
+
 define double @reduction_v4f64_wrong_fmf(double* %p) {
 ; CHECK-LABEL: @reduction_v4f64_wrong_fmf(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
--- a/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -343,14 +343,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -365,6 +361,8 @@
   ret float %m3
 }
 
+; TODO: This should become a reduce intrinsic.
+
 define float @reduction_v4f32_nnan(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_nnan(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -392,6 +390,8 @@
   ret float %m3
 }
 
+; Negative test - must have nnan.
+
 define float @reduction_v4f32_wrong_fmf(float* %p) {
 ; CHECK-LABEL: @reduction_v4f32_wrong_fmf(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
@@ -428,22 +428,10 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.minnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.minnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.minnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.minnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -490,14 +478,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2
@@ -512,6 +496,8 @@
   ret double %m3
 }
 
+; Negative test - must have nnan.
+
 define double @reduction_v4f64_not_fast(double* %p) {
 ; CHECK-LABEL: @reduction_v4f64_not_fast(
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1