Index: llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
===================================================================
--- llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -6455,6 +6455,10 @@
     case RecurKind::FMul:
       return Builder.CreateBinOp((Instruction::BinaryOps)RdxOpcode, LHS, RHS,
                                  Name);
+    case RecurKind::FMax:
+      return Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, LHS, RHS);
+    case RecurKind::FMin:
+      return Builder.CreateBinaryIntrinsic(Intrinsic::minnum, LHS, RHS);
     case RecurKind::SMax: {
       Value *Cmp = Builder.CreateICmpSGT(LHS, RHS, Name);
@@ -6568,6 +6572,15 @@
     if (RecurrenceDescriptor::isIntMinMaxRecurrenceKind(Kind))
       return true;
+    if (Kind == RecurKind::FMax || Kind == RecurKind::FMin) {
+      // FP min/max are associative except for NaN and -0.0. We do not
+      // have to rule out -0.0 here because the intrinsic semantics do not
+      // specify a fixed result for it.
+      // TODO: This is artificially restricted to fast because the code that
+      //       creates reductions assumes/produces fast ops.
+      return I->getFastMathFlags().isFast();
+    }
+
     return I->isAssociative();
   }
@@ -6677,6 +6690,11 @@
     if (match(I, m_FMul(m_Value(), m_Value())))
       return OperationData(RecurKind::FMul);
+    if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMax);
+    if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
+      return OperationData(RecurKind::FMin);
+
     if (match(I, m_SMax(m_Value(), m_Value())))
       return OperationData(RecurKind::SMax);
     if (match(I, m_SMin(m_Value(), m_Value())))
@@ -7076,6 +7094,18 @@
       ScalarCost = TTI->getArithmeticInstrCost(RdxOpcode, ScalarTy);
       break;
     }
+    case RecurKind::FMax:
+    case RecurKind::FMin: {
+      // FIXME: FMF should be captured/intersected from all reduction insts.
+      FastMathFlags FMF = cast<Instruction>(ReductionRoot)->getFastMathFlags();
+      Intrinsic::ID IID =
+          Kind == RecurKind::FMax ? Intrinsic::maxnum : Intrinsic::minnum;
+      IntrinsicCostAttributes VecAttr(IID, VecTy, {VecTy, VecTy}, FMF);
+      IntrinsicCostAttributes ScaAttr(IID, ScalarTy, {ScalarTy, ScalarTy}, FMF);
+      VectorCost = TTI->getIntrinsicInstrCost(VecAttr, TTI::TCK_RecipThroughput);
+      ScalarCost = TTI->getIntrinsicInstrCost(ScaAttr, TTI::TCK_RecipThroughput);
+      break;
+    }
     case RecurKind::SMax:
     case RecurKind::SMin:
     case RecurKind::UMax:
@@ -7307,6 +7337,16 @@
   return nullptr;
 }
+static bool matchRdxBop(Instruction *I, Value *&V0, Value *&V1) {
+  if (match(I, m_BinOp(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::maxnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(V0), m_Value(V1))))
+    return true;
+  return false;
+}
+
 /// Attempt to reduce a horizontal reduction.
 /// If it is legal to match a horizontal reduction feeding the phi node \a P
 /// with reduction operators \a Root (or one of its operands) in a basic block
@@ -7347,7 +7387,7 @@
     unsigned Level;
     std::tie(Inst, Level) = Stack.pop_back_val();
     Value *B0, *B1;
-    bool IsBinop = match(Inst, m_BinOp(m_Value(B0), m_Value(B1)));
+    bool IsBinop = matchRdxBop(Inst, B0, B1);
     bool IsSelect = match(Inst, m_Select(m_Value(), m_Value(), m_Value()));
     if (IsBinop || IsSelect) {
       HorizontalReduction HorRdx;
Index: llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/vector-reductions-expanded.ll
@@ -330,17 +330,16 @@
 define float @fmin_v4i32(float* %p) #0 {
 ; CHECK-LABEL: @fmin_v4i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load float, float* [[P:%.*]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[ARRAYIDX_1:%.*]] = getelementptr inbounds float, float* [[P]], i64 1
-; CHECK-NEXT:    [[TMP1:%.*]] = load float, float* [[ARRAYIDX_1]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP2:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP1]], float [[TMP0]])
-; CHECK-NEXT:    [[ARRAYIDX_2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
-; CHECK-NEXT:    [[TMP3:%.*]] = load float, float* [[ARRAYIDX_2]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP4:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP3]], float [[TMP2]])
-; CHECK-NEXT:    [[ARRAYIDX_3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[TMP5:%.*]] = load float, float* [[ARRAYIDX_3]], align 4, [[TBAA7]]
-; CHECK-NEXT:    [[TMP6:%.*]] = tail call fast float @llvm.minnum.f32(float [[TMP5]], float [[TMP4]])
-; CHECK-NEXT:    ret float [[TMP6]]
+; CHECK-NEXT:    [[TMP0:%.*]] = bitcast float* [[P:%.*]] to <4 x float>*
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4, [[TBAA7]]
+; CHECK-NEXT:    [[RDX_SHUF:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 2, i32 3, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP:%.*]] = fcmp fast olt <4 x float> [[TMP1]], [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP]], <4 x float> [[TMP1]], <4 x float> [[RDX_SHUF]]
+; CHECK-NEXT:    [[RDX_SHUF3:%.*]] = shufflevector <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> poison, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef>
+; CHECK-NEXT:    [[RDX_MINMAX_CMP4:%.*]] = fcmp fast olt <4 x float> [[RDX_MINMAX_SELECT]], [[RDX_SHUF3]]
+; CHECK-NEXT:    [[RDX_MINMAX_SELECT5:%.*]] = select fast <4 x i1> [[RDX_MINMAX_CMP4]], <4 x float> [[RDX_MINMAX_SELECT]], <4 x float> [[RDX_SHUF3]]
+; CHECK-NEXT:    [[TMP2:%.*]] = extractelement <4 x float> [[RDX_MINMAX_SELECT5]], i32 0
+; CHECK-NEXT:    ret float [[TMP2]]
 ;
 entry:
   br label %for.cond
Index: llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fmaxnum.ll
@@ -343,14 +343,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -401,22 +397,10 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.maxnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.maxnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmax.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -463,14 +447,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.maxnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmax.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2
Index: llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
===================================================================
--- llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
+++ llvm/test/Transforms/SLPVectorizer/X86/fminnum.ll
@@ -343,14 +343,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds float, float* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds float, float* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds float, float* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    ret float [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <4 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x float>, <4 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v4f32(<4 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -401,22 +397,10 @@
 ; CHECK-NEXT:    [[G5:%.*]] = getelementptr inbounds float, float* [[P]], i64 5
 ; CHECK-NEXT:    [[G6:%.*]] = getelementptr inbounds float, float* [[P]], i64 6
 ; CHECK-NEXT:    [[G7:%.*]] = getelementptr inbounds float, float* [[P]], i64 7
-; CHECK-NEXT:    [[T0:%.*]] = load float, float* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load float, float* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load float, float* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load float, float* [[G3]], align 4
-; CHECK-NEXT:    [[T4:%.*]] = load float, float* [[G4]], align 4
-; CHECK-NEXT:    [[T5:%.*]] = load float, float* [[G5]], align 4
-; CHECK-NEXT:    [[T6:%.*]] = load float, float* [[G6]], align 4
-; CHECK-NEXT:    [[T7:%.*]] = load float, float* [[G7]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast float @llvm.minnum.f32(float [[T1]], float [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast float @llvm.minnum.f32(float [[T2]], float [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast float @llvm.minnum.f32(float [[T3]], float [[M2]])
-; CHECK-NEXT:    [[M4:%.*]] = tail call fast float @llvm.minnum.f32(float [[T4]], float [[M3]])
-; CHECK-NEXT:    [[M5:%.*]] = tail call fast float @llvm.minnum.f32(float [[M4]], float [[T6]])
-; CHECK-NEXT:    [[M6:%.*]] = tail call fast float @llvm.minnum.f32(float [[M5]], float [[T5]])
-; CHECK-NEXT:    [[M7:%.*]] = tail call fast float @llvm.minnum.f32(float [[M6]], float [[T7]])
-; CHECK-NEXT:    ret float [[M7]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast float* [[P]] to <8 x float>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <8 x float>, <8 x float>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fmin.v8f32(<8 x float> [[TMP2]])
+; CHECK-NEXT:    ret float [[TMP3]]
 ;
   %g1 = getelementptr inbounds float, float* %p, i64 1
   %g2 = getelementptr inbounds float, float* %p, i64 2
@@ -463,14 +447,10 @@
 ; CHECK-NEXT:    [[G1:%.*]] = getelementptr inbounds double, double* [[P:%.*]], i64 1
 ; CHECK-NEXT:    [[G2:%.*]] = getelementptr inbounds double, double* [[P]], i64 2
 ; CHECK-NEXT:    [[G3:%.*]] = getelementptr inbounds double, double* [[P]], i64 3
-; CHECK-NEXT:    [[T0:%.*]] = load double, double* [[P]], align 4
-; CHECK-NEXT:    [[T1:%.*]] = load double, double* [[G1]], align 4
-; CHECK-NEXT:    [[T2:%.*]] = load double, double* [[G2]], align 4
-; CHECK-NEXT:    [[T3:%.*]] = load double, double* [[G3]], align 4
-; CHECK-NEXT:    [[M1:%.*]] = tail call fast double @llvm.minnum.f64(double [[T1]], double [[T0]])
-; CHECK-NEXT:    [[M2:%.*]] = tail call fast double @llvm.minnum.f64(double [[T2]], double [[M1]])
-; CHECK-NEXT:    [[M3:%.*]] = tail call fast double @llvm.minnum.f64(double [[T3]], double [[M2]])
-; CHECK-NEXT:    ret double [[M3]]
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast double* [[P]] to <4 x double>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <4 x double>, <4 x double>* [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = call fast double @llvm.vector.reduce.fmin.v4f64(<4 x double> [[TMP2]])
+; CHECK-NEXT:    ret double [[TMP3]]
 ;
   %g1 = getelementptr inbounds double, double* %p, i64 1
   %g2 = getelementptr inbounds double, double* %p, i64 2