Index: include/llvm/CodeGen/ExpandReductions.h
===================================================================
--- /dev/null
+++ include/llvm/CodeGen/ExpandReductions.h
@@ -0,0 +1,24 @@
+//===- ExpandReductions.h - Expand reduction intrinsics ---------*- C++ -*-===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EXPANDREDUCTIONS_H
+#define LLVM_CODEGEN_EXPANDREDUCTIONS_H
+
+#include "llvm/IR/PassManager.h"
+
+namespace llvm {
+
+/// New pass manager entry point for expanding reduction intrinsics in IR.
+struct ExpandReductionsPass : PassInfoMixin<ExpandReductionsPass> {
+  /// Expands the reductions in \p F; preserves everything if nothing changed.
+  PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM);
+};
+} // end namespace llvm
+
+#endif // LLVM_CODEGEN_EXPANDREDUCTIONS_H
Index: include/llvm/CodeGen/Passes.h
===================================================================
--- include/llvm/CodeGen/Passes.h
+++ include/llvm/CodeGen/Passes.h
@@ -405,6 +405,10 @@
   /// printing assembly.
   ModulePass *createMachineOutlinerPass();
 
+  /// Expand experimental reduction intrinsics into shuffle sequences; ordered
+  /// (non-fast) fadd/fmul reductions are left as calls.
+  FunctionPass *createExpandReductionsPass();
+
 } // End llvm namespace
 
 /// Target machine pass initializer for passes with dependencies. Use with
Index: include/llvm/InitializePasses.h
===================================================================
--- include/llvm/InitializePasses.h
+++ include/llvm/InitializePasses.h
@@ -130,6 +130,7 @@
 void initializeEliminateAvailableExternallyLegacyPassPass(PassRegistry&);
 void initializeExpandISelPseudosPass(PassRegistry&);
 void initializeExpandPostRAPass(PassRegistry&);
+void initializeExpandReductionsPass(PassRegistry&);
 void initializeExternalAAWrapperPassPass(PassRegistry&);
 void initializeFEntryInserterPass(PassRegistry&);
 void initializeFinalizeMachineBundlesPass(PassRegistry&);
Index: include/llvm/Transforms/Utils/LoopUtils.h
===================================================================
--- include/llvm/Transforms/Utils/LoopUtils.h
+++ include/llvm/Transforms/Utils/LoopUtils.h
@@ -490,6 +490,12 @@
                         LoopSafetyInfo *SafetyInfo,
                         OptimizationRemarkEmitter *ORE = nullptr);
 
+/// Emits a log2(VF) shuffle reduction of \p Src; assumes VF is a power of 2.
+Value *getShuffleReduction(
+    IRBuilder<> &Builder, Value *Src, unsigned Op,
+    RecurrenceDescriptor::MinMaxRecurrenceKind *MinMaxKind = nullptr,
+    ArrayRef<Value *> RedOps = ArrayRef<Value *>());
+
 /// Create a target reduction of the given vector. The reduction must be simple,
 /// that is, it must not be complex like a minmax reduction.
 Value *createTargetReduction(IRBuilder<> &B, const TargetTransformInfo *TTI,
Index: lib/CodeGen/CMakeLists.txt
===================================================================
--- lib/CodeGen/CMakeLists.txt
+++ lib/CodeGen/CMakeLists.txt
@@ -23,6 +23,7 @@
   ExecutionDepsFix.cpp
   ExpandISelPseudos.cpp
   ExpandPostRAPseudos.cpp
+  ExpandReductions.cpp
   FaultMaps.cpp
   FEntryInserter.cpp
   FuncletLayout.cpp
Index: lib/CodeGen/ExpandReductions.cpp
===================================================================
--- /dev/null
+++ lib/CodeGen/ExpandReductions.cpp
@@ -0,0 +1,179 @@
+//===--- ExpandReductions.cpp - Expand experimental reduction intrinsics --===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements IR expansion for reduction intrinsics, allowing targets
+// to enable the experimental intrinsics until just before codegen.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/ExpandReductions.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+
+using namespace llvm;
+
+namespace {
+
+/// Returns the IR opcode that combines lanes for reduction intrinsic \p ID.
+/// Min/max reductions yield the compare opcode; getMRK() gives their kind.
+unsigned getOpcode(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::experimental_vector_reduce_fadd:
+    return Instruction::FAdd;
+  case Intrinsic::experimental_vector_reduce_fmul:
+    return Instruction::FMul;
+  case Intrinsic::experimental_vector_reduce_add:
+    return Instruction::Add;
+  case Intrinsic::experimental_vector_reduce_mul:
+    return Instruction::Mul;
+  case Intrinsic::experimental_vector_reduce_and:
+    return Instruction::And;
+  case Intrinsic::experimental_vector_reduce_or:
+    return Instruction::Or;
+  case Intrinsic::experimental_vector_reduce_xor:
+    return Instruction::Xor;
+  case Intrinsic::experimental_vector_reduce_smax:
+  case Intrinsic::experimental_vector_reduce_smin:
+  case Intrinsic::experimental_vector_reduce_umax:
+  case Intrinsic::experimental_vector_reduce_umin:
+    return Instruction::ICmp;
+  case Intrinsic::experimental_vector_reduce_fmax:
+  case Intrinsic::experimental_vector_reduce_fmin:
+    return Instruction::FCmp;
+  default:
+    llvm_unreachable("Unexpected ID");
+  }
+}
+
+/// Returns the min/max recurrence kind for min/max reduction intrinsic \p ID.
+RecurrenceDescriptor::MinMaxRecurrenceKind getMRK(Intrinsic::ID ID) {
+  switch (ID) {
+  case Intrinsic::experimental_vector_reduce_smax:
+    return RecurrenceDescriptor::MRK_SIntMax;
+  case Intrinsic::experimental_vector_reduce_smin:
+    return RecurrenceDescriptor::MRK_SIntMin;
+  case Intrinsic::experimental_vector_reduce_umax:
+    return RecurrenceDescriptor::MRK_UIntMax;
+  case Intrinsic::experimental_vector_reduce_umin:
+    return RecurrenceDescriptor::MRK_UIntMin;
+  case Intrinsic::experimental_vector_reduce_fmax:
+    return RecurrenceDescriptor::MRK_FloatMax;
+  case Intrinsic::experimental_vector_reduce_fmin:
+    return RecurrenceDescriptor::MRK_FloatMin;
+  default:
+    llvm_unreachable("Unexpected ID");
+  }
+}
+
+/// Replaces the reduction call \p II with a shuffle reduction of \p Vec and
+/// erases the call. \p MinMaxKind must be null unless \p II is a min/max
+/// reduction.
+void replaceWithShuffleReduction(
+    IntrinsicInst *II, Value *Vec,
+    RecurrenceDescriptor::MinMaxRecurrenceKind *MinMaxKind = nullptr) {
+  // The builder takes its debug location from II and applies it to every
+  // instruction it creates. Do not cast the result to Instruction: with a
+  // constant input the builder may fold the whole sequence to a Constant.
+  IRBuilder<> Builder(II);
+  const unsigned Opcode = getOpcode(II->getIntrinsicID());
+  Value *Rdx = getShuffleReduction(Builder, Vec, Opcode, MinMaxKind);
+  II->replaceAllUsesWith(Rdx);
+  II->eraseFromParent();
+}
+
+/// Expands the reduction intrinsic calls in \p F into shuffle sequences.
+/// Ordered fadd/fmul reductions (no unsafe-algebra flag) are left untouched.
+/// Returns true if any call was replaced.
+bool expandReductions(Function &F) {
+  bool Changed = false;
+  inst_iterator NextIt;
+  for (inst_iterator I = inst_begin(F), E = inst_end(F); I != E; I = NextIt) {
+    // Advance first: the current instruction may be erased below.
+    NextIt = std::next(I);
+    auto *II = dyn_cast<IntrinsicInst>(&*I);
+    if (!II)
+      continue;
+    const Intrinsic::ID ID = II->getIntrinsicID();
+    switch (ID) {
+    case Intrinsic::experimental_vector_reduce_fadd:
+    case Intrinsic::experimental_vector_reduce_fmul:
+      // FMFs must be attached to the call, otherwise it's an ordered reduction
+      // and it can't be handled by generating this shuffle sequence.
+      if (!II->getFastMathFlags().unsafeAlgebra())
+        continue;
+      // The vector is operand 1; operand 0 is not used by this expansion.
+      replaceWithShuffleReduction(II, II->getArgOperand(1));
+      break;
+    case Intrinsic::experimental_vector_reduce_add:
+    case Intrinsic::experimental_vector_reduce_mul:
+    case Intrinsic::experimental_vector_reduce_and:
+    case Intrinsic::experimental_vector_reduce_or:
+    case Intrinsic::experimental_vector_reduce_xor:
+      replaceWithShuffleReduction(II, II->getArgOperand(0));
+      break;
+    case Intrinsic::experimental_vector_reduce_smax:
+    case Intrinsic::experimental_vector_reduce_smin:
+    case Intrinsic::experimental_vector_reduce_umax:
+    case Intrinsic::experimental_vector_reduce_umin:
+    case Intrinsic::experimental_vector_reduce_fmax:
+    case Intrinsic::experimental_vector_reduce_fmin: {
+      auto MRK = getMRK(ID);
+      replaceWithShuffleReduction(II, II->getArgOperand(0), &MRK);
+      break;
+    }
+    default:
+      continue;
+    }
+    Changed = true;
+  }
+  return Changed;
+}
+
+/// Legacy pass manager wrapper around expandReductions().
+class ExpandReductions : public FunctionPass {
+public:
+  static char ID;
+  ExpandReductions() : FunctionPass(ID) {}
+
+  bool runOnFunction(Function &F) override { return expandReductions(F); }
+
+  // The expansion only inserts and erases non-terminator instructions, so the
+  // CFG is never modified.
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+  }
+};
+
+char ExpandReductions::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(ExpandReductions, "expand-reductions",
+                "Expand reduction intrinsics", false, false)
+
+FunctionPass *llvm::createExpandReductionsPass() {
+  return new ExpandReductions();
+}
+
+PreservedAnalyses ExpandReductionsPass::run(Function &F,
+                                            FunctionAnalysisManager &AM) {
+  if (!expandReductions(F))
+    return PreservedAnalyses::all();
+  // Only non-terminator instructions change, so CFG-only analyses stay valid.
+  PreservedAnalyses PA;
+  PA.preserveSet<CFGAnalyses>();
+  return PA;
+}
Index: lib/Transforms/Utils/LoopUtils.cpp
===================================================================
--- lib/Transforms/Utils/LoopUtils.cpp
+++ lib/Transforms/Utils/LoopUtils.cpp
@@ -1125,10 +1125,10 @@
 }
 
 // Helper to generate a log2 shuffle reduction.
-static Value *getShuffleReduction(
+Value *llvm::getShuffleReduction(
     IRBuilder<> &Builder, Value *Src, unsigned Op,
-    RecurrenceDescriptor::MinMaxRecurrenceKind *MinMaxKind = nullptr,
-    ArrayRef<Value *> RedOps = ArrayRef<Value *>()) {
+    RecurrenceDescriptor::MinMaxRecurrenceKind *MinMaxKind,
+    ArrayRef<Value *> RedOps) {
   unsigned VF = Src->getType()->getVectorNumElements();
   // VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
   // and vector ops, reducing the set of values being computed by half each
Index: test/CodeGen/Generic/expand-experimental-reductions.ll
===================================================================
--- /dev/null
+++ test/CodeGen/Generic/expand-experimental-reductions.ll
@@ -0,0 +1,167 @@
+; RUN: opt < %s -mtriple=aarch64 -expand-reductions -S | FileCheck %s
+declare i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64>)
+
+declare float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float, <4 x float>)
+declare float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float, <4 x float>)
+
+declare i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64>)
+declare i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64>)
+
+declare double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double>)
+declare double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double>)
+
+
+define i64 @add_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @add_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = add <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[RDX]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.add.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @mul_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @mul_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = mul <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[RDX]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.mul.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @and_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @and_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = and <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[RDX]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.and.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @or_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @or_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = or <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[RDX]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.or.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @xor_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @xor_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = xor <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[RDX]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.xor.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define float @fadd_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fadd_f32
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x float> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = fadd fast <4 x float> %vec, [[SHUF]]
+; CHECK-NEXT: [[SHUF2:%[a-zA-Z0-9.]+]] = shufflevector <4 x float> [[RDX]]
+; CHECK-NEXT: [[RDX2:%[a-zA-Z0-9.]+]] = fadd fast <4 x float> [[RDX]], [[SHUF2]]
+; CHECK-NEXT: extractelement <4 x float> [[RDX2]], i32 0
+entry:
+  %r = call fast float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define float @fadd_f32_strict(<4 x float> %vec) {
+entry:
+; CHECK-LABEL: @fadd_f32_strict
+; CHECK-NOT: shufflevector
+; CHECK: call float @llvm.experimental.vector.reduce.fadd
+  %r = call float @llvm.experimental.vector.reduce.fadd.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define float @fmul_f32(<4 x float> %vec) {
+; CHECK-LABEL: @fmul_f32
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <4 x float> %vec
+; CHECK-NEXT: [[RDX:%[a-zA-Z0-9.]+]] = fmul fast <4 x float> %vec, [[SHUF]]
+; CHECK-NEXT: [[SHUF2:%[a-zA-Z0-9.]+]] = shufflevector <4 x float> [[RDX]]
+; CHECK-NEXT: [[RDX2:%[a-zA-Z0-9.]+]] = fmul fast <4 x float> [[RDX]], [[SHUF2]]
+; CHECK-NEXT: extractelement <4 x float> [[RDX2]], i32 0
+entry:
+  %r = call fast float @llvm.experimental.vector.reduce.fmul.f32.v4f32(float undef, <4 x float> %vec)
+  ret float %r
+}
+
+define i64 @smax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smax_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = icmp sgt <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x i64> %vec, <2 x i64> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[SEL]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.smax.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @smin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @smin_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = icmp slt <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x i64> %vec, <2 x i64> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[SEL]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.smin.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @umax_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umax_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = icmp ugt <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x i64> %vec, <2 x i64> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[SEL]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.umax.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define i64 @umin_i64(<2 x i64> %vec) {
+; CHECK-LABEL: @umin_i64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x i64> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = icmp ult <2 x i64> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x i64> %vec, <2 x i64> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x i64> [[SEL]], i32 0
+entry:
+  %r = call i64 @llvm.experimental.vector.reduce.umin.i64.v2i64(<2 x i64> %vec)
+  ret i64 %r
+}
+
+define double @fmax_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmax_f64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x double> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = fcmp fast ogt <2 x double> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x double> %vec, <2 x double> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x double> [[SEL]], i32 0
+entry:
+  %r = call double @llvm.experimental.vector.reduce.fmax.f64.v2f64(<2 x double> %vec)
+  ret double %r
+}
+
+define double @fmin_f64(<2 x double> %vec) {
+; CHECK-LABEL: @fmin_f64
+; CHECK: [[SHUF:%[a-zA-Z0-9.]+]] = shufflevector <2 x double> %vec
+; CHECK-NEXT: [[CMP:%[a-zA-Z0-9.]+]] = fcmp fast olt <2 x double> %vec, [[SHUF]]
+; CHECK-NEXT: [[SEL:%[a-zA-Z0-9.]+]] = select <2 x i1> [[CMP]], <2 x double> %vec, <2 x double> [[SHUF]]
+; CHECK-NEXT: extractelement <2 x double> [[SEL]], i32 0
+entry:
+  %r = call double @llvm.experimental.vector.reduce.fmin.f64.v2f64(<2 x double> %vec)
+  ret double %r
+}
Index: tools/llc/llc.cpp
===================================================================
--- tools/llc/llc.cpp
+++ tools/llc/llc.cpp
@@ -296,6 +296,7 @@
   initializeConstantHoistingLegacyPassPass(*Registry);
   initializeScalarOpts(*Registry);
   initializeVectorization(*Registry);
+  initializeExpandReductionsPass(*Registry);
 
   // Register the target printer for --version.
   cl::AddExtraVersionPrinter(TargetRegistry::printRegisteredTargetsForVersion);
Index: tools/opt/opt.cpp
===================================================================
--- tools/opt/opt.cpp
+++ tools/opt/opt.cpp
@@ -397,6 +397,7 @@
   initializeInterleavedAccessPass(Registry);
   initializeCountingFunctionInserterPass(Registry);
   initializeUnreachableBlockElimLegacyPassPass(Registry);
+  initializeExpandReductionsPass(Registry);
 
 #ifdef LINK_POLLY_INTO_TOOLS
   polly::initializePollyPasses(Registry);