Index: mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
===================================================================
--- mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
+++ mlir/include/mlir/Dialect/Vector/Transforms/VectorDistribution.h
@@ -68,6 +68,10 @@
 void populatePropagateWarpVectorDistributionPatterns(
     RewritePatternSet &pattern);
 
+/// Collect patterns that distribute a vector.reduction yielded by a
+/// WarpExecuteOnLane0Op across lanes using gpu.shuffle ops.
+void populateReductionToGPUWarpShufflePatterns(RewritePatternSet &pattern);
+
 } // namespace vector
 } // namespace mlir
 #endif // MLIR_DIALECT_VECTOR_TRANSFORMS_VECTORDISTRIBUTION_H_
Index: mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
===================================================================
--- mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
+++ mlir/lib/Dialect/Vector/Transforms/CMakeLists.txt
@@ -25,6 +25,7 @@
   MLIRBufferization
   MLIRBufferizationTransforms
   MLIRDialectUtils
+  MLIRGPUOps
   MLIRIR
   MLIRLinalg
   MLIRMemRef
Index: mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
===================================================================
--- mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
+++ mlir/lib/Dialect/Vector/Transforms/VectorDistribute.cpp
@@ -8,9 +8,11 @@
 
 #include "mlir/Dialect/Affine/IR/AffineOps.h"
 #include "mlir/Dialect/Arithmetic/IR/Arithmetic.h"
+#include "mlir/Dialect/GPU/GPUDialect.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/SCF/SCF.h"
 #include "mlir/Dialect/Vector/Transforms/VectorDistribution.h"
+#include "mlir/Dialect/Vector/Utils/VectorUtils.h"
 #include "mlir/IR/BlockAndValueMapping.h"
 #include "mlir/Transforms/SideEffectUtils.h"
 
@@ -708,6 +710,91 @@
   }
 };
 
+/// A pattern that extracts vector.reduction ops from a WarpExecuteOnLane0Op.
+/// Limited to rank-1 f32/i32 vectors whose size equals the warp size. Each
+/// lane combines two scalars per step, log2(warp_size) times (5 for 32 lanes).
+/// E.g.:
+/// ```
+/// %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+///   %0 = "some_def"() : () -> (vector<32xf32>)
+///   %1 = vector.reduction <add>, %0 : vector<32xf32> into f32
+///   vector.yield %1 : f32
+/// }
+/// ```
+/// is lowered to:
+/// ```
+/// %0 = vector.warp_execute_on_lane_0(%laneid)[32] -> (vector<1xf32>) {
+///   %1 = "some_def"() : () -> (vector<32xf32>)
+///   vector.yield %1 : vector<32xf32>
+/// }
+/// %a = vector.extract %0[0] : vector<1xf32>
+/// %r0, %s0 = gpu.shuffle xor %a, %c1, %c32 : f32
+/// %a0 = arith.addf %a, %r0 : f32
+/// %r1, %s1 = gpu.shuffle xor %a0, %c2, %c32 : f32
+/// %a1 = arith.addf %a0, %r1 : f32
+/// ...
+/// %r4, %s4 = gpu.shuffle xor %a3, %c16, %c32 : f32
+/// %r = arith.addf %a3, %r4 : f32
+/// ```
+struct ReductionToGPUWarpShuffle
+    : public OpRewritePattern<WarpExecuteOnLane0Op> {
+  using OpRewritePattern<WarpExecuteOnLane0Op>::OpRewritePattern;
+
+  LogicalResult matchAndRewrite(WarpExecuteOnLane0Op warpOp,
+                                PatternRewriter &rewriter) const override {
+    OpOperand *yieldOperand = getWarpResult(
+        warpOp, [](Operation *op) { return isa<vector::ReductionOp>(op); });
+    if (!yieldOperand)
+      return failure();
+
+    auto reductionOp =
+        cast<vector::ReductionOp>(yieldOperand->get().getDefiningOp());
+    auto vectorType = reductionOp.getVector().getType().cast<VectorType>();
+    // Only rank 1 vectors supported.
+    if (vectorType.getRank() != 1)
+      return rewriter.notifyMatchFailure(
+          warpOp, "Only rank 1 reductions can be distributed.");
+    // Only warp_size-sized vectors supported (one element per lane).
+    if (static_cast<uint64_t>(vectorType.getShape()[0]) != warpOp.getWarpSize())
+      return rewriter.notifyMatchFailure(
+          warpOp, "Reduction vector dimension must match warp size.");
+    // Only f32 and i32 element types are supported.
+    if (!reductionOp.getType().isF32() &&
+        !reductionOp.getType().isSignlessInteger(32))
+      return rewriter.notifyMatchFailure(
+          warpOp,
+          "Reduction distribution currently only supports 32bits types.");
+
+    Location yieldLoc = yieldOperand->getOwner()->getLoc();
+
+    // Return vector that will be reduced from the WarpExecuteOnLane0Op.
+    unsigned operandIndex = yieldOperand->getOperandNumber();
+    SmallVector<Value> yieldValues = {reductionOp.getVector()};
+    SmallVector<Type> retTypes = {VectorType::get({1}, reductionOp.getType())};
+    WarpExecuteOnLane0Op newWarpOp = moveRegionToNewWarpOpAndAppendReturns(
+        rewriter, warpOp, yieldValues, retTypes);
+
+    // Every lane has one scalar value. These should be reduced.
+    Value laneValVec = newWarpOp.getResult(warpOp.getNumResults());
+    Value laneVal = rewriter.create<vector::ExtractOp>(yieldLoc, laneValVec, 0);
+
+    // Butterfly XOR shuffles; NOTE(review): assumes power-of-2 warp size.
+    for (uint64_t i = 1; i < newWarpOp.getWarpSize(); i <<= 1) {
+      Value shuffled =
+          rewriter
+              .create<gpu::ShuffleOp>(reductionOp.getLoc(), laneVal, i,
+                                      /*width=*/newWarpOp.getWarpSize(),
+                                      /*mode=*/gpu::ShuffleMode::XOR)
+              .result();
+      laneVal = makeArithReduction(rewriter, reductionOp.getLoc(),
+                                   reductionOp.getKind(), laneVal, shuffled);
+    }
+    // The reduced scalar replaces the result that carried the reduction.
+    newWarpOp.getResult(operandIndex).replaceAllUsesWith(laneVal);
+    return success();
+  }
+};
+
 } // namespace
 
 void mlir::vector::populateWarpExecuteOnLane0OpToScfForPattern(
@@ -728,6 +815,11 @@
       patterns.getContext());
 }
 
+void mlir::vector::populateReductionToGPUWarpShufflePatterns(
+    RewritePatternSet &patterns) { // Adds only ReductionToGPUWarpShuffle.
+  patterns.add<ReductionToGPUWarpShuffle>(patterns.getContext());
+}
+
 void mlir::vector::moveScalarUniformCode(WarpExecuteOnLane0Op warpOp) {
   Block *body = warpOp.getBody();
 
Index: mlir/test/Dialect/Vector/vector-warp-distribute.mlir
===================================================================
--- mlir/test/Dialect/Vector/vector-warp-distribute.mlir
+++ mlir/test/Dialect/Vector/vector-warp-distribute.mlir
@@ -434,3 +434,37 @@
   "some_use"(%0#0) : (vector<1xf32>) -> ()
   return
 }
+
+// -----
+// A warp-sized vector.reduction is lowered to log2(32) = 5 shuffle/add steps.
+// CHECK-PROP-LABEL: func @vector_reduction(
+//  CHECK-PROP-SAME:     %[[laneid:.*]]: index)
+//   CHECK-PROP-DAG:   %[[c1:.*]] = arith.constant 1 : i32
+//   CHECK-PROP-DAG:   %[[c2:.*]] = arith.constant 2 : i32
+//   CHECK-PROP-DAG:   %[[c4:.*]] = arith.constant 4 : i32
+//   CHECK-PROP-DAG:   %[[c8:.*]] = arith.constant 8 : i32
+//   CHECK-PROP-DAG:   %[[c16:.*]] = arith.constant 16 : i32
+//   CHECK-PROP-DAG:   %[[c32:.*]] = arith.constant 32 : i32
+//       CHECK-PROP:   %[[warp_op:.*]] = vector.warp_execute_on_lane_0(%[[laneid]])[32] -> (vector<1xf32>) {
+//       CHECK-PROP:     vector.yield %{{.*}} : vector<32xf32>
+//       CHECK-PROP:   }
+//       CHECK-PROP:   %[[a:.*]] = vector.extract %[[warp_op]][0] : vector<1xf32>
+//       CHECK-PROP:   %[[r0:.*]], %{{.*}} = gpu.shuffle  xor %[[a]], %[[c1]], %[[c32]]
+//       CHECK-PROP:   %[[a0:.*]] = arith.addf %[[a]], %[[r0]]
+//       CHECK-PROP:   %[[r1:.*]], %{{.*}} = gpu.shuffle  xor %[[a0]], %[[c2]], %[[c32]]
+//       CHECK-PROP:   %[[a1:.*]] = arith.addf %[[a0]], %[[r1]]
+//       CHECK-PROP:   %[[r2:.*]], %{{.*}} = gpu.shuffle  xor %[[a1]], %[[c4]], %[[c32]]
+//       CHECK-PROP:   %[[a2:.*]] = arith.addf %[[a1]], %[[r2]]
+//       CHECK-PROP:   %[[r3:.*]], %{{.*}} = gpu.shuffle  xor %[[a2]], %[[c8]], %[[c32]]
+//       CHECK-PROP:   %[[a3:.*]] = arith.addf %[[a2]], %[[r3]]
+//       CHECK-PROP:   %[[r4:.*]], %{{.*}} = gpu.shuffle  xor %[[a3]], %[[c16]], %[[c32]]
+//       CHECK-PROP:   %[[a4:.*]] = arith.addf %[[a3]], %[[r4]]
+//       CHECK-PROP:   return %[[a4]] : f32
+func.func @vector_reduction(%laneid: index) -> (f32) {
+  %r = vector.warp_execute_on_lane_0(%laneid)[32] -> (f32) {
+    %0 = "some_def"() : () -> (vector<32xf32>)
+    %1 = vector.reduction <add>, %0 : vector<32xf32> into f32
+    vector.yield %1 : f32
+  }
+  return %r : f32
+}
Index: mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
===================================================================
--- /dev/null
+++ mlir/test/Integration/Dialect/Vector/GPU/CUDA/test-reduction-distribute.mlir
@@ -0,0 +1,69 @@
+// RUN: mlir-opt %s -test-vector-warp-distribute="hoist-uniform distribute-transfer-write propagate-distribution" -canonicalize |\
+// RUN: mlir-opt -test-vector-warp-distribute=rewrite-warp-ops-to-scf-if |\
+// RUN: mlir-opt  -lower-affine -convert-scf-to-cf -convert-vector-to-llvm \
+// RUN:  -convert-arith-to-llvm -gpu-kernel-outlining \
+// RUN:  -pass-pipeline='gpu.module(strip-debuginfo,convert-gpu-to-nvvm,reconcile-unrealized-casts,gpu-to-cubin)' \
+// RUN:  -gpu-to-llvm -reconcile-unrealized-casts |\
+// RUN: mlir-cpu-runner -e main -entry-point-result=void \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_cuda_runtime%shlibext \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_c_runner_utils%shlibext \
+// RUN:   -shared-libs=%mlir_runner_utils_dir/libmlir_runner_utils%shlibext | \
+// RUN: FileCheck %s
+
+// Reduce 1024 floats in warp-sized tiles of 32, accumulate onto %out[0], then
+// divide elementwise by 100. With @main's data: (32 * 496 + 2) / 100 = 158.74.
+func.func @gpu_func(%in: memref<1024xf32>, %out: memref<1xf32>) {
+  %c1 = arith.constant 1 : index
+  %cst = arith.constant dense<100.0000> : vector<1xf32>
+  %cst_0 = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c1024 = arith.constant 1024 : index
+  %c32 = arith.constant 32 : index
+  gpu.launch blocks(%arg3, %arg4, %arg5)
+  in (%arg9 = %c1, %arg10 = %c1, %arg11 = %c1)
+  threads(%arg6, %arg7, %arg8) in (%arg12 = %c32, %arg13 = %c1, %arg14 = %c1) {
+    vector.warp_execute_on_lane_0(%arg6)[32] {
+      %init = vector.transfer_read %out[%c0], %cst_0 {in_bounds = [true]} : memref<1xf32>, vector<1xf32>
+      %13 = scf.for %arg0 = %c0 to %c1024 step %c32 iter_args(%arg1 = %init) -> (vector<1xf32>) {
+        %20 = vector.transfer_read %in[%arg0], %cst_0 {in_bounds = [true]} : memref<1024xf32>, vector<32xf32>
+        %21 = vector.reduction <add>, %20 : vector<32xf32> into f32
+        %22 = vector.broadcast %21 : f32 to vector<1xf32>
+        %23 = arith.addf %22, %arg1 : vector<1xf32>
+        scf.yield %23 : vector<1xf32>
+      }
+      %14 = arith.divf %13, %cst : vector<1xf32>
+      vector.transfer_write %14, %out[%c0] {in_bounds = [true]} : vector<1xf32>, memref<1xf32>
+    }
+    gpu.terminator
+  }
+  return
+}
+func.func @main() {
+  %cst = arith.constant 0.000000e+00 : f32
+  %c0 = arith.constant 0 : index
+  %c32 = arith.constant 32 : index
+  %c1024 = arith.constant 1024 : index
+  %0 = memref.alloc() : memref<1024xf32>
+  %1 = memref.alloc() : memref<1xf32>
+  %cst_1 = arith.constant dense<[
+    0.0,  1.0,  2.0,  3.0,  4.0,  5.0,  6.0,  7.0,
+    8.0,  9.0,  10.0, 11.0, 12.0, 13.0, 14.0, 15.0,
+    16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0,
+    24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0]> : vector<32xf32>
+  %cst_2 = arith.constant dense<2.000000e+00> : vector<1xf32>
+  // Init the buffers: %0 holds 32 copies of 0..31, %1 is seeded with 2.0.
+  scf.for %i = %c0 to %c1024 step %c32 {
+    vector.transfer_write %cst_1, %0[%i] {in_bounds = [true]} : vector<32xf32>, memref<1024xf32>
+  }
+  vector.transfer_write %cst_2, %1[%c0] {in_bounds = [true]} : vector<1xf32>, memref<1xf32>
+  %3 = memref.cast %0 : memref<1024xf32> to memref<*xf32>
+  gpu.host_register %3 : memref<*xf32>
+  %5 = memref.cast %1 : memref<1xf32> to memref<*xf32>
+  gpu.host_register %5 : memref<*xf32>
+  call @gpu_func(%0, %1) : (memref<1024xf32>, memref<1xf32>) -> ()
+  %6 = vector.transfer_read %1[%c0], %cst : memref<1xf32>, vector<1xf32>
+  vector.print %6 : vector<1xf32>
+  return
+}
+
+// CHECK: ( 158.74 )
Index: mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
===================================================================
--- mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
+++ mlir/test/lib/Dialect/Vector/TestVectorTransforms.cpp
@@ -861,6 +861,7 @@
     if (propagateDistribution) {
       RewritePatternSet patterns(ctx);
       vector::populatePropagateWarpVectorDistributionPatterns(patterns);
+      vector::populateReductionToGPUWarpShufflePatterns(patterns);
       (void)applyPatternsAndFoldGreedily(getOperation(), std::move(patterns));
     }
     WarpExecuteOnLane0LoweringOptions options;
Index: utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
===================================================================
--- utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
+++ utils/bazel/llvm-project-overlay/mlir/BUILD.bazel
@@ -3121,6 +3121,7 @@
         ":BufferizationTransforms",
         ":DialectUtils",
         ":FuncDialect",
+        ":GPUDialect",
         ":IR",
         ":LinalgOps",
         ":MemRefDialect",