diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -108,6 +108,35 @@ def ROCDL_MbcntLoOp : ROCDL_MbcntOp<"lo">; def ROCDL_MbcntHiOp : ROCDL_MbcntOp<"hi">; +def ROCDL_DsSwizzleOp : +ROCDL_Op<"ds_swizzle">, +Results<(outs I32:$res)>, +Arguments<(ins I32:$src, + I32:$offset)> +{ + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_swizzle, {$src, $offset}); + }]; + let assemblyFormat = [{ + $src `,` $offset attr-dict `:` `(` type($src) `,` type($offset) `)` `->` type($res) + }]; +} + +def ROCDL_DsBpermuteOp : +ROCDL_Op<"ds_bpermute">, +Results<(outs I32:$res)>, +Arguments<(ins I32:$index, + I32:$src)> +{ + string llvmBuilder = [{ + $res = createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_ds_bpermute, {$index, $src}); + }]; + let assemblyFormat = [{ + $index `,` $src attr-dict `:` `(` type($index) `,` type($src) `)` `->` type($res) + }]; +} + + //===----------------------------------------------------------------------===// // Thread index and Block index diff --git a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp --- a/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp +++ b/mlir/lib/Conversion/GPUToROCDL/LowerGpuOpsToROCDLOps.cpp @@ -64,6 +64,18 @@ return canBeBare; } +Value getLaneId(ConversionPatternRewriter &rewriter, Location loc, + const unsigned indexBitwidth) { + auto int32Type = IntegerType::get(rewriter.getContext(), 32); + Value zero = rewriter.createOrFold(loc, 0, 32); + Value minus1 = rewriter.createOrFold(loc, -1, 32); + Value mbcntLo = rewriter.create(loc, int32Type, + ValueRange{minus1, zero}); + Value laneId = rewriter.create(loc, int32Type, + ValueRange{minus1, mbcntLo}); + return laneId; +} + namespace { struct GPULaneIdOpToROCDL : ConvertOpToLLVMPattern { using 
ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; @@ -98,6 +110,86 @@ } }; +struct GPUShuffleOpLowering : public ConvertOpToLLVMPattern { + using ConvertOpToLLVMPattern::ConvertOpToLLVMPattern; + + /// Lowers a shuffle to the corresponding ROCDL op. + + /// + /// Convert the `width` argument into an activeMask (a bitmask which specifies + /// which threads participate in the shuffle) and a maskAndClamp (specifying + /// the highest lane which participates in the shuffle). + /// + /// + /// DS Bpermute: + /// let shflMode = [xor, up, down, idx] + /// let warpSize = 32, step = [1, 2, 4, 8, 16, ... , WarpSize]. + /// 1. curLaneId = using mbcnt.lo + mbcnt.hi + /// 2. warpSizeOrZeroIfOutside = (curLaneId + warpSize) & -warpSize + /// 3. dstLane = shflMode(curLaneId, step) + /// 4. isActiveSrcLane = dstLane < warpSizeOrZeroIfOutside + /// 5. dstLane = isActiveSrcLane ? dstLane : curLaneId + /// 6. dwordAlignedDstLane = dstLane * 4 or dstLane << 2. + /// 7. bpermute(dwordAlignedDstLane, shfl_value). + /// + LogicalResult + matchAndRewrite(gpu::ShuffleOp op, OpAdaptor adaptor, + ConversionPatternRewriter &rewriter) const override { + Location loc = op->getLoc(); + // TODO: Add support for non 32-bit shuffle values. + if (adaptor.getValue().getType().getIntOrFloatBitWidth() != 32) + return failure(); + const unsigned indexBitwidth = getTypeConverter()->getIndexTypeBitwidth(); + Value srcLaneId = getLaneId(rewriter, loc, indexBitwidth); + + // TODO: Change warp from const to width. + auto int32Type = IntegerType::get(rewriter.getContext(), 32); + const int kWarpSize = 32; + Value warpSize = adaptor.getWidth(); + Value zero = rewriter.create(loc, int32Type, 0); + Value negWarpSize = + rewriter.create(loc, int32Type, zero, warpSize); + Value add = + rewriter.create(loc, int32Type, srcLaneId, warpSize); + Value warpSizeOrZeroIfOutside = + rewriter.create(loc, int32Type, add, negWarpSize); + Value dstLane; + // TODO: Add support for gpu::ShuffleMode::UP and gpu::ShuffleMode::DOWN. 
+ // TODO: Use ds_swizzle for XOR when step/offsets are constants for better + // perf. + switch (op.getMode()) { + case gpu::ShuffleMode::XOR: + dstLane = rewriter.create(loc, int32Type, srcLaneId, + adaptor.getOffset()); + break; + case gpu::ShuffleMode::IDX: + dstLane = adaptor.getOffset(); + break; + default: + return failure(); + } + Value isActiveSrcLane = rewriter.create( + loc, LLVM::ICmpPredicate::slt, dstLane, warpSizeOrZeroIfOutside); + Value selectDstLane = rewriter.create(loc, isActiveSrcLane, + dstLane, srcLaneId); + Value two = rewriter.create(loc, int32Type, 2); + Value dwordAlignedDstLane = + rewriter.create(loc, int32Type, selectDstLane, two); + Value initShflValue = adaptor.getValue(); + if (adaptor.getValue().getType().isF32()) { + initShflValue = + rewriter.create(loc, int32Type, initShflValue); + } + Value shflValue = rewriter.create( + loc, int32Type, dwordAlignedDstLane, initShflValue); + if (adaptor.getValue().getType().isF32()) { + shflValue = rewriter.create( + loc, adaptor.getValue().getType(), shflValue); + } + rewriter.replaceOp(op, {shflValue, isActiveSrcLane}); + return success(); + } +}; + /// Import the GPU Ops to ROCDL Patterns. #include "GPUToROCDL.cpp.inc" @@ -278,7 +370,7 @@ patterns.add(converter, /*addressSpace=*/4); } - patterns.add(converter); + patterns.add(converter); populateOpPatterns(converter, patterns, "__ocml_fabs_f32", "__ocml_fabs_f64"); diff --git a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir --- a/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir +++ b/mlir/test/Conversion/GPUToROCDL/gpu-to-rocdl.mlir @@ -524,3 +524,92 @@ } } +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_all_reduce_op() + gpu.func @gpu_all_reduce_op() { + %arg0 = arith.constant 1.0 : f32 + // TODO: Check full IR expansion once lowering has settled. 
+ // CHECK: llvm.add + // CHECK: llvm.and + // CHECK: llvm.xor + // CHECK: llvm.icmp "slt" + // CHECK: llvm.select + // CHECK: llvm.shl + // CHECK: rocdl.ds_bpermute {{.*}} + // CHECK: rocdl.barrier + // CHECK: llvm.bitcast + // CHECK: llvm.fadd + %result = gpu.all_reduce add %arg0 uniform {} : (f32) -> (f32) + + gpu.return + } +} + + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_all_reduce_region() + gpu.func @gpu_all_reduce_region() { + %arg0 = arith.constant 1 : i32 + // TODO: Check full IR expansion once lowering has settled. + // CHECK: llvm.add + // CHECK: llvm.and + // CHECK: llvm.xor + // CHECK: llvm.icmp "slt" + // CHECK: llvm.select + // CHECK: llvm.shl + // CHECK: rocdl.ds_bpermute {{.*}} + // CHECK: rocdl.barrier + %result = gpu.all_reduce %arg0 uniform { + ^bb(%lhs : i32, %rhs : i32): + %xor = arith.xori %lhs, %rhs : i32 + "gpu.yield"(%xor) : (i32) -> () + } : (i32) -> (i32) + gpu.return + } +} + +// ----- + +gpu.module @test_module { + // CHECK-LABEL: func @gpu_shuffle() + func.func @gpu_shuffle() -> (f32, f32) { + // CHECK: %[[#VALUE:]] = llvm.mlir.constant(1.000000e+00 : f32) : f32 + %arg0 = arith.constant 1.0 : f32 + // CHECK: %[[#OFFSET:]] = llvm.mlir.constant(4 : i32) : i32 + %arg1 = arith.constant 4 : i32 + // CHECK: %[[#WIDTH:]] = llvm.mlir.constant(23 : i32) : i32 + %arg2 = arith.constant 23 : i32 + // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi + // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 + // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32 + // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32 + // CHECK: %[[#XOR:]] = llvm.xor %[[#LANE_ID]], %{{.*}} : i32 + // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#XOR]], %[[#WARP_OR_ZERO]] : i32 + // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#XOR]], %{{.*}} : i1, i32 + // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[#ALIGNED_DST_LANE:]] = 
llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32 + // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 + // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 + // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 + %shfl, %pred = gpu.shuffle xor %arg0, %arg1, %arg2 : f32 + // CHECK: %[[#LANE_ID:]] = rocdl.mbcnt.hi + // CHECK: %[[#ZERO:]] = llvm.mlir.constant(0 : i32) : i32 + // CHECK: %[[#NEG_WIDTH:]] = llvm.sub %[[#ZERO]], %[[#WIDTH]] : i32 + // CHECK: %[[#ADD:]] = llvm.add %[[#LANE_ID]], %[[#WIDTH]] : i32 + // CHECK: %[[#WARP_OR_ZERO:]] = llvm.and %[[#ADD]], %[[#NEG_WIDTH]] : i32 + // CHECK: %[[#CMP:]] = llvm.icmp "slt" %[[#OFFSET]], %[[#WARP_OR_ZERO]] : i32 + // CHECK: %[[#DST_LANE:]] = llvm.select %[[#CMP]], %[[#OFFSET]], %{{.*}} : i1, i32 + // CHECK: %[[#TWO:]] = llvm.mlir.constant(2 : i32) : i32 + // CHECK: %[[#ALIGNED_DST_LANE:]] = llvm.shl %[[#DST_LANE]], %[[#TWO]] : i32 + // CHECK: %[[#CAST_VALUE:]] = llvm.bitcast %[[#VALUE]] : f32 to i32 + // CHECK: %[[#PERMUTE:]] = rocdl.ds_bpermute %[[#ALIGNED_DST_LANE]], %[[#CAST_VALUE]] : (i32, i32) -> i32 + // CHECK: %[[#CAST_SHFL_VALUE:]] = llvm.bitcast %[[#PERMUTE]] : i32 to f32 + %shfli, %predi = gpu.shuffle idx %arg0, %arg1, %arg2 : f32 + func.return %shfl, %shfli : f32, f32 + } +} \ No newline at end of file diff --git a/mlir/test/Conversion/GPUToROCDL/invalid.mlir b/mlir/test/Conversion/GPUToROCDL/invalid.mlir deleted file mode 100644 --- a/mlir/test/Conversion/GPUToROCDL/invalid.mlir +++ /dev/null @@ -1,17 +0,0 @@ -// RUN: mlir-opt %s -convert-gpu-to-rocdl -split-input-file --verify-diagnostics - - -// Demonstrate the need to register the cf and memref dialect as dependent. 
-// CHECK-LABEL: @dependentDialect -gpu.module @module { - gpu.func @dependentDialect() { - %arg0 = arith.constant 1 : i32 - // expected-error@+1 {{failed to legalize operation 'gpu.shuffle' that was explicitly marked illega}} - %result = gpu.all_reduce %arg0 uniform { - ^bb(%lhs : i32, %rhs : i32): - %xor = arith.xori %lhs, %rhs : i32 - "gpu.yield"(%xor) : (i32) -> () - } : (i32) -> (i32) - gpu.return - } -} diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -66,6 +66,22 @@ llvm.return %3 : i32 } +llvm.func @rocdl.swizzle(%src : i32) -> i32 { + // CHECK-LABEL: rocdl.swizzle + // CHECK: call i32 @llvm.amdgcn.ds.swizzle + %offset = llvm.mlir.constant(100 : i32) : i32 + %0 = rocdl.ds_swizzle %src, %offset : (i32, i32) -> i32 + llvm.return %0 : i32 +} + +llvm.func @rocdl.bpermute(%src : i32) -> i32 { + // CHECK-LABEL: rocdl.bpermute + // CHECK: call i32 @llvm.amdgcn.ds.bpermute + %index = llvm.mlir.constant(10 : i32) : i32 + %0 = rocdl.ds_bpermute %index, %src : (i32, i32) -> i32 + llvm.return %0 : i32 +} + llvm.func @rocdl.barrier() { // CHECK: fence syncscope("workgroup") release // CHECK-NEXT: call void @llvm.amdgcn.s.barrier()