diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h
@@ -38,6 +38,9 @@
 /// Add patterns to expand Arith ceil/floor division ops.
 void populateCeilFloorDivExpandOpsPatterns(RewritePatternSet &patterns);
 
+/// Add patterns to expand Arith bf16 patterns to lower level bitcasts/shifts.
+void populateExpandBFloat16Patterns(RewritePatternSet &patterns);
+
 /// Add patterns to expand Arith ops.
 void populateArithExpandOpsPatterns(RewritePatternSet &patterns);
 
diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
--- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.td
@@ -32,6 +32,10 @@
   let summary = "Legalize Arith ops to be convertible to LLVM.";
   let constructor = "mlir::arith::createArithExpandOpsPass()";
   let dependentDialects = ["vector::VectorDialect"];
+  let options = [
+    Option<"includeBf16", "include-bf16", "bool", /*default=*/"false",
+           "Enable the BF16 expansion patterns">,
+  ];
 }
 
 def ArithUnsignedWhenEquivalent : Pass<"arith-unsigned-when-equivalent"> {
diff --git a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
--- a/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
+++ b/mlir/lib/Dialect/Arith/Transforms/ExpandOps.cpp
@@ -10,6 +10,7 @@
 
 #include "mlir/Dialect/Arith/IR/Arith.h"
 #include "mlir/Dialect/Vector/IR/VectorOps.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/IR/TypeUtilities.h"
 #include "mlir/Transforms/DialectConversion.h"
 
@@ -25,15 +26,13 @@
 /// Create an integer or index constant.
 static Value createConst(Location loc, Type type, int value,
                          PatternRewriter &rewriter) {
-
-  auto elTy = getElementTypeOrSelf(type);
-  auto constantAttr = rewriter.getIntegerAttr(elTy, value);
-
-  if (auto vecTy = llvm::dyn_cast<VectorType>(type))
+  auto attr = rewriter.getIntegerAttr(getElementTypeOrSelf(type), value);
+  if (auto shapedTy = dyn_cast<ShapedType>(type)) {
     return rewriter.create<arith::ConstantOp>(
-        loc, vecTy, DenseElementsAttr::get(vecTy, constantAttr));
+        loc, DenseElementsAttr::get(shapedTy, attr));
+  }
 
-  return rewriter.create<arith::ConstantOp>(loc, constantAttr);
+  return rewriter.create<arith::ConstantOp>(loc, attr);
 }
 
 namespace {
@@ -187,6 +186,122 @@
   }
 };
 
+struct BFloat16ExtFOpConverter : public OpRewritePattern<arith::ExtFOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(arith::ExtFOp op,
+                                PatternRewriter &rewriter) const final {
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+    auto operand = op.getOperand();
+    Type operandTy = operand.getType();
+    Type resultTy = op.getType();
+    Type operandETy = getElementTypeOrSelf(operandTy);
+    Type resultETy = getElementTypeOrSelf(resultTy);
+
+    if (!operandETy.isBF16() || !resultETy.isF32()) {
+      return rewriter.notifyMatchFailure(op, "not a ext of bf16 to f32.");
+    }
+
+    Type i16Ty = b.getI16Type();
+    Type i32Ty = b.getI32Type();
+    if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
+      i16Ty = shapedTy.clone(i16Ty);
+      i32Ty = shapedTy.clone(i32Ty);
+    }
+
+    Value bitcast = b.create<arith::BitcastOp>(i16Ty, operand);
+    Value exti = b.create<arith::ExtUIOp>(i32Ty, bitcast);
+
+    Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
+    Value shl = b.create<arith::ShLIOp>(exti, c16);
+    Value result = b.create<arith::BitcastOp>(resultTy, shl);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
+struct BFloat16TruncFOpConverter : public OpRewritePattern<arith::TruncFOp> {
+  using OpRewritePattern::OpRewritePattern;
+  LogicalResult matchAndRewrite(arith::TruncFOp op,
+                                PatternRewriter &rewriter) const final {
+    ImplicitLocOpBuilder b(op.getLoc(), rewriter);
+    auto operand = op.getOperand();
+    Type operandTy = operand.getType();
+    Type resultTy = op.getType();
+    Type operandETy = getElementTypeOrSelf(operandTy);
+    Type resultETy = getElementTypeOrSelf(resultTy);
+
+    if (!operandETy.isF32() || !resultETy.isBF16()) {
+      return rewriter.notifyMatchFailure(op, "not a trunc of f32 to bf16.");
+    }
+
+    Type i1Ty = b.getI1Type();
+    Type i16Ty = b.getI16Type();
+    Type i32Ty = b.getI32Type();
+    Type f32Ty = b.getF32Type();
+    if (auto shapedTy = dyn_cast<ShapedType>(operandTy)) {
+      i1Ty = shapedTy.clone(i1Ty);
+      i16Ty = shapedTy.clone(i16Ty);
+      i32Ty = shapedTy.clone(i32Ty);
+      f32Ty = shapedTy.clone(f32Ty);
+    }
+
+    Value bitcast = b.create<arith::BitcastOp>(i32Ty, operand);
+
+    Value c23 = createConst(op.getLoc(), i32Ty, 23, rewriter);
+    Value c31 = createConst(op.getLoc(), i32Ty, 31, rewriter);
+    Value c23Mask = createConst(op.getLoc(), i32Ty, (1 << 23) - 1, rewriter);
+    Value expMask =
+        createConst(op.getLoc(), i32Ty, ((1 << 8) - 1) << 23, rewriter);
+    Value expMax =
+        createConst(op.getLoc(), i32Ty, ((1 << 8) - 2) << 23, rewriter);
+
+    // Grab the sign bit.
+    Value sign = b.create<arith::ShRUIOp>(bitcast, c31);
+
+    // Our mantissa rounding value depends on the sign bit and the last
+    // truncated bit.
+    Value cManRound = createConst(op.getLoc(), i32Ty, (1 << 15), rewriter);
+    cManRound = b.create<arith::SubIOp>(cManRound, sign);
+
+    // Grab out the mantissa and directly apply rounding.
+    Value man = b.create<arith::AndIOp>(bitcast, c23Mask);
+    Value manRound = b.create<arith::AddIOp>(man, cManRound);
+
+    // Grab the overflow bit and shift right if we overflow.
+    Value roundBit = b.create<arith::ShRUIOp>(manRound, c23);
+    Value manNew = b.create<arith::ShRUIOp>(manRound, roundBit);
+
+    // Grab the exponent and round using the mantissa's carry bit.
+    Value exp = b.create<arith::AndIOp>(bitcast, expMask);
+    Value expCarry = b.create<arith::AddIOp>(exp, manRound);
+    expCarry = b.create<arith::AndIOp>(expCarry, expMask);
+
+    // If the exponent is saturated, we keep the max value.
+    Value expCmp =
+        b.create<arith::CmpIOp>(arith::CmpIPredicate::uge, exp, expMax);
+    exp = b.create<arith::SelectOp>(expCmp, exp, expCarry);
+
+    // If the exponent is max and we rolled over, keep the old mantissa.
+    Value roundBitBool = b.create<arith::TruncIOp>(i1Ty, roundBit);
+    Value keepOldMan = b.create<arith::AndIOp>(expCmp, roundBitBool);
+    man = b.create<arith::SelectOp>(keepOldMan, man, manNew);
+
+    // Assemble the now rounded f32 value (as an i32).
+    Value rounded = b.create<arith::ShLIOp>(sign, c31);
+    rounded = b.create<arith::OrIOp>(rounded, exp);
+    rounded = b.create<arith::OrIOp>(rounded, man);
+
+    Value c16 = createConst(op.getLoc(), i32Ty, 16, rewriter);
+    Value shr = b.create<arith::ShRUIOp>(rounded, c16);
+    Value trunc = b.create<arith::TruncIOp>(i16Ty, shr);
+    Value result = b.create<arith::BitcastOp>(resultTy, trunc);
+
+    rewriter.replaceOp(op, result);
+    return success();
+  }
+};
+
 struct ArithExpandOpsPass
     : public arith::impl::ArithExpandOpsBase<ArithExpandOpsPass> {
   void runOnOperation() override {
@@ -204,6 +319,24 @@
       arith::MaxFOp, arith::MinFOp
     >();
 
+    if (includeBf16) {
+      arith::populateExpandBFloat16Patterns(patterns);
+      target.addDynamicallyLegalOp<arith::ExtFOp>(
+          [](arith::ExtFOp op) {
+            Type inETy = getElementTypeOrSelf(op.getOperand().getType());
+            Type outETy = getElementTypeOrSelf(op.getType());
+            return !(inETy.isBF16() && outETy.isF32());
+          });
+
+      target.addDynamicallyLegalOp<arith::TruncFOp>(
+          [](arith::TruncFOp op) {
+            Type inETy = getElementTypeOrSelf(op.getOperand().getType());
+            Type outETy = getElementTypeOrSelf(op.getType());
+            return !(inETy.isF32() && outETy.isBF16());
+          });
+    }
+
     // clang-format on
     if (failed(applyPartialConversion(getOperation(), target,
                                       std::move(patterns))))
@@ -220,6 +353,11 @@
       patterns.getContext());
 }
 
+void mlir::arith::populateExpandBFloat16Patterns(RewritePatternSet &patterns) {
+  patterns.add<BFloat16ExtFOpConverter, BFloat16TruncFOpConverter>(
+      patterns.getContext());
+}
+
 void mlir::arith::populateArithExpandOpsPatterns(RewritePatternSet &patterns) {
   populateCeilFloorDivExpandOpsPatterns(patterns);
   // clang-format off
diff --git a/mlir/test/Dialect/Arith/expand-ops.mlir b/mlir/test/Dialect/Arith/expand-ops.mlir
--- a/mlir/test/Dialect/Arith/expand-ops.mlir
+++ b/mlir/test/Dialect/Arith/expand-ops.mlir
@@ -1,4 +1,4 @@
-// RUN: mlir-opt %s -arith-expand -split-input-file | FileCheck %s
+// RUN: mlir-opt %s -arith-expand="include-bf16=true" -split-input-file | FileCheck %s
 
 // Test ceil divide with signed integer
 // CHECK-LABEL: func @ceildivi
@@ -215,3 +215,50 @@
 // CHECK-NEXT: %[[IS_NAN:.*]] = arith.cmpf uno, %[[RHS]], %[[RHS]] : f32
 // CHECK-NEXT: %[[RESULT:.*]] = arith.select %[[IS_NAN]], %[[RHS]], %[[SELECT]] : f32
 // CHECK-NEXT: return %[[RESULT]] : f32
+
+// -----
+
+func.func @extf_bf16(%arg0 : bf16) -> f32 {
+  %0 = arith.extf %arg0 : bf16 to f32
+  return %0 : f32
+}
+
+// CHECK-LABEL: @extf_bf16
+// CHECK-SAME: %[[ARG0:.+]]: bf16
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %[[ARG0]] : bf16 to i16
+// CHECK-DAG: %[[EXT:.+]] = arith.extui %[[BITCAST]] : i16 to i32
+// CHECK-DAG: %[[C16:.+]] = arith.constant 16
+// CHECK-DAG: %[[SHLI:.+]] = arith.shli %[[EXT]], %[[C16]]
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %[[SHLI]] : i32 to f32
+// CHECK: return %[[BITCAST]]
+
+// -----
+
+func.func @extf_vector_bf16(%arg0 : vector<4xbf16>) -> vector<4xf32> {
+  %0 = arith.extf %arg0 : vector<4xbf16> to vector<4xf32>
+  return %0 : vector<4xf32>
+}
+
+// CHECK-LABEL: @extf_vector_bf16
+// CHECK-SAME: %[[ARG0:.+]]: vector<4xbf16>
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %[[ARG0]] : vector<4xbf16> to vector<4xi16>
+// CHECK-DAG: %[[EXT:.+]] = arith.extui %[[BITCAST]] : vector<4xi16> to vector<4xi32>
+// CHECK-DAG: %[[C16:.+]] = arith.constant dense<16>
+// CHECK-DAG: %[[SHLI:.+]] = arith.shli %[[EXT]], %[[C16]]
+// CHECK-DAG: %[[BITCAST:.+]] = arith.bitcast %[[SHLI]] : vector<4xi32> to vector<4xf32>
+// CHECK: return %[[BITCAST]]
+
+// -----
+
+func.func @truncf_f32(%arg0 : f32) -> bf16 {
+  %0 = arith.truncf %arg0 : f32 to bf16
+  return %0 : bf16
+}
+
+
+// -----
+
+func.func @truncf_vector_f32(%arg0 : vector<4xf32>) -> vector<4xbf16> {
+  %0 = arith.truncf %arg0 : vector<4xf32> to vector<4xbf16>
+  return %0 : vector<4xbf16>
+}
diff --git a/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
new file mode 100644
--- /dev/null
+++ b/mlir/test/mlir-cpu-runner/expand-arith-ops.mlir
@@ -0,0 +1,62 @@
+// RUN: mlir-opt %s -pass-pipeline="builtin.module(func.func(arith-expand{include-bf16=true},convert-arith-to-llvm),convert-vector-to-llvm,convert-func-to-llvm,reconcile-unrealized-casts)" \
+// RUN:   | mlir-cpu-runner \
+// RUN:     -e main -entry-point-result=void -O0 \
+// RUN:     -shared-libs=%mlir_c_runner_utils \
+// RUN:     -shared-libs=%mlir_runner_utils \
+// RUN:   | FileCheck %s
+
+func.func @trunc_bf16(%a : f32) {
+  %b = arith.truncf %a : f32 to bf16
+  %c = arith.extf %b : bf16 to f32
+  vector.print %c : f32
+  return
+}
+
+func.func @main() {
+  // CHECK: 1.00781
+  %roundOneI = arith.constant 0x3f808000 : i32
+  %roundOneF = arith.bitcast %roundOneI : i32 to f32
+  call @trunc_bf16(%roundOneF): (f32) -> ()
+
+  // CHECK-NEXT: -1
+  %noRoundNegOneI = arith.constant 0xbf808000 : i32
+  %noRoundNegOneF = arith.bitcast %noRoundNegOneI : i32 to f32
+  call @trunc_bf16(%noRoundNegOneF): (f32) -> ()
+
+  // CHECK-NEXT: -1.00781
+  %roundNegOneI = arith.constant 0xbf808001 : i32
+  %roundNegOneF = arith.bitcast %roundNegOneI : i32 to f32
+  call @trunc_bf16(%roundNegOneF): (f32) -> ()
+
+  // CHECK-NEXT: inf
+  %infi = arith.constant 0x7f800000 : i32
+  %inff = arith.bitcast %infi : i32 to f32
+  call @trunc_bf16(%inff): (f32) -> ()
+
+  // CHECK-NEXT: -inf
+  %neginfi = arith.constant 0xff800000 : i32
+  %neginff = arith.bitcast %neginfi : i32 to f32
+  call @trunc_bf16(%neginff): (f32) -> ()
+
+  // CHECK-NEXT: 3.38953e+38
+  %bigi = arith.constant 0x7f7fffff : i32
+  %bigf = arith.bitcast %bigi : i32 to f32
+  call @trunc_bf16(%bigf): (f32) -> ()
+
+  // CHECK-NEXT: -3.38953e+38
+  %negbigi = arith.constant 0xff7fffff : i32
+  %negbigf = arith.bitcast %negbigi : i32 to f32
+  call @trunc_bf16(%negbigf): (f32) -> ()
+
+  // CHECK-NEXT: 1.625
+  %exprolli = arith.constant 0x3fcfffff : i32
+  %exprollf = arith.bitcast %exprolli : i32 to f32
+  call @trunc_bf16(%exprollf): (f32) -> ()
+
+  // CHECK-NEXT: -1.625
+  %exprollnegi = arith.constant 0xbfcfffff : i32
+  %exprollnegf = arith.bitcast %exprollnegi : i32 to f32
+  call @trunc_bf16(%exprollnegf): (f32) -> ()
+
+  return
+}