diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -574,6 +574,7 @@
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
 BUILTIN(__builtin_matrix_insert,  "v.", "nt")
+BUILTIN(__builtin_matrix_extract, "v.", "nt")
 
 // "Overloaded" Atomic operator builtins.  These are overloaded to support data
 // types of i8, i16, i32, i64, and i128.  The front-end sees calls to the
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11614,6 +11614,8 @@
   // Matrix Builtin intrinsic handling.
   ExprResult SemaBuiltinMatrixInsertOverload(CallExpr *TheCall,
                                              ExprResult CallResult);
+  ExprResult SemaBuiltinExtractMatrixOverload(CallExpr *TheCall,
+                                              ExprResult CallResult);
 
 public:
   enum FormatStringType {
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -2356,6 +2356,15 @@
     return RValue::get(MB.CreateMatrixInsert(MatValue, ValValue, RowValue,
                                              ColValue, MatrixTy->getNumRows()));
   }
+  case Builtin::BI__builtin_matrix_extract: {
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    Value *MatrixValue = EmitScalarExpr(E->getArg(0));
+    Value *RowValue = EmitScalarExpr(E->getArg(1));
+    Value *ColValue = EmitScalarExpr(E->getArg(2));
+    const MatrixType *MatrixTy = getMatrixTy(E->getArg(0)->getType());
+    Value *Result = MB.CreateExtractMatrix(MatrixValue, RowValue, ColValue, MatrixTy->getNumRows());
+    return RValue::get(Result);
+  }
 
   case Builtin::BIfinite:
   case Builtin::BI__finite:
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1614,6 +1614,7 @@
     break;
 
   case Builtin::BI__builtin_matrix_insert:
+  case Builtin::BI__builtin_matrix_extract:
     if (!getLangOpts().EnableMatrix) {
       Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
       return ExprError();
@@ -1622,6 +1623,8 @@
     switch (BuiltinID) {
     case Builtin::BI__builtin_matrix_insert:
       return SemaBuiltinMatrixInsertOverload(TheCall, TheCallResult);
+    case Builtin::BI__builtin_matrix_extract:
+      return SemaBuiltinExtractMatrixOverload(TheCall, TheCallResult);
     default:
       llvm_unreachable("All matrix builtins should be handled here!");
     }
@@ -15176,3 +15179,102 @@
 
   return CallResult;
 }
+
+/// \brief Use the call to create an overloaded matrix extraction declaration
+///
+/// SemaBuiltinExtractMatrixOverload - Handle __builtin_matrix_extract.
+/// This is declared to take (...), so everything must be checked.
+/// A correct invocation will have 3 arguments passed in.
+/// The return type is the element type of the matrix argument.
+ExprResult Sema::SemaBuiltinExtractMatrixOverload(CallExpr *TheCall,
+                                                  ExprResult CallResult) {
+
+  // This function takes three parameters
+  // 1: matrix m - the matrix being extracted from
+  // 2: int row  - row being extracted
+  // 3: int col  - column being extracted
+  //
+  // returns the element at [row, column], which is of the same type as the
+  // matrix element
+
+  // First part of this method focuses on creating the overloaded function type
+  // Second part of this method focuses on creating the declaration reference
+  // for the call
+
+  // Check that the number of arguments is correct
+  if (checkArgCount(*this, TheCall, 3))
+    return ExprError();
+
+  Expr *Callee = TheCall->getCallee();
+  DeclRefExpr *DRE = cast<DeclRefExpr>(Callee->IgnoreParenCasts());
+  FunctionDecl *FDecl = cast<FunctionDecl>(DRE->getDecl());
+
+  // Some typechecking to ensure that the parameters are correct
+  Expr *MatArg = TheCall->getArg(0);
+  Expr *RowArg = TheCall->getArg(1);
+  Expr *ColArg = TheCall->getArg(2);
+  {
+    QualType MTy = MatArg->getType();
+    QualType RTy = RowArg->getType();
+    QualType CTy = ColArg->getType();
+
+    // Diagnose every bad argument before bailing out.
+    bool ArgError = false;
+    if (!MTy->isMatrixType()) {
+      Diag(MatArg->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+      ArgError = true;
+    }
+    if (!RTy->isIntegerType()) {
+      Diag(RowArg->getBeginLoc(), diag::err_builtin_matrix_scalar_int_arg)
+          << 0 << 0;
+      ArgError = true;
+    }
+    if (!CTy->isIntegerType()) {
+      Diag(ColArg->getBeginLoc(), diag::err_builtin_matrix_scalar_int_arg)
+          << 1 << 0;
+      ArgError = true;
+    }
+    if (ArgError)
+      return ExprError();
+  }
+
+  // Create new function prototype
+
+  // Convert an l-valued matrix input to an r-value
+  if (!MatArg->isRValue()) {
+    ExprResult Res =
+        ImplicitCastExpr::Create(Context, MatArg->getType(), CK_LValueToRValue,
+                                 MatArg, nullptr, VK_RValue);
+    assert(!Res.isInvalid() && Res.get()->isRValue() &&
+           "Failed to cast Matrix arg to an R-value");
+    TheCall->setArg(0, Res.get());
+  }
+
+  MatrixType const *MTy =
+      cast<MatrixType const>(MatArg->getType().getCanonicalType());
+  QualType ReturnType = MTy->getElementType();
+
+  llvm::SmallVector<QualType, 3> ParameterTypes = {
+      MatArg->getType().getCanonicalType(), TheCall->getArg(1)->getType(),
+      TheCall->getArg(2)->getType()};
+
+  // Create a new DeclRefExpr to refer to the new decl.
+  DeclRefExpr *NewDRE = DeclRefExpr::Create(
+      Context, DRE->getQualifierLoc(), SourceLocation(), FDecl,
+      /*enclosing*/ false, DRE->getLocation(), Context.BuiltinFnTy,
+      DRE->getValueKind(), nullptr, nullptr, DRE->isNonOdrUse());
+
+  // Set the callee in the CallExpr.
+  // FIXME: This loses syntactic information.
+  QualType CalleePtrTy = Context.getPointerType(FDecl->getType());
+  ExprResult PromotedCall = ImpCastExprToType(NewDRE, CalleePtrTy,
+                                              CK_BuiltinFnToFnPtr);
+  TheCall->setCallee(PromotedCall.get());
+
+  // Change the result type of the call to the matrix element type. This
+  // is arbitrary, but the codegen for these builtins is designed to handle it
+  // gracefully.
+  TheCall->setType(ReturnType);
+
+  return CallResult;
+}
diff --git a/clang/test/CodeGen/builtin-matrix.c b/clang/test/CodeGen/builtin-matrix.c
--- a/clang/test/CodeGen/builtin-matrix.c
+++ b/clang/test/CodeGen/builtin-matrix.c
@@ -124,3 +124,34 @@
   a = __builtin_matrix_insert(a, 2u, 1u, i);
   b = __builtin_matrix_insert(b, 1u, 2u, e);
 }
+
+void extract1(dx5x5_t a, fx3x3_t b, ix9x3_t c) {
+  double v1 = __builtin_matrix_extract(a, 2, 3);
+  float v2 = __builtin_matrix_extract(b, 2, 1);
+  int v3 = __builtin_matrix_extract(c, 1, 1);
+
+  // CHECK-LABEL: @extract1(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %c.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %v1 = alloca double, align 8
+  // CHECK-NEXT:    %v2 = alloca float, align 4
+  // CHECK-NEXT:    %v3 = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %c.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %c, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %4 = extractelement <25 x double> %3, i32 17
+  // CHECK-NEXT:    store double %4, double* %v1, align 8
+  // CHECK-NEXT:    %5 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %6 = extractelement <9 x float> %5, i32 5
+  // CHECK-NEXT:    store float %6, float* %v2, align 4
+  // CHECK-NEXT:    %7 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %8 = extractelement <27 x i32> %7, i32 10
+  // CHECK-NEXT:    store i32 %8, i32* %v3, align 4
+  // CHECK-NEXT:    ret void
+}
diff --git a/clang/test/CodeGenCXX/builtin-matrix.cpp b/clang/test/CodeGenCXX/builtin-matrix.cpp
--- a/clang/test/CodeGenCXX/builtin-matrix.cpp
+++ b/clang/test/CodeGenCXX/builtin-matrix.cpp
@@ -70,9 +70,9 @@
   Mat.value = __builtin_matrix_insert(Mat.value, 1u, 0u, e);
 }
 
-void test_template(unsigned *Ptr1, unsigned E1, float *Ptr2, float E2) {
+void test_insert_template(unsigned *Ptr1, unsigned E1, float *Ptr2, float E2) {
 
-  // CHECK-LABEL: define void @_Z13test_templatePjjPff(i32* %Ptr1, i32 %E1, float* %Ptr2, float %E2)
+  // CHECK-LABEL: define void @_Z20test_insert_templatePjjPff(i32* %Ptr1, i32 %E1, float* %Ptr2, float %E2)
   // CHECK-NEXT:  entry:
   // CHECK-NEXT:    %Ptr1.addr = alloca i32*, align 8
   // CHECK-NEXT:    %E1.addr = alloca i32, align 4
@@ -148,3 +148,78 @@
   Mat2.value = *((decltype(Mat2)::matrix_t *)Ptr2);
   insert(Mat2, E2);
 }
+
+
+typedef float fx3x3_t __attribute__((matrix_type(3, 3)));
+void extract1(dx5x5_t a, fx3x3_t b, ix9x3_t c) {
+  // CHECK-LABEL: @_Z8extract1Dm5_5_dDm3_3_fDm9_3_i(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %c.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %v1 = alloca double, align 8
+  // CHECK-NEXT:    %v2 = alloca float, align 4
+  // CHECK-NEXT:    %v3 = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %c.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %c, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %4 = extractelement <25 x double> %3, i32 17
+  // CHECK-NEXT:    store double %4, double* %v1, align 8
+  // CHECK-NEXT:    %5 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %6 = extractelement <9 x float> %5, i32 5
+  // CHECK-NEXT:    store float %6, float* %v2, align 4
+  // CHECK-NEXT:    %7 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %8 = extractelement <27 x i32> %7, i32 10
+  // CHECK-NEXT:    store i32 %8, i32* %v3, align 4
+  // CHECK-NEXT:    ret void
+
+  double v1 = __builtin_matrix_extract(a, 2, 3);
+  float v2 = __builtin_matrix_extract(b, 2, 1);
+  int v3 = __builtin_matrix_extract(c, 1, 1);
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+EltTy extract(MyMatrix<EltTy, Rows, Columns> &Mat) {
+  return __builtin_matrix_extract(Mat.value, 1u, 0u);
+}
+
+void test_extract_template(unsigned *Ptr1, float *Ptr2) {
+
+  // CHECK-LABEL: define void @_Z21test_extract_templatePjPf(i32* %Ptr1, float* %Ptr2)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Ptr1.addr = alloca i32*, align 8
+  // CHECK-NEXT:    %Ptr2.addr = alloca float*, align 8
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix, align 4
+  // CHECK-NEXT:    %v1 = alloca i32, align 4
+  // CHECK-NEXT:    store i32* %Ptr1, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    store float* %Ptr2, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    %0 = load i32*, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast i32* %0 to [4 x i32]*
+  // CHECK-NEXT:    %2 = bitcast [4 x i32]* %1 to <4 x i32>*
+  // CHECK-NEXT:    %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    store <4 x i32> %3, <4 x i32>* %4, align 4
+  // CHECK-NEXT:    %call = call i32 @_Z7extractIjLj2ELj2EET_R8MyMatrixIS0_XT0_EXT1_EE(%struct.MyMatrix* dereferenceable(16) %Mat1)
+  // CHECK-NEXT:    store i32 %call, i32* %v1, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr i32 @_Z7extractIjLj2ELj2EET_R8MyMatrixIS0_XT0_EXT1_EE(%struct.MyMatrix* dereferenceable(16) %Mat)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix*, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix* %Mat, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix*, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    %2 = load <4 x i32>, <4 x i32>* %1, align 4
+  // CHECK-NEXT:    %3 = extractelement <4 x i32> %2, i32 1
+  // CHECK-NEXT:    ret i32 %3
+
+  MyMatrix<unsigned, 2, 2> Mat1;
+  Mat1.value = *((decltype(Mat1)::matrix_t*) Ptr1);
+  unsigned v1 = extract(Mat1);
+}
diff --git a/clang/test/Sema/builtin-matrix.c b/clang/test/Sema/builtin-matrix.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/builtin-matrix.c
@@ -0,0 +1,42 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx10x10_t __attribute__((matrix_type(10, 10)));
+sx10x10_t a;
+
+struct Foo {
+  char *s;
+};
+
+void insert(sx10x10_t *a, float f) {
+  *a = __builtin_matrix_insert(
+      10, // expected-error {{First argument must be a matrix}}
+      a,  // expected-error {{Row argument must be an unsigned integer}}
+      a,  // expected-error {{Column argument must be an unsigned integer}}
+      10);
+
+  int x = __builtin_matrix_insert(*a, 3u, 5u, 10.0); // expected-error {{initializing 'int' with an expression of incompatible type 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10))) ')}}
+
+  // TODO: Should error here (index out of range).
+  *a = __builtin_matrix_insert(*a, -1u, 5u, 10.0);
+
+  // FIXME: Column argument is fine!
+  *a = __builtin_matrix_insert(*a, f, // expected-error {{Row argument must be an unsigned integer}}
+                               5u, 10.0); // expected-error {{Column argument must be an unsigned integer}}
+}
+
+
+void extract(sx10x10_t *a) {
+  struct Foo v1  = __builtin_matrix_extract( // expected-error {{initializing 'struct Foo' with an expression of incompatible type 'float'}}
+      *a, 1, 1);
+
+   float v2 = __builtin_matrix_extract(
+      10,  // expected-error {{First argument must be a matrix}}
+      a,   // expected-error {{Row argument must be an unsigned integer}}
+      a);  // expected-error {{Column argument must be an unsigned integer}}
+
+   float v3 = __builtin_matrix_extract(
+      *a, 1); // expected-error {{too few arguments to function call, expected 3, have 2}}
+
+   float v4 = __builtin_matrix_extract(
+      *a, 1, 1, 1); // expected-error {{too many arguments to function call, expected 3, have 4}}
+}
diff --git a/clang/test/SemaCXX/builtin-matrix.cpp b/clang/test/SemaCXX/builtin-matrix.cpp
--- a/clang/test/SemaCXX/builtin-matrix.cpp
+++ b/clang/test/SemaCXX/builtin-matrix.cpp
@@ -7,6 +7,13 @@
   char *s;
 };
 
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
 void insert(sx10x10_t *a, float f) {
   *a = __builtin_matrix_insert(
       10, // expected-error {{First argument must be a matrix}}
@@ -23,3 +30,15 @@
   *a = __builtin_matrix_insert(*a, f,     // expected-error {{Row argument must be an unsigned integer}}
                                5u, 10.0); // expected-error {{Column argument must be an unsigned integer}}
 }
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+EltTy extract(MyMatrix<EltTy, Rows, Columns> &Mat) {
+  char *v1 = __builtin_matrix_extract(Mat.value, 1u, 0u); // expected-error {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int'}}
+  return __builtin_matrix_extract(Mat.value, 1u, 0u);
+}
+
+void test_extract_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  Mat1.value = *((decltype(Mat1)::matrix_t*) Ptr1);
+  unsigned v1 = extract(Mat1); // expected-note {{in instantiation of function template specialization 'extract<unsigned int, 2, 2>' requested here}}
+}