diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -573,6 +573,8 @@
 BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
+BUILTIN(__builtin_matrix_insert,  "v.", "nt")
+
 // "Overloaded" Atomic operator builtins.  These are overloaded to support data
 // types of i8, i16, i32, i64, and i128.  The front-end sees calls to the
 // non-suffixed version of these (which has a bogus type) and transforms them to
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10243,6 +10243,15 @@
 def err_builtin_matrix_disabled: Error<
   "Builtin matrix support is disabled. Pass -fenable-matrix to enable it.">;
 
+def err_builtin_matrix_arg: Error<
+  "%select{First|Second}0 argument must be a matrix">;
+
+def err_builtin_matrix_scalar_int_arg: Error<
+  "%select{Row|Column|Offset|Stride}0 argument must be %select{an unsigned integer|a constant unsigned integer expression}1">;
+
+def err_builtin_matrix_implicit_cast_error: Error<
+  "Implicit cast from %0 to %1 failed">;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11562,6 +11562,11 @@
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+
+  // Matrix builtin handling: type-checks a __builtin_matrix_insert call.
+  ExprResult SemaBuiltinMatrixInsertOverload(CallExpr *TheCall,
+                                             ExprResult CallResult);
+
 public:
   enum FormatStringType {
     FST_Scanf,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -43,6 +43,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
@@ -1600,6 +1601,10 @@
   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
 }
 
+static const clang::MatrixType *getMatrixTy(QualType Ty) {
+  return cast<clang::MatrixType>(Ty.getCanonicalType()); // must be a matrix
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                                         const CallExpr *E,
                                         ReturnValueSlot ReturnValue) {
@@ -2336,6 +2341,22 @@
     V = Builder.CreateFCmpUNO(V, V, "cmp");
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
+  case Builtin::BI__builtin_matrix_insert: {
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    const MatrixType *MatrixTy = getMatrixTy(E->getArg(0)->getType());
+    Value *RowValue = EmitScalarExpr(E->getArg(1));
+    Value *ColValue = EmitScalarExpr(E->getArg(2));
+    Value *ValValue = EmitScalarExpr(E->getArg(3));
+    // Sema is expected to have converted the inserted value to the matrix
+    // element type already, so only verify that here.
+    assert(ValValue->getType() ==
+               cast<SequentialType>(MatValue->getType())->getElementType() &&
+           "Inserted type must match matrix data type");
+
+    return RValue::get(MB.CreateMatrixInsert(MatValue, ValValue, RowValue,
+                                             ColValue, MatrixTy->getNumRows()));
+  }
 
   case Builtin::BIfinite:
   case Builtin::BI__finite:
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1522,6 +1522,19 @@
     if (SemaBuiltinOSLogFormat(TheCall))
       return ExprError();
     break;
+
+  case Builtin::BI__builtin_matrix_insert:
+    if (!getLangOpts().EnableMatrix) {
+      Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
+      return ExprError();
+    }
+
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_matrix_insert:
+      return SemaBuiltinMatrixInsertOverload(TheCall, TheCallResult);
+    default:
+      llvm_unreachable("All matrix builtins should be handled here!");
+    }
   }
 
   // Since the target specific builtins for each arch overlap, only check those
@@ -14955,3 +14968,97 @@
       rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
                      _2, _3, _4));
 }
+
+/// Type-check a call to __builtin_matrix_insert and set its result type.
+///
+/// The builtin is declared as taking everything and returning nothing, so all
+/// four arguments are validated here. In order, the call must pass a matrix,
+/// an unsigned integer row, an unsigned integer column and the value to
+/// insert, which must be implicitly convertible to the matrix element type.
+///
+/// \param TheCall The call to check; its arguments, callee and type are
+/// updated in place.
+/// \param CallResult The result to hand back once the call has been checked.
+/// \returns \p CallResult on success, ExprError() after emitting diagnostics
+/// otherwise.
+ExprResult Sema::SemaBuiltinMatrixInsertOverload(CallExpr *TheCall,
+                                                 ExprResult CallResult) {
+  // This builtin takes exactly four arguments.
+  if (checkArgCount(*this, TheCall, 4))
+    return ExprError();
+
+  Expr *Callee = TheCall->getCallee();
+  DeclRefExpr *DRE = cast<DeclRefExpr>(Callee->IgnoreParenCasts());
+  FunctionDecl *FDecl = cast<FunctionDecl>(DRE->getDecl());
+
+  // Convert the matrix and both indices to r-values. Besides loading from
+  // l-values this drops top-level qualifiers, so the type of the call (taken
+  // from the matrix argument below) is never const or volatile qualified.
+  for (unsigned I = 0; I != 3; ++I) {
+    ExprResult Converted = DefaultLvalueConversion(TheCall->getArg(I));
+    if (Converted.isInvalid())
+      return ExprError();
+    TheCall->setArg(I, Converted.get());
+  }
+
+  Expr *MatArg = TheCall->getArg(0);
+  Expr *RowArg = TheCall->getArg(1);
+  Expr *ColArg = TheCall->getArg(2);
+  Expr *ValArg = TheCall->getArg(3);
+
+  // Report every malformed argument before giving up, rather than stopping at
+  // the first one.
+  bool ArgError = false;
+  if (!MatArg->getType()->isMatrixType()) {
+    Diag(MatArg->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+    ArgError = true;
+  }
+  if (!RowArg->getType()->isUnsignedIntegerType()) {
+    Diag(RowArg->getBeginLoc(), diag::err_builtin_matrix_scalar_int_arg)
+        << 0 << 0;
+    ArgError = true;
+  }
+  // The column check has to look at the column argument's own type; checking
+  // the row type here would reject valid columns next to an invalid row.
+  if (!ColArg->getType()->isUnsignedIntegerType()) {
+    Diag(ColArg->getBeginLoc(), diag::err_builtin_matrix_scalar_int_arg)
+        << 1 << 0;
+    ArgError = true;
+  }
+  if (ArgError)
+    return ExprError();
+
+  // Convert the inserted value to the matrix element type using the regular
+  // argument-passing rules. This picks the cast kind that matches the actual
+  // source and destination types (integral, floating, ...) and diagnoses
+  // values that cannot be converted.
+  QualType MatrixTy = MatArg->getType();
+  QualType EltTy =
+      cast<MatrixType>(MatrixTy.getCanonicalType())->getElementType();
+  ExprResult ConvertedVal = PerformCopyInitialization(
+      InitializedEntity::InitializeParameter(Context, EltTy,
+                                             /*Consumed=*/false),
+      SourceLocation(), ValArg);
+  if (ConvertedVal.isInvalid())
+    return ExprError();
+  TheCall->setArg(3, ConvertedVal.get());
+
+  // Create a new DeclRefExpr to refer to the builtin.
+  DeclRefExpr *NewDRE = DeclRefExpr::Create(
+      Context, DRE->getQualifierLoc(), SourceLocation(), FDecl,
+      /*enclosing*/ false, DRE->getLocation(), Context.BuiltinFnTy,
+      DRE->getValueKind(), nullptr, nullptr, DRE->isNonOdrUse());
+
+  // Set the callee in the CallExpr.
+  // FIXME: This loses syntactic information.
+  QualType CalleePtrTy = Context.getPointerType(FDecl->getType());
+  ExprResult PromotedCall =
+      ImpCastExprToType(NewDRE, CalleePtrTy, CK_BuiltinFnToFnPtr);
+  TheCall->setCallee(PromotedCall.get());
+
+  // The call yields the matrix with the element inserted, so it has the type
+  // of the (converted) matrix argument.
+  TheCall->setType(MatrixTy);
+
+  return CallResult;
+}
diff --git a/clang/test/CodeGen/builtin-matrix.c b/clang/test/CodeGen/builtin-matrix.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/builtin-matrix.c
@@ -0,0 +1,126 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+
+// Check that we can use __builtin_matrix_insert on matrices with different
+// floating point element types and dimensions.
+void insert_fp(dx5x5_t a, double d, fx2x3_t b, float e) {
+  // CHECK-LABEL: @insert_fp(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %d.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    store double %d, double* %d.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %1, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %3 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %4 = insertelement <25 x double> %2, double %3, i32 5
+  // CHECK-NEXT:    store <25 x double> %4, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %5 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %6 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %7 = insertelement <6 x float> %5, float %6, i32 1
+  // CHECK-NEXT:    store <6 x float> %7, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %8 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %9 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %10 = insertelement <25 x double> %8, double %9, i32 1
+  // CHECK-NEXT:    store <25 x double> %10, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %11 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %12 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %13 = insertelement <6 x float> %11, float %12, i32 3
+  // CHECK-NEXT:    store <6 x float> %13, <6 x float>* %1, align 4
+  // CHECK-NEXT:   ret void
+
+  a = __builtin_matrix_insert(a, 0u, 1u, d);
+  b = __builtin_matrix_insert(b, 1u, 0u, e);
+  a = __builtin_matrix_insert(a, 1u, 0u, d);
+  b = __builtin_matrix_insert(b, 1u, 1u, e);
+}
+
+// Check that we can use __builtin_matrix_insert on integer matrices.
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+void insert_int(ix9x3_t a, int i) {
+  // CHECK-LABEL: @insert_int(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:   %a.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:   %i.addr = alloca i32, align 4
+  // CHECK-NEXT:   %0 = bitcast [27 x i32]* %a.addr to <27 x i32>*
+  // CHECK-NEXT:   store <27 x i32> %a, <27 x i32>* %0, align 4
+  // CHECK-NEXT:   store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:   %1 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:   %2 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:   %3 = insertelement <27 x i32> %1, i32 %2, i32 13
+  // CHECK-NEXT:   store <27 x i32> %3, <27 x i32>* %0, align 4
+  // CHECK-NEXT:   ret void
+
+  a = __builtin_matrix_insert(a, 4u, 1u, i);
+}
+
+// Check that we can use __builtin_matrix_insert on FP and integer
+// matrices in the same function.
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+void insert_int_fp(ix9x3_t *a, int i, fx2x3_t b, float e) {
+  // CHECK-LABEL: @insert_int_fp(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32]*, align 8
+  // CHECK-NEXT:    %i.addr = alloca i32, align 4
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    store [27 x i32]* %a, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:    %0 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %0, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %1 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %1 to <27 x i32>*
+  // CHECK-NEXT:    %3 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %4 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:    %5 = insertelement <27 x i32> %3, i32 %4, i32 13
+  // CHECK-NEXT:    %6 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %7 = bitcast [27 x i32]* %6 to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %5, <27 x i32>* %7, align 4
+  // CHECK-NEXT:    %8 = load <6 x float>, <6 x float>* %0, align 4
+  // CHECK-NEXT:    %9 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %10 = insertelement <6 x float> %8, float %9, i32 3
+  // CHECK-NEXT:    store <6 x float> %10, <6 x float>* %0, align 4
+  // CHECK-NEXT: ret void
+
+  *a = __builtin_matrix_insert(*a, 4u, 1u, i);
+  b = __builtin_matrix_insert(b, 1u, 1u, e);
+}
+
+// Check that we can use overloaded versions of __builtin_matrix_insert on
+// matrixes with matching dimensions, but different element types.
+typedef double dx3x3_t __attribute__((matrix_type(3, 3)));
+typedef float fx3x3_t __attribute__((matrix_type(3, 3)));
+void insert_matching_dimensions(dx3x3_t a, double i, fx3x3_t b, float e) {
+  // CHECK-LABEL: @insert_matching_dimensions(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [9 x double], align 8
+  // CHECK-NEXT:    %i.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    %0 = bitcast [9 x double]* %a.addr to <9 x double>*
+  // CHECK-NEXT:    store <9 x double> %a, <9 x double>* %0, align 8
+  // CHECK-NEXT:    store double %i, double* %i.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %2 = load <9 x double>, <9 x double>* %0, align 8
+  // CHECK-NEXT:    %3 = load double, double* %i.addr, align 8
+  // CHECK-NEXT:    %4 = insertelement <9 x double> %2, double %3, i32 5
+  // CHECK-NEXT:    store <9 x double> %4, <9 x double>* %0, align 8
+  // CHECK-NEXT:    %5 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %6 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %7 = insertelement <9 x float> %5, float %6, i32 7
+  // CHECK-NEXT:    store <9 x float> %7, <9 x float>* %1, align 4
+  // CHECK-NEXT:   ret void
+
+  a = __builtin_matrix_insert(a, 2u, 1u, i);
+  b = __builtin_matrix_insert(b, 1u, 2u, e);
+}
diff --git a/clang/test/CodeGenCXX/builtin-matrix.cpp b/clang/test/CodeGenCXX/builtin-matrix.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/builtin-matrix.cpp
@@ -0,0 +1,150 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+using fx2x3_t = float __attribute__((matrix_type(2, 3)));
+
+void insert_fp(dx5x5_t *a, double d, fx2x3_t *b, float e) {
+  *a = __builtin_matrix_insert(*a, 0u, 1u, d);
+  *b = __builtin_matrix_insert(*b, 1u, 0u, e);
+
+  // CHECK-LABEL: @_Z9insert_fpPDm5_5_ddPDm2_3_ff(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT:    %d.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    store double %d, double* %d.addr, align 8
+  // CHECK-NEXT:    store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %0 to <25 x double>*
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %3 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %4 = insertelement <25 x double> %2, double %3, i32 5
+  // CHECK-NEXT:    %5 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [25 x double]* %5 to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %4, <25 x double>* %6, align 8
+  // CHECK-NEXT:    %7 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %8 = bitcast [6 x float]* %7 to <6 x float>*
+  // CHECK-NEXT:    %9 = load <6 x float>, <6 x float>* %8, align 4
+  // CHECK-NEXT:    %10 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %11 = insertelement <6 x float> %9, float %10, i32 1
+  // CHECK-NEXT:    %12 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %13 = bitcast [6 x float]* %12 to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %11, <6 x float>* %13, align 4
+  // CHECK-NEXT:   ret void
+}
+
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+
+void insert_int(ix9x3_t *a, int i) {
+  *a = __builtin_matrix_insert(*a, 4u, 1u, i);
+
+  // CHECK-LABEL: @_Z10insert_intPDm9_3_ii(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32]*, align 8
+  // CHECK-NEXT:    %i.addr = alloca i32, align 4
+  // CHECK-NEXT:    store [27 x i32]* %a, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:    %0 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [27 x i32]* %0 to <27 x i32>*
+  // CHECK-NEXT:    %2 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %3 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:    %4 = insertelement <27 x i32> %2, i32 %3, i32 13
+  // CHECK-NEXT:    %5 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [27 x i32]* %5 to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %4, <27 x i32>* %6, align 4
+  // CHECK-NEXT:   ret void
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+void insert(MyMatrix<EltTy, Rows, Columns> &Mat, EltTy e) {
+  Mat.value = __builtin_matrix_insert(Mat.value, 1u, 0u, e);
+}
+
+void test_template(unsigned *Ptr1, unsigned E1, float *Ptr2, float E2) {
+
+  // CHECK-LABEL: define void @_Z13test_templatePjjPff(i32* %Ptr1, i32 %E1, float* %Ptr2, float %E2)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Ptr1.addr = alloca i32*, align 8
+  // CHECK-NEXT:    %E1.addr = alloca i32, align 4
+  // CHECK-NEXT:    %Ptr2.addr = alloca float*, align 8
+  // CHECK-NEXT:    %E2.addr = alloca float, align 4
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix, align 4
+  // CHECK-NEXT:    %Mat2 = alloca %struct.MyMatrix.0, align 4
+  // CHECK-NEXT:    store i32* %Ptr1, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    store i32 %E1, i32* %E1.addr, align 4
+  // CHECK-NEXT:    store float* %Ptr2, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    store float %E2, float* %E2.addr, align 4
+  // CHECK-NEXT:    %0 = load i32*, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast i32* %0 to [4 x i32]*
+  // CHECK-NEXT:    %2 = bitcast [4 x i32]* %1 to <4 x i32>*
+  // CHECK-NEXT:    %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    store <4 x i32> %3, <4 x i32>* %4, align 4
+  // CHECK-NEXT:    %5 = load i32, i32* %E1.addr, align 4
+  // CHECK-NEXT:    call void @_Z6insertIjLj2ELj2EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix* dereferenceable(16) %Mat1, i32 %5)
+  // CHECK-NEXT:    %6 = load float*, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    %7 = bitcast float* %6 to [24 x float]*
+  // CHECK-NEXT:    %8 = bitcast [24 x float]* %7 to <24 x float>*
+  // CHECK-NEXT:    %9 = load <24 x float>, <24 x float>* %8, align 4
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix.0, %struct.MyMatrix.0* %Mat2, i32 0, i32 0
+  // CHECK-NEXT:    %10 = bitcast [24 x float]* %value1 to <24 x float>*
+  // CHECK-NEXT:    store <24 x float> %9, <24 x float>* %10, align 4
+  // CHECK-NEXT:    %11 = load float, float* %E2.addr, align 4
+  // CHECK-NEXT:    call void @_Z6insertIfLj3ELj8EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix.0* dereferenceable(96) %Mat2, float %11)
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr void @_Z6insertIjLj2ELj2EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix* dereferenceable(16) %Mat, i32 %e)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix*, align 8
+  // CHECK-NEXT:    %e.addr = alloca i32, align 4
+  // CHECK-NEXT:    store %struct.MyMatrix* %Mat, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    store i32 %e, i32* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix*, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    %2 = load <4 x i32>, <4 x i32>* %1, align 4
+  // CHECK-NEXT:    %3 = load i32, i32* %e.addr, align 4
+  // CHECK-NEXT:    %4 = insertelement <4 x i32> %2, i32 %3, i32 1
+  // CHECK-NEXT:    %5 = load %struct.MyMatrix*, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %5, i32 0, i32 0
+  // CHECK-NEXT:    %6 = bitcast [4 x i32]* %value1 to <4 x i32>*
+  // CHECK-NEXT:    store <4 x i32> %4, <4 x i32>* %6, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr void @_Z6insertIfLj3ELj8EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix.0* dereferenceable(96) %Mat, float %e)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix.0*, align 8
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    store %struct.MyMatrix.0* %Mat, %struct.MyMatrix.0** %Mat.addr, align 8
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix.0*, %struct.MyMatrix.0** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.0, %struct.MyMatrix.0* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [24 x float]* %value to <24 x float>*
+  // CHECK-NEXT:    %2 = load <24 x float>, <24 x float>* %1, align 4
+  // CHECK-NEXT:    %3 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %4 = insertelement <24 x float> %2, float %3, i32 1
+  // CHECK-NEXT:    %5 = load %struct.MyMatrix.0*, %struct.MyMatrix.0** %Mat.addr, align 8
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix.0, %struct.MyMatrix.0* %5, i32 0, i32 0
+  // CHECK-NEXT:    %6 = bitcast [24 x float]* %value1 to <24 x float>*
+  // CHECK-NEXT:    store <24 x float> %4, <24 x float>* %6, align 4
+  // CHECK-NEXT:    ret void
+
+  MyMatrix<unsigned, 2, 2> Mat1;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  insert(Mat1, E1);
+
+  MyMatrix<float, 3, 8> Mat2;
+  Mat2.value = *((decltype(Mat2)::matrix_t *)Ptr2);
+  insert(Mat2, E2);
+}
diff --git a/clang/test/SemaCXX/builtin-matrix.cpp b/clang/test/SemaCXX/builtin-matrix.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCXX/builtin-matrix.cpp
@@ -0,0 +1,25 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+typedef float sx10x10_t __attribute__((matrix_type(10, 10)));
+sx10x10_t a;
+
+struct Foo {
+  char *s;
+};
+
+void insert(sx10x10_t *a, float f) {
+  *a = __builtin_matrix_insert(
+      10, // expected-error {{First argument must be a matrix}}
+      a,  // expected-error {{Row argument must be an unsigned integer}}
+      a,  // expected-error {{Column argument must be an unsigned integer}}
+      10);
+
+  int x = __builtin_matrix_insert(*a, 3u, 5u, 10.0); // expected-error {{cannot initialize a variable of type 'int' with an rvalue of type 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10))) ')}}
+
+  // TODO: Should error here (index out of range).
+  *a = __builtin_matrix_insert(*a, -1u, 5u, 10.0);
+
+  // Only the float row argument is invalid. The column argument (5u) is a valid unsigned integer, so a column diagnostic is not required here.
+  *a = __builtin_matrix_insert(*a, f,     // expected-error {{Row argument must be an unsigned integer}}
+                               5u, 10.0); // expected-error 0-1 {{Column argument must be an unsigned integer}}
+}