diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -575,6 +575,8 @@
 BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
+BUILTIN(__builtin_matrix_transpose, "v.", "nFt")
+
 // "Overloaded" Atomic operator builtins.  These are overloaded to support data
 // types of i8, i16, i32, i64, and i128.  The front-end sees calls to the
 // non-suffixed version of these (which has a bogus type) and transforms them to
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10754,6 +10754,9 @@
 def err_matrix_incomplete_idx: Error<
   "single subscript expressions are not allowed for matrix values">;
 
+def err_builtin_matrix_arg: Error<
+  "%select{first|second}0 argument must be a matrix">;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12082,6 +12082,11 @@
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+
+  // Matrix builtin handling.
+  ExprResult SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                ExprResult CallResult);
+
 public:
   enum FormatStringType {
     FST_Scanf,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
@@ -1634,6 +1635,12 @@
   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
 }
 
+/// Return the canonical ConstantMatrixType behind \p Ty. \p Ty must be a
+/// constant matrix type; cast<> asserts otherwise.
+static const ConstantMatrixType *getMatrixTy(QualType Ty) {
+  return cast<ConstantMatrixType>(Ty.getCanonicalType());
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                                         const CallExpr *E,
                                         ReturnValueSlot ReturnValue) {
@@ -2371,6 +2376,15 @@
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
+  case Builtin::BI__builtin_matrix_transpose: {
+    const ConstantMatrixType *MatrixTy = getMatrixTy(E->getArg(0)->getType());
+    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
+                                             MatrixTy->getNumColumns());
+    return RValue::get(Result);
+  }
+
   case Builtin::BIfinite:
   case Builtin::BI__finite:
   case Builtin::BIfinitef:
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1895,7 +1895,7 @@
       return ExprError();
     break;
   case Builtin::BI__builtin_frame_address:
-  case Builtin::BI__builtin_return_address:
+  case Builtin::BI__builtin_return_address: {
     if (SemaBuiltinConstantArgRange(TheCall, 0, 0, 0xFFFF))
       return ExprError();
 
@@ -1912,6 +1912,20 @@
     break;
   }
 
+  case Builtin::BI__builtin_matrix_transpose:
+    if (!getLangOpts().MatrixTypes) {
+      Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
+      return ExprError();
+    }
+
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_matrix_transpose:
+      return SemaBuiltinMatrixTransposeOverload(TheCall, TheCallResult);
+    default:
+      llvm_unreachable("All matrix builtins should be handled here!");
+    }
+  }
+
   // Since the target specific builtins for each arch overlap, only check those
   // of the arch we are compiling for.
   if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) {
@@ -14769,3 +14783,41 @@
       rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
                      _2, _3, _4));
 }
+
+/// Check a call to __builtin_matrix_transpose and compute its result type.
+///
+/// The call must have exactly one argument, of constant matrix type. On
+/// success the call's type is set to the matrix type with the argument's
+/// element type and its rows and columns swapped, and \p CallResult is
+/// returned. Otherwise a diagnostic is emitted and ExprError() is returned.
+ExprResult Sema::SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                    ExprResult CallResult) {
+  if (checkArgCount(*this, TheCall, 1))
+    return ExprError();
+
+  Expr *Arg = TheCall->getArg(0);
+  if (!Arg->getType()->isConstantMatrixType()) {
+    Diag(Arg->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+    return ExprError();
+  }
+
+  // Convert arg to RValue, if required. Use the standard lvalue conversion
+  // instead of a hand-built CK_LValueToRValue cast so the operand gets the
+  // unqualified rvalue type and the usual lvalue-to-rvalue processing.
+  ExprResult Converted = DefaultLvalueConversion(Arg);
+  if (Converted.isInvalid())
+    return ExprError();
+  Arg = Converted.get();
+  TheCall->setArg(0, Arg);
+
+  // Create returned matrix type by swapping rows and columns of the argument
+  // matrix type.
+  const auto *MType =
+      cast<ConstantMatrixType>(Arg->getType().getCanonicalType());
+  QualType ResultType = Context.getConstantMatrixType(
+      MType->getElementType(), MType->getNumColumns(), MType->getNumRows());
+
+  // Change the return type to the type of the returned matrix.
+  TheCall->setType(ResultType);
+  return CallResult;
+}
diff --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-builtins.c
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
+typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
+typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
+typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
+
+void transpose1(dx5x5_t *a, fx3x2_t *b, ix20x4_t *c) {
+  // CHECK-LABEL: define void @transpose1([25 x double]* %a, [6 x float]* %b, [80 x i32]* %c)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT:    %c.addr = alloca [80 x i32]*, align 8
+  // CHECK-NEXT:    %a_t = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b_t = alloca [6 x float], align 4
+  // CHECK-NEXT:    %c_t = alloca [80 x i32], align 4
+  // CHECK-NEXT:    store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    store [80 x i32]* %c, [80 x i32]** %c.addr, align 8
+  // CHECK-NEXT:    %0 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %0 to <25 x double>*
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %3 = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> %2, i32 5, i32 5)
+  // CHECK-NEXT:    %4 = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %3, <25 x double>* %4, align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+
+  // CHECK-NEXT:    %5 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [6 x float]* %5 to <6 x float>*
+  // CHECK-NEXT:    %7 = load <6 x float>, <6 x float>* %6, align 4
+  // CHECK-NEXT:    %8 = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %7, i32 3, i32 2)
+  // CHECK-NEXT:    %9 = bitcast [6 x float]* %b_t to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %8, <6 x float>* %9, align 4
+  fx2x3_t b_t = __builtin_matrix_transpose(*b);
+
+  // CHECK-NEXT:    %10 = load [80 x i32]*, [80 x i32]** %c.addr, align 8
+  // CHECK-NEXT:    %11 = bitcast [80 x i32]* %10 to <80 x i32>*
+  // CHECK-NEXT:    %12 = load <80 x i32>, <80 x i32>* %11, align 4
+  // CHECK-NEXT:    %13 = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> %12, i32 20, i32 4)
+  // CHECK-NEXT:    %14 = bitcast [80 x i32]* %c_t to <80 x i32>*
+  // CHECK-NEXT:    store <80 x i32> %13, <80 x i32>* %14, align 4
+  // CHECK-NEXT:    ret void
+  ix4x20_t c_t = __builtin_matrix_transpose(*c);
+}
+
+// CHECK: declare <25 x double> @llvm.matrix.transpose.v25f64(<25 x double>, i32 immarg, i32 immarg)
+// CHECK: declare <6 x float> @llvm.matrix.transpose.v6f32(<6 x float>, i32 immarg, i32 immarg)
+// CHECK: declare <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32>, i32 immarg, i32 immarg)
+
+struct Foo {
+  ux1x6_t matrix;
+};
+
+void transpose2(struct Foo F) {
+  // CHECK-LABEL: define void @transpose2(%struct.Foo* byval(%struct.Foo) align 8 %F) #0 {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Res = alloca [6 x i32], align 4
+  // CHECK-NEXT:    %matrix = getelementptr inbounds %struct.Foo, %struct.Foo* %F, i32 0, i32 0
+  // CHECK-NEXT:    %0 = bitcast [6 x i32]* %matrix to <6 x i32>*
+  // CHECK-NEXT:    %1 = load <6 x i32>, <6 x i32>* %0, align 8
+  // CHECK-NEXT:    %2 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %1, i32 1, i32 6)
+  // CHECK-NEXT:    %3 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %2, i32 6, i32 1)
+  // CHECK-NEXT:    %4 = bitcast [6 x i32]* %Res to <6 x i32>*
+  // CHECK-NEXT:    store <6 x i32> %3, <6 x i32>* %4, align 4
+  // CHECK-NEXT:    ret void
+
+  ux1x6_t Res = __builtin_matrix_transpose(__builtin_matrix_transpose(F.matrix));
+}
+
+// CHECK: declare <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32>, i32 immarg, i32 immarg)
diff --git a/clang/test/CodeGenCXX/matrix-type-builtins.cpp b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+using dx5x5_t = double __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+
+void transpose1(dx5x5_t *a, fx2x3_t *b) {
+  // CHECK-LABEL: define void @_Z10transpose1PU11matrix_typeLm5ELm5EdPU11matrix_typeLm2ELm3Ef([25 x double]* %a, [6 x float]* %b)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT:    %a_t = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b_t = alloca [6 x float], align 4
+  // CHECK-NEXT:    store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %0 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %0 to <25 x double>*
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %3 = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> %2, i32 5, i32 5)
+  // CHECK-NEXT:    %4 = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %3, <25 x double>* %4, align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+
+  // CHECK-NEXT:    %5 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [6 x float]* %5 to <6 x float>*
+  // CHECK-NEXT:    %7 = load <6 x float>, <6 x float>* %6, align 4
+  // CHECK-NEXT:    %8 = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %7, i32 2, i32 3)
+  // CHECK-NEXT:    %9 = bitcast [6 x float]* %b_t to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %8, <6 x float>* %9, align 4
+  // CHECK-NEXT:    ret void
+  auto b_t = __builtin_matrix_transpose(*b);
+}
+
+template <typename T, unsigned R, unsigned C>
+using matrix_t = T __attribute__((matrix_type(R, C)));
+
+void transpose2(matrix_t<float, 7, 4> &M1, matrix_t<unsigned, 6, 9> &M2) {
+  // CHECK-LABEL: define void @_Z10transpose2RU11matrix_typeLm7ELm4EfRU11matrix_typeLm6ELm9Ej([28 x float]* dereferenceable(112) %M1, [54 x i32]* dereferenceable(216) %M2) #0 {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %M1.addr = alloca [28 x float]*, align 8
+  // CHECK-NEXT:    %M2.addr = alloca [54 x i32]*, align 8
+  // CHECK-NEXT:    %M1_t = alloca [28 x float], align 4
+  // CHECK-NEXT:    %M2_t = alloca [54 x i32], align 4
+  // CHECK-NEXT:    store [28 x float]* %M1, [28 x float]** %M1.addr, align 8
+  // CHECK-NEXT:    store [54 x i32]* %M2, [54 x i32]** %M2.addr, align 8
+  // CHECK-NEXT:    %0 = load [28 x float]*, [28 x float]** %M1.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [28 x float]* %0 to <28 x float>*
+  // CHECK-NEXT:    %2 = load <28 x float>, <28 x float>* %1, align 4
+  // CHECK-NEXT:    %3 = call <28 x float> @llvm.matrix.transpose.v28f32(<28 x float> %2, i32 7, i32 4)
+  // CHECK-NEXT:    %4 = bitcast [28 x float]* %M1_t to <28 x float>*
+  // CHECK-NEXT:    store <28 x float> %3, <28 x float>* %4, align 4
+  matrix_t<float, 4, 7> M1_t = __builtin_matrix_transpose(M1);
+
+  // CHECK-NEXT:    %5 = load [54 x i32]*, [54 x i32]** %M2.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [54 x i32]* %5 to <54 x i32>*
+  // CHECK-NEXT:    %7 = load <54 x i32>, <54 x i32>* %6, align 4
+  // CHECK-NEXT:    %8 = call <54 x i32> @llvm.matrix.transpose.v54i32(<54 x i32> %7, i32 6, i32 9)
+  // CHECK-NEXT:    %9 = bitcast [54 x i32]* %M2_t to <54 x i32>*
+  // CHECK-NEXT:    store <54 x i32> %8, <54 x i32>* %9, align 4
+  // CHECK-NEXT:    ret void
+  auto M2_t = __builtin_matrix_transpose(M2);
+}
+
+template <typename T, unsigned R, unsigned C>
+matrix_t<T, C, R> transpose(matrix_t<T, R, C> M) {
+  return __builtin_matrix_transpose(M);
+}
+
+void test_transpose_template() {
+  // CHECK-LABEL: define void @_Z23test_transpose_templatev() #2 {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %M1 = alloca [40 x i32], align 4
+  // CHECK-NEXT:    %M1_t = alloca [40 x i32], align 4
+  // CHECK-NEXT:    %M2 = alloca [42 x double], align 8
+  // CHECK-NEXT:    %M2_t = alloca [42 x double], align 8
+  // CHECK-NEXT:    %0 = bitcast [40 x i32]* %M1 to <40 x i32>*
+  // CHECK-NEXT:    %1 = load <40 x i32>, <40 x i32>* %0, align 4
+  // CHECK-NEXT:    %call = call <40 x i32> @_Z9transposeIiLj4ELj10EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<40 x i32> %1)
+  // CHECK-NEXT:    %2 = bitcast [40 x i32]* %M1_t to <40 x i32>*
+  // CHECK-NEXT:    store <40 x i32> %call, <40 x i32>* %2, align 4
+
+  matrix_t<int, 4, 10> M1;
+  matrix_t<int, 10, 4> M1_t = transpose(M1);
+
+  // CHECK-NEXT:    %3 = bitcast [42 x double]* %M2 to <42 x double>*
+  // CHECK-NEXT:    %4 = load <42 x double>, <42 x double>* %3, align 8
+  // CHECK-NEXT:    %call1 = call <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %4)
+  // CHECK-NEXT:    %call2 = call <42 x double> @_Z9transposeIdLj6ELj7EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %call1)
+  // CHECK-NEXT:    %call3 = call <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %call2)
+  // CHECK-NEXT:    %5 = bitcast [42 x double]* %M2_t to <42 x double>*
+  // CHECK-NEXT:    store <42 x double> %call3, <42 x double>* %5, align 8
+  // CHECK-NEXT:    ret void
+  matrix_t<double, 7, 6> M2;
+  matrix_t<double, 6, 7> M2_t = transpose(transpose(transpose(M2)));
+}
+
+// CHECK-LABEL: define linkonce_odr <40 x i32> @_Z9transposeIiLj4ELj10EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<40 x i32> %M)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    %M.addr = alloca [40 x i32], align 4
+// CHECK-NEXT:    %0 = bitcast [40 x i32]* %M.addr to <40 x i32>*
+// CHECK-NEXT:    store <40 x i32> %M, <40 x i32>* %0, align 4
+// CHECK-NEXT:    %1 = load <40 x i32>, <40 x i32>* %0, align 4
+// CHECK-NEXT:    %2 = call <40 x i32> @llvm.matrix.transpose.v40i32(<40 x i32> %1, i32 4, i32 10)
+// CHECK-NEXT:    ret <40 x i32> %2
+// CHECK-NEXT:  }
+
+// CHECK-LABEL: define linkonce_odr <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %M)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    %M.addr = alloca [42 x double], align 8
+// CHECK-NEXT:    %0 = bitcast [42 x double]* %M.addr to <42 x double>*
+// CHECK-NEXT:    store <42 x double> %M, <42 x double>* %0, align 8
+// CHECK-NEXT:    %1 = load <42 x double>, <42 x double>* %0, align 8
+// CHECK-NEXT:    %2 = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> %1, i32 7, i32 6)
+// CHECK-NEXT:    ret <42 x double> %2
+
+// CHECK-LABEL: define linkonce_odr <42 x double> @_Z9transposeIdLj6ELj7EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %M)
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    %M.addr = alloca [42 x double], align 8
+// CHECK-NEXT:    %0 = bitcast [42 x double]* %M.addr to <42 x double>*
+// CHECK-NEXT:    store <42 x double> %M, <42 x double>* %0, align 8
+// CHECK-NEXT:    %1 = load <42 x double>, <42 x double>* %0, align 8
+// CHECK-NEXT:    %2 = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> %1, i32 6, i32 7)
+// CHECK-NEXT:    ret <42 x double> %2
diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/matrix-type-builtins.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+typedef int ix3x2_t __attribute__((matrix_type(3, 2)));
+
+void transpose(sx5x10_t a, ix3x2_t b, int *c, int d) {
+  a = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  b = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'ix3x2_t' (aka 'int __attribute__((matrix_type(3, 2)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  __builtin_matrix_transpose(c);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose(d);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose("test");
+  // expected-error@-1 {{first argument must be a matrix}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-builtins.cpp b/clang/test/SemaCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-builtins.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1>
+typename MyMatrix<EltTy1, R1, C1>::matrix_t transpose(MyMatrix<EltTy0, R0, C0> &A) {
+  char *v1 = __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+
+  __builtin_matrix_transpose(A);
+  // expected-error@-1 {{first argument must be a matrix}}
+  // expected-error@-2 {{first argument must be a matrix}}
+
+  return __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+}
+
+void test_transpose_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 3> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  Mat1.value = transpose<unsigned, 2, 3, unsigned, 2, 3>(Mat1);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 2, 3, unsigned int, 2, 3>' requested here}}
+
+  Mat1.value = transpose<unsigned, 3, 3, unsigned, 2, 3>(Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 3, 3, unsigned int, 2, 3>' requested here}}
+}