diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -577,6 +577,8 @@
 BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
+BUILTIN(__builtin_matrix_transpose, "v.", "nFt")
+
 // "Overloaded" Atomic operator builtins.  These are overloaded to support data
 // types of i8, i16, i32, i64, and i128.  The front-end sees calls to the
 // non-suffixed version of these (which has a bogus type) and transforms them to
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10768,6 +10768,9 @@
 def err_matrix_subscript_comma: Error<
   "comma expressions are not allowed as indices in matrix subscript expressions">;
 
+def err_builtin_matrix_arg: Error<
+  "%select{first|second}0 argument must be a matrix">;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12108,6 +12108,11 @@
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+
+  // Matrix builtin handling: validates the call and sets its result type.
+  ExprResult SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                ExprResult CallResult);
+
 public:
   enum FormatStringType {
     FST_Scanf,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
@@ -1636,6 +1637,10 @@
   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
 }
 
+static const ConstantMatrixType *getMatrixTy(QualType Ty) {
+  return cast<ConstantMatrixType>(Ty.getCanonicalType()); // Ty must be a matrix
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                                         const CallExpr *E,
                                         ReturnValueSlot ReturnValue) {
@@ -2373,6 +2378,15 @@
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
+  case Builtin::BI__builtin_matrix_transpose: {
+    // Emit the transpose through MatrixBuilder using the operand's dimensions.
+    const auto *OperandTy = getMatrixTy(E->getArg(0)->getType());
+    Value *Operand = EmitScalarExpr(E->getArg(0));
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    return RValue::get(MB.CreateMatrixTranspose(
+        Operand, OperandTy->getNumRows(), OperandTy->getNumColumns()));
+  }
+
   case Builtin::BIfinite:
   case Builtin::BI__finite:
   case Builtin::BIfinitef:
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1896,7 +1896,7 @@
       return ExprError();
     break;
   case Builtin::BI__builtin_frame_address:
-  case Builtin::BI__builtin_return_address:
+  case Builtin::BI__builtin_return_address: {
     if (SemaBuiltinConstantArgRange(TheCall, 0, 0, 0xFFFF))
       return ExprError();
 
@@ -1913,6 +1913,20 @@
     break;
   }
 
+  case Builtin::BI__builtin_matrix_transpose:
+    // Matrix builtins are rejected unless the MatrixTypes language option is
+    // enabled.
+    if (!getLangOpts().MatrixTypes) {
+      Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
+      return ExprError();
+    }
+
+    // The case label above already identifies the builtin, so its checker is
+    // called directly; the checker validates the operand and sets the call's
+    // result type.
+    return SemaBuiltinMatrixTransposeOverload(TheCall, TheCallResult);
+  }
+
   // Since the target specific builtins for each arch overlap, only check those
   // of the arch we are compiling for.
   if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) {
@@ -15028,3 +15042,26 @@
       rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
                      _2, _3, _4));
 }
+
+// Checks a __builtin_matrix_transpose call and sets its result matrix type.
+ExprResult Sema::SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                    ExprResult CallResult) {
+  if (checkArgCount(*this, TheCall, 1))
+    return ExprError();
+
+  // The call reads the operand's value, so convert an lvalue operand first.
+  ExprResult MatrixArg = DefaultLvalueConversion(TheCall->getArg(0));
+  if (MatrixArg.isInvalid())
+    return ExprError();
+
+  const auto *MType = MatrixArg.get()->getType()->getAs<ConstantMatrixType>();
+  if (!MType) {
+    Diag(MatrixArg.get()->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+    return ExprError();
+  }
+  // Keep the converted operand; the result swaps its rows and columns.
+  TheCall->setArg(0, MatrixArg.get());
+  TheCall->setType(Context.getConstantMatrixType(
+      MType->getElementType(), MType->getNumColumns(), MType->getNumRows()));
+  return CallResult;
+}
diff --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-builtins.c
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
+typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
+typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
+typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
+typedef unsigned ux6x1_t __attribute__((matrix_type(6, 1)));
+
+void transpose_double_5x5(dx5x5_t *a) {
+  // CHECK-LABEL: define void @transpose_double_5x5(
+  // CHECK:        [[A:%.*]] = load <25 x double>, <25 x double>* {{.*}}, align 8
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> [[A]], i32 5, i32 5)
+  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT:   store <25 x double> [[TRANS]], <25 x double>* [[AT_ADDR]], align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+}
+
+void transpose_float_3x2(fx3x2_t *a) {
+  // CHECK-LABEL: define void @transpose_float_3x2(
+  // CHECK:        [[A:%.*]] = load <6 x float>, <6 x float>* {{.*}}, align 4
+  // CHECK-NEXT:   [[TRANS:%.*]] = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> [[A]], i32 3, i32 2)
+  // CHECK-NEXT:   [[AT_ADDR:%.*]] = bitcast [6 x float]* %a_t to <6 x float>*
+  // CHECK-NEXT:   store <6 x float> [[TRANS]], <6 x float>* [[AT_ADDR]], align 4
+
+  fx2x3_t a_t = __builtin_matrix_transpose(*a);
+}
+
+void transpose_int_20x4(ix20x4_t *a) {
+  // CHECK-LABEL: define void @transpose_int_20x4(
+  // CHECK:         [[A:%.*]] = load <80 x i32>, <80 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[TRANS:%.*]] = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> [[A]], i32 20, i32 4)
+  // CHECK-NEXT:    [[AT_ADDR:%.*]] = bitcast [80 x i32]* %a_t to <80 x i32>*
+  // CHECK-NEXT:    store <80 x i32> [[TRANS]], <80 x i32>* [[AT_ADDR]], align 4
+
+  ix4x20_t a_t = __builtin_matrix_transpose(*a);
+}
+
+struct Foo {
+  ux1x6_t in;
+  ux6x1_t out;
+};
+
+void transpose_struct_member(struct Foo *F) {
+  // CHECK-LABEL: define void @transpose_struct_member(
+  // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
+  // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
+  // CHECK-NEXT:    [[OUT_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 1
+  // CHECK-NEXT:    [[OUT_PTR_C:%.*]] = bitcast [6 x i32]* [[OUT_PTR]] to <6 x i32>*
+  // CHECK-NEXT:    store <6 x i32> [[M_T]], <6 x i32>* [[OUT_PTR_C]], align 4
+
+  F->out = __builtin_matrix_transpose(F->in);
+}
+
+void transpose_transpose_struct_member(struct Foo *F) {
+  // CHECK-LABEL: define void @transpose_transpose_struct_member(
+  // CHECK:         [[M:%.*]] = load <6 x i32>, <6 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M]], i32 1, i32 6)
+  // CHECK-NEXT:    [[M_T2:%.*]] = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> [[M_T]], i32 6, i32 1)
+  // CHECK-NEXT:    [[F_ADDR:%.*]] = load %struct.Foo*, %struct.Foo** %F.addr, align 8
+  // CHECK-NEXT:    [[IN_PTR:%.*]] = getelementptr inbounds %struct.Foo, %struct.Foo* [[F_ADDR]], i32 0, i32 0
+  // CHECK-NEXT:    [[IN_PTR_C:%.*]] = bitcast [6 x i32]* [[IN_PTR]] to <6 x i32>*
+  // CHECK-NEXT:    store <6 x i32> [[M_T2]], <6 x i32>* [[IN_PTR_C]], align 4
+
+  F->in = __builtin_matrix_transpose(__builtin_matrix_transpose(F->in));
+}
diff --git a/clang/test/CodeGenCXX/matrix-type-builtins.cpp b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
@@ -0,0 +1,53 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  matrix_t<EltTy, Rows, Columns> value;
+};
+
+template <typename T, unsigned R, unsigned C>
+MyMatrix<T, C, R> transpose(const MyMatrix<T, R, C> &M) {
+  MyMatrix<T, C, R> Res;
+  Res.value = __builtin_matrix_transpose(M.value);
+  return Res;
+}
+
+void test_transpose_template1() {
+  // CHECK-LABEL: define void @_Z24test_transpose_template1v()
+  // CHECK:         call void @_Z9transposeIiLj4ELj10EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.0* sret align 4 %M1_t, %struct.MyMatrix* nonnull align 4 dereferenceable(160) %M1)
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIiLj4ELj10EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <40 x i32>, <40 x i32>* {{.*}}, align 4
+  // CHECK-NEXT:    [[M_T:%.*]] = call <40 x i32> @llvm.matrix.transpose.v40i32(<40 x i32> [[M]], i32 4, i32 10)
+
+  MyMatrix<int, 4, 10> M1;
+  MyMatrix<int, 10, 4> M1_t = transpose(M1);
+}
+
+void test_transpose_template2(MyMatrix<double, 7, 6> &M) {
+  // CHECK-LABEL: define void @_Z24test_transpose_template2R8MyMatrixIdLj7ELj6EE(
+  // CHECK:         call void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.2* sret align 8 %ref.tmp1, %struct.MyMatrix.1* nonnull align 8 dereferenceable(336) %0)
+  // CHECK-NEXT:    call void @_Z9transposeIdLj6ELj7EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.1* sret align 8 %ref.tmp, %struct.MyMatrix.2* nonnull align 8 dereferenceable(336) %ref.tmp1)
+  // CHECK-NEXT:    call void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(%struct.MyMatrix.2* sret align 8 %M2_t, %struct.MyMatrix.1* nonnull align 8 dereferenceable(336) %ref.tmp)
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIdLj7ELj6EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <42 x double>, <42 x double>* {{.*}}, align 8
+  // CHECK-NEXT:    [[M_T:%.*]] = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> [[M]], i32 7, i32 6)
+  // CHECK-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds %struct.MyMatrix.2, %struct.MyMatrix.2* %agg.result, i32 0, i32 0
+  // CHECK-NEXT:    [[RES_ADDR_C:%.*]] = bitcast [42 x double]* [[RES_ADDR]] to <42 x double>*
+  // CHECK-NEXT:    store <42 x double> [[M_T]], <42 x double>* [[RES_ADDR_C]], align 8
+
+  // CHECK-LABEL: define linkonce_odr void @_Z9transposeIdLj6ELj7EE8MyMatrixIT_XT1_EXT0_EERKS0_IS1_XT0_EXT1_EE(
+  // CHECK:         [[M:%.*]] = load <42 x double>, <42 x double>* {{.*}}, align 8
+  // CHECK-NEXT:    [[M_T:%.*]] = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> [[M]], i32 6, i32 7)
+  // CHECK-NEXT:    [[RES_ADDR:%.*]] = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %agg.result, i32 0, i32 0
+  // CHECK-NEXT:    [[RES_ADDR_C:%.*]] = bitcast [42 x double]* [[RES_ADDR]] to <42 x double>*
+  // CHECK-NEXT:    store <42 x double> [[M_T]], <42 x double>* [[RES_ADDR_C]], align 8
+
+  MyMatrix<double, 6, 7> M2_t = transpose(transpose(transpose(M)));
+}
diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/matrix-type-builtins.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+typedef int ix3x2_t __attribute__((matrix_type(3, 2)));
+
+void transpose(sx5x10_t a, ix3x2_t b, int *c, int d) {
+  a = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  b = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'ix3x2_t' (aka 'int __attribute__((matrix_type(3, 2)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  __builtin_matrix_transpose(c);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose(d);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose("test");
+  // expected-error@-1 {{first argument must be a matrix}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-builtins.cpp b/clang/test/SemaCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-builtins.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1>
+typename MyMatrix<EltTy1, R1, C1>::matrix_t transpose(MyMatrix<EltTy0, R0, C0> &A) {
+  char *v1 = __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+
+  __builtin_matrix_transpose(A);
+  // expected-error@-1 {{first argument must be a matrix}}
+  // expected-error@-2 {{first argument must be a matrix}}
+
+  return __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2U, 3U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+}
+
+void test_transpose_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 3> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  Mat1.value = transpose<unsigned, 2, 3, unsigned, 2, 3>(Mat1);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 2, 3, unsigned int, 2, 3>' requested here}}
+
+  Mat1.value = transpose<unsigned, 3, 3, unsigned, 2, 3>(Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 3, 3, unsigned int, 2, 3>' requested here}}
+}