diff --git a/clang/include/clang/AST/ASTContext.h b/clang/include/clang/AST/ASTContext.h
--- a/clang/include/clang/AST/ASTContext.h
+++ b/clang/include/clang/AST/ASTContext.h
@@ -978,6 +978,7 @@
 #include "clang/Basic/OpenCLImageTypes.def"
   CanQualType OCLSamplerTy, OCLEventTy, OCLClkEventTy;
   CanQualType OCLQueueTy, OCLReserveIDTy;
+  CanQualType IncompleteMatrixIdxTy;
   CanQualType OMPArraySectionTy, OMPArrayShapingTy, OMPIteratorTy;
 #define EXT_OPAQUE_TYPE(ExtType, Id, Ext) \
   CanQualType Id##Ty;
diff --git a/clang/include/clang/AST/BuiltinTypes.def b/clang/include/clang/AST/BuiltinTypes.def
--- a/clang/include/clang/AST/BuiltinTypes.def
+++ b/clang/include/clang/AST/BuiltinTypes.def
@@ -310,6 +310,9 @@
 // context.
 PLACEHOLDER_TYPE(ARCUnbridgedCast, ARCUnbridgedCastTy)
 
+// A placeholder type for incomplete matrix index expressions.
+PLACEHOLDER_TYPE(IncompleteMatrixIdx, IncompleteMatrixIdxTy)
+
 // A placeholder type for OpenMP array sections.
 PLACEHOLDER_TYPE(OMPArraySection, OMPArraySectionTy)
 
diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -461,6 +461,11 @@
     return const_cast<Expr*>(this)->getReferencedDeclOfCallee();
   }
 
+  /// If this expression is part of a matrix index expression, return the
+  /// accessed matrix type: either this expression has matrix type, or it is a
+  /// row index expr on a matrix. Returns nullptr if \p EnableMatrix is false.
+  const ConstantMatrixType *getMatrixFromIndexExpr(bool EnableMatrix) const;
+
   /// If this expression is an l-value for an Objective C
   /// property, find the underlying property reference expression.
   const ObjCPropertyRefExpr *getObjCProperty() const;
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10744,6 +10744,12 @@
 
 def err_builtin_matrix_disabled: Error<
   "matrix types extension is disabled. Pass -fenable-matrix to enable it">;
+def err_matrix_index_not_integer: Error<
+  "matrix %select{row|column}0 index is not an integer">;
+def err_matrix_index_outside_range: Error<
+  "matrix %select{row|column}0 index is outside the allowed range [0, %1)">;
+def err_matrix_incomplete_index: Error<
+  "single subscript expressions are not allowed for matrix values">;
 
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h
--- a/clang/include/clang/Serialization/ASTBitCodes.h
+++ b/clang/include/clang/Serialization/ASTBitCodes.h
@@ -1057,6 +1057,9 @@
       /// The placeholder type for OpenMP iterator expression.
       PREDEF_TYPE_OMP_ITERATOR = 71,
 
+      /// A placeholder type for incomplete matrix index operations.
+      PREDEF_TYPE_INCOMPLETE_MATRIX_IDX = 72,
+
       /// OpenCL image types with auto numeration
 #define IMAGE_TYPE(ImgType, Id, SingletonId, Access, Suffix) \
       PREDEF_TYPE_##Id##_ID,
diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp
--- a/clang/lib/AST/ASTContext.cpp
+++ b/clang/lib/AST/ASTContext.cpp
@@ -1388,6 +1388,8 @@
     InitBuiltinType(OMPArrayShapingTy, BuiltinType::OMPArrayShaping);
     InitBuiltinType(OMPIteratorTy, BuiltinType::OMPIterator);
   }
+  if (LangOpts.MatrixTypes)
+    InitBuiltinType(IncompleteMatrixIdxTy, BuiltinType::IncompleteMatrixIdx);
 
   // C99 6.2.5p11.
   FloatComplexTy      = getComplexType(FloatTy);
diff --git a/clang/lib/AST/Expr.cpp b/clang/lib/AST/Expr.cpp
--- a/clang/lib/AST/Expr.cpp
+++ b/clang/lib/AST/Expr.cpp
@@ -3848,6 +3848,20 @@
   return nullptr;
 }
 
+const ConstantMatrixType *
+Expr::getMatrixFromIndexExpr(bool EnableMatrix) const {
+  if (!EnableMatrix)
+    return nullptr;
+
+  // Either this expression is itself a matrix ...
+  if (const auto *MatTy = getType()->getAs<ConstantMatrixType>())
+    return MatTy;
+  // ... or it is a subscript whose base is a matrix (a row index expr).
+  if (const auto *RowIdxE = dyn_cast<ArraySubscriptExpr>(this))
+    return RowIdxE->getBase()->getType()->getAs<ConstantMatrixType>();
+  return nullptr;
+}
+
 bool Expr::refersToVectorElement() const {
   // FIXME: Why do we not just look at the ObjectKind here?
   const Expr *E = this->IgnoreParens();
diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -7763,6 +7763,10 @@
   if (E->getBase()->getType()->isVectorType())
     return Error(E);
 
+  // Matrix index bases are not evaluated; diagnose like the vector case above.
+  if (E->getBase()->getMatrixFromIndexExpr(Info.getLangOpts().MatrixTypes))
+    return Error(E);
+
   bool Success = true;
   if (!evaluatePointer(E->getBase(), Result)) {
     if (!Info.noteFailure())
diff --git a/clang/lib/AST/NSAPI.cpp b/clang/lib/AST/NSAPI.cpp
--- a/clang/lib/AST/NSAPI.cpp
+++ b/clang/lib/AST/NSAPI.cpp
@@ -482,6 +482,7 @@
   case BuiltinType::Half:
   case BuiltinType::PseudoObject:
   case BuiltinType::BuiltinFn:
+  case BuiltinType::IncompleteMatrixIdx:
   case BuiltinType::OMPArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -3025,6 +3025,8 @@
     return "queue_t";
   case OCLReserveID:
     return "reserve_id_t";
+  case IncompleteMatrixIdx:
+    return "<incomplete matrix index type>";
   case OMPArraySection:
     return "<OpenMP array section type>";
   case OMPArrayShaping:
@@ -4045,6 +4047,7 @@
 #include "clang/Basic/AArch64SVEACLETypes.def"
     case BuiltinType::BuiltinFn:
     case BuiltinType::NullPtr:
+    case BuiltinType::IncompleteMatrixIdx:
     case BuiltinType::OMPArraySection:
     case BuiltinType::OMPArrayShaping:
     case BuiltinType::OMPIterator:
diff --git a/clang/lib/AST/TypeLoc.cpp b/clang/lib/AST/TypeLoc.cpp
--- a/clang/lib/AST/TypeLoc.cpp
+++ b/clang/lib/AST/TypeLoc.cpp
@@ -403,6 +403,7 @@
   case BuiltinType::Id:
 #include "clang/Basic/AArch64SVEACLETypes.def"
   case BuiltinType::BuiltinFn:
+  case BuiltinType::IncompleteMatrixIdx:
   case BuiltinType::OMPArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
diff --git a/clang/lib/CodeGen/CGExpr.cpp b/clang/lib/CodeGen/CGExpr.cpp
--- a/clang/lib/CodeGen/CGExpr.cpp
+++ b/clang/lib/CodeGen/CGExpr.cpp
@@ -1846,6 +1846,16 @@
   return RValue::get(CGF.EmitLoadOfScalar(LV, Loc));
 }
 
+static RValue EmitLoadOfMatrixEltLValue(LValue LV, SourceLocation Loc,
+                                        CodeGenFunction &CGF) {
+  assert(LV.getType()->isConstantMatrixType() &&
+         "matrix element LValues need to access a matrix");
+  Address Addr = MaybeConvertMatrixAddress(LV.getVectorAddress(), CGF);
+  llvm::LoadInst *Load = CGF.Builder.CreateLoad(Addr, LV.isVolatileQualified());
+  return RValue::get(
+      CGF.Builder.CreateExtractElement(Load, LV.getVectorIdx(), "matext"));
+}
+
 /// EmitLoadOfLValue - Given an expression that represents a value lvalue, this
 /// method emits the address of the lvalue, then loads the result as an rvalue,
 /// returning the rvalue.
@@ -1878,6 +1888,9 @@
     return RValue::get(EmitLoadOfScalar(LV, Loc));
   }
 
+  if (LV.isMatrixElt())
+    return EmitLoadOfMatrixEltLValue(LV, Loc, *this);
+
   if (LV.isVectorElt()) {
     llvm::LoadInst *Load = Builder.CreateLoad(LV.getVectorAddress(),
                                               LV.isVolatileQualified());
@@ -1999,6 +2012,19 @@
   return RValue::get(Call);
 }
 
+// Store the scalar rvalue \p Src into the matrix element described by \p Dst.
+// The matrix is loaded as a flat vector, the element is inserted at the
+// flattened index and the whole vector is written back.
+static void EmitStoreThroughMatrixEltLValue(RValue Src, LValue Dst,
+                                            CodeGenFunction &CGF) {
+  // Use the address recorded in the l-value so its alignment is honored,
+  // matching what the matrix element load path does.
+  Address DstAddr = MaybeConvertMatrixAddress(Dst.getVectorAddress(), CGF);
+  llvm::Value *Vec = CGF.Builder.CreateLoad(DstAddr);
+  Vec = CGF.Builder.CreateInsertElement(Vec, Src.getScalarVal(),
+                                        Dst.getVectorIdx(), "matins");
+  CGF.Builder.CreateStore(Vec, DstAddr, Dst.isVolatileQualified());
+}
 
 /// EmitStoreThroughLValue - Store the specified rvalue into the specified
 /// lvalue, where both are guaranteed to the have the same type, and that type
@@ -2025,6 +2051,9 @@
     if (Dst.isGlobalReg())
       return EmitStoreThroughGlobalRegLValue(Src, Dst);
 
+    if (Dst.isMatrixElt())
+      return EmitStoreThroughMatrixEltLValue(Src, Dst, *this);
+
     assert(Dst.isBitField() && "Unknown LValue type");
     return EmitStoreThroughBitfieldLValue(Src, Dst);
   }
@@ -3637,6 +3666,28 @@
                                  TBAAAccessInfo());
   }
 
+  // If the base is a matrix row index expression, this is a complete matrix
+  // index expression (with row and column indices). We are forming a matrix
+  // element lvalue with an index into the matrix as a flattened vector.
+  if (auto *MTy =
+          E->getBase()->getMatrixFromIndexExpr(getLangOpts().MatrixTypes)) {
+    auto *ColIdxExpr = cast<ArraySubscriptExpr>(E->getBase());
+    LValue Base = EmitLValue(ColIdxExpr->getBase());
+    llvm::Value *RowIdx = EmitScalarExpr(ColIdxExpr->getIdx());
+    llvm::Value *ColIdx = EmitScalarExpr(E->getIdx());
+    unsigned MaxWidth = std::max(RowIdx->getType()->getScalarSizeInBits(),
+                                 ColIdx->getType()->getScalarSizeInBits());
+    llvm::Type *IntTy = llvm::IntegerType::get(getLLVMContext(), MaxWidth);
+    RowIdx = Builder.CreateZExt(RowIdx, IntTy);
+    ColIdx = Builder.CreateZExt(ColIdx, IntTy);
+    llvm::Value *NumRows = Builder.getIntN(MaxWidth, MTy->getNumRows());
+    llvm::Value *FinalIdx =
+        Builder.CreateAdd(Builder.CreateMul(ColIdx, NumRows), RowIdx);
+    return LValue::MakeMatrixElt(Base.getAddress(*this), FinalIdx,
+                                 ColIdxExpr->getBase()->getType(),
+                                 Base.getBaseInfo(), TBAAAccessInfo());
+  }
+
   // All the other cases basically behave like simple offsetting.
 
   // Handle the extvector case we ignored above.
diff --git a/clang/lib/CodeGen/CGValue.h b/clang/lib/CodeGen/CGValue.h
--- a/clang/lib/CodeGen/CGValue.h
+++ b/clang/lib/CodeGen/CGValue.h
@@ -170,7 +170,8 @@
     VectorElt,    // This is a vector element l-value (V[i]), use getVector*
     BitField,     // This is a bitfield l-value, use getBitfield*.
     ExtVectorElt, // This is an extended vector subset, use getExtVectorComp
-    GlobalReg     // This is a register l-value, use getGlobalReg()
+    GlobalReg,    // This is a register l-value, use getGlobalReg()
+    MatrixElt     // This is a matrix element, use getVector*
   } LVType;
 
   llvm::Value *V;
@@ -254,6 +255,7 @@
   bool isBitField() const { return LVType == BitField; }
   bool isExtVectorElt() const { return LVType == ExtVectorElt; }
   bool isGlobalReg() const { return LVType == GlobalReg; }
+  bool isMatrixElt() const { return LVType == MatrixElt; }
 
   bool isVolatileQualified() const { return Quals.hasVolatile(); }
   bool isRestrictQualified() const { return Quals.hasRestrict(); }
@@ -337,8 +339,14 @@
   Address getVectorAddress() const {
     return Address(getVectorPointer(), getAlignment());
   }
-  llvm::Value *getVectorPointer() const { assert(isVectorElt()); return V; }
-  llvm::Value *getVectorIdx() const { assert(isVectorElt()); return VectorIdx; }
+  llvm::Value *getVectorPointer() const {
+    assert(isVectorElt() || isMatrixElt());
+    return V;
+  }
+  llvm::Value *getVectorIdx() const {
+    assert(isVectorElt() || isMatrixElt());
+    return VectorIdx;
+  }
 
   // extended vector elements.
   Address getExtVectorAddress() const {
@@ -430,6 +438,18 @@
     return R;
   }
 
+  static LValue MakeMatrixElt(Address matAddress, llvm::Value *Idx,
+                              QualType type, LValueBaseInfo BaseInfo,
+                              TBAAAccessInfo TBAAInfo) {
+    LValue R;
+    R.LVType = MatrixElt;
+    R.V = matAddress.getPointer();
+    R.VectorIdx = Idx;
+    R.Initialize(type, type.getQualifiers(), matAddress.getAlignment(),
+                 BaseInfo, TBAAInfo);
+    return R;
+  }
+
   RValue asAggregateRValue(CodeGenFunction &CGF) const {
     return RValue::getAggregate(getAddress(CGF), isVolatileQualified());
   }
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -4561,7 +4561,9 @@
   // resolution for the operator overload should get the first crack
   // at the overload.
   bool IsMSPropertySubscript = false;
-  if (base->getType()->isNonOverloadPlaceholderType()) {
+  auto BaseTy = base->getType();
+  if (BaseTy->isNonOverloadPlaceholderType() &&
+      !BaseTy->isSpecificPlaceholderType(BuiltinType::IncompleteMatrixIdx)) {
     IsMSPropertySubscript = isMSPropertySubscriptExpr(*this, base);
     if (!IsMSPropertySubscript) {
       ExprResult result = CheckPlaceholderExpr(base);
@@ -5195,7 +5197,8 @@
   }
 
   // Perform default conversions.
-  if (!LHSExp->getType()->getAs<VectorType>()) {
+  if (!LHSExp->getType()->getAs<VectorType>() &&
+      !LHSExp->getMatrixFromIndexExpr(getLangOpts().MatrixTypes)) {
     ExprResult Result = DefaultFunctionArrayLvalueConversion(LHSExp);
     if (Result.isInvalid())
       return ExprError();
@@ -5286,6 +5289,41 @@
     BaseExpr = LHSExp;
     IndexExpr = RHSExp;
     ResultType = LHSTy->getAs<PointerType>()->getPointeeType();
+  } else if (auto *MTy =
+                 LHSExp->getMatrixFromIndexExpr(getLangOpts().MatrixTypes)) {
+    BaseExpr = LHSExp;
+    IndexExpr = RHSExp;
+
+    // Validate index.
+    bool IsRowIdx = BaseExpr->getType()->isConstantMatrixType();
+    if (!IndexExpr->getType()->isIntegerType() &&
+        !IndexExpr->isTypeDependent()) {
+      Diag(IndexExpr->getBeginLoc(), diag::err_matrix_index_not_integer)
+          << (IsRowIdx ? 0 : 1);
+      return ExprError();
+    }
+
+    unsigned Dim = IsRowIdx ? MTy->getNumRows() : MTy->getNumColumns();
+    llvm::APSInt Idx;
+    if (IndexExpr->isIntegerConstantExpr(Idx, Context) &&
+        (Idx < 0 || Idx >= Dim)) {
+      Diag(IndexExpr->getBeginLoc(), diag::err_matrix_index_outside_range)
+          << (IsRowIdx ? 0 : 1) << Dim;
+      return ExprError();
+    }
+
+    // A row index expression gets the IncompleteMatrixIdx placeholder type,
+    // which is diagnosed unless a column index follows. Completing it with the
+    // column index gives both subscript expressions the element type.
+    if (IsRowIdx)
+      ResultType = Context.IncompleteMatrixIdxTy;
+    else {
+      ResultType = MTy->getElementType();
+      BaseExpr->setType(ResultType);
+    }
+
+    VK = VK_LValue;
+    OK = OK_VectorComponent;
   } else if (RHSTy->isArrayType()) {
     // Same as previous, except for 123[f().a] case
     Diag(RHSExp->getBeginLoc(), diag::ext_subscript_non_lvalue)
@@ -5935,6 +5973,7 @@
   // These are always invalid as call arguments and should be reported.
   case BuiltinType::BoundMember:
   case BuiltinType::BuiltinFn:
+  case BuiltinType::IncompleteMatrixIdx:
   case BuiltinType::OMPArraySection:
   case BuiltinType::OMPArrayShaping:
   case BuiltinType::OMPIterator:
@@ -18864,6 +18903,11 @@
     return ExprError();
   }
 
+  case BuiltinType::IncompleteMatrixIdx: // E may be wrapped in parentheses.
+    Diag(cast<ArraySubscriptExpr>(E->IgnoreParens())->getIdx()->getBeginLoc(),
+         diag::err_matrix_incomplete_index);
+    return ExprError();
+
   // Expressions of unknown type.
   case BuiltinType::OMPArraySection:
     Diag(E->getBeginLoc(), diag::err_omp_array_section_use);
diff --git a/clang/lib/Serialization/ASTCommon.cpp b/clang/lib/Serialization/ASTCommon.cpp
--- a/clang/lib/Serialization/ASTCommon.cpp
+++ b/clang/lib/Serialization/ASTCommon.cpp
@@ -240,6 +240,9 @@
   case BuiltinType::BuiltinFn:
     ID = PREDEF_TYPE_BUILTIN_FN;
     break;
+  case BuiltinType::IncompleteMatrixIdx:
+    ID = PREDEF_TYPE_INCOMPLETE_MATRIX_IDX;
+    break;
   case BuiltinType::OMPArraySection:
     ID = PREDEF_TYPE_OMP_ARRAY_SECTION;
     break;
diff --git a/clang/lib/Serialization/ASTReader.cpp b/clang/lib/Serialization/ASTReader.cpp
--- a/clang/lib/Serialization/ASTReader.cpp
+++ b/clang/lib/Serialization/ASTReader.cpp
@@ -7007,6 +7007,9 @@
     case PREDEF_TYPE_BUILTIN_FN:
       T = Context.BuiltinFnTy;
       break;
+    case PREDEF_TYPE_INCOMPLETE_MATRIX_IDX:
+      T = Context.IncompleteMatrixIdxTy;
+      break;
     case PREDEF_TYPE_OMP_ARRAY_SECTION:
       T = Context.OMPArraySectionTy;
       break;
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-operators.c
@@ -0,0 +1,304 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests for the matrix type operators.
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+
+// Check that we can use matrix index expression on different floating point
+// matrixes and indices.
+void insert_fp(dx5x5_t a, double d, fx2x3_t b, float e, int j, unsigned k) {
+  // CHECK-LABEL: define void @insert_fp(<25 x double> %a, double %d, <6 x float> %b, float %e, i32 %j, i32 %k)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %d.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    %j.addr = alloca i32, align 4
+  // CHECK-NEXT:    %k.addr = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    store double %d, double* %d.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %1, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    store i32 %j, i32* %j.addr, align 4
+  // CHECK-NEXT:    store i32 %k, i32* %k.addr, align 4
+  // CHECK-NEXT:    %2 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %matins = insertelement <25 x double> %3, double %2, i64 5
+  // CHECK-NEXT:    store <25 x double> %matins, <25 x double>* %0, align 8
+  a[0ll][1u] = d;
+
+  // CHECK-NEXT:    %4 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %5 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %matins1 = insertelement <6 x float> %5, float %4, i32 1
+  // CHECK-NEXT:    store <6 x float> %matins1, <6 x float>* %1, align 4
+  b[1][0] = e;
+
+  // CHECK-NEXT:    %6 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %7 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %matins2 = insertelement <25 x double> %7, double %6, i32 1
+  // CHECK-NEXT:    store <25 x double> %matins2, <25 x double>* %0, align 8
+  a[1][0u] = d;
+
+  // CHECK-NEXT:    %8 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %9 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %matins3 = insertelement <6 x float> %9, float %8, i64 3
+  // CHECK-NEXT:    store <6 x float> %matins3, <6 x float>* %1, align 4
+  b[1ull][1] = e;
+
+  // CHECK-NEXT:    %10 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %11 = load i32, i32* %j.addr, align 4
+  // CHECK-NEXT:    %12 = load i32, i32* %k.addr, align 4
+  // CHECK-NEXT:    %13 = mul i32 %12, 2
+  // CHECK-NEXT:    %14 = add i32 %13, %11
+  // CHECK-NEXT:    %15 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %matins4 = insertelement <6 x float> %15, float %10, i32 %14
+  // CHECK-NEXT:    store <6 x float> %matins4, <6 x float>* %1, align 4
+  // CHECK-NEXT:    ret void
+  b[j][k] = e;
+}
+
+// Check that we can use matrix index expressions on integer matrixes.
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+void insert_int(ix9x3_t a, int i) {
+  // CHECK-LABEL: define void @insert_int(<27 x i32> %a, i32 %i)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %i.addr = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [27 x i32]* %a.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %a, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:    %1 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:    %2 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %matins = insertelement <27 x i32> %2, i32 %1, i32 13
+  // CHECK-NEXT:    store <27 x i32> %matins, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    ret void
+
+  a[4u][1u] = i;
+}
+
+// Check that we can use matrix index expressions on FP and integer
+// matrixes.
+typedef int ix9x3_t __attribute__((matrix_type(9, 3)));
+void insert_int_fp(ix9x3_t *a, int i, fx2x3_t b, float e, short j, unsigned long long k) {
+  // CHECK-LABEL: define void @insert_int_fp([27 x i32]* %a, i32 %i, <6 x float> %b, float %e, i16 signext %j, i64 %k)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32]*, align 8
+  // CHECK-NEXT:    %i.addr = alloca i32, align 4
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    %j.addr = alloca i16, align 2
+  // CHECK-NEXT:    %k.addr = alloca i64, align 8
+  // CHECK-NEXT:    store [27 x i32]* %a, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:    %0 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %0, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    store i16 %j, i16* %j.addr, align 2
+  // CHECK-NEXT:    store i64 %k, i64* %k.addr, align 8
+  // CHECK-NEXT:    %1 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:    %2 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %3 = bitcast [27 x i32]* %2 to <27 x i32>*
+  // CHECK-NEXT:    %4 = load <27 x i32>, <27 x i32>* %3, align 4
+  // CHECK-NEXT:    %matins = insertelement <27 x i32> %4, i32 %1, i32 13
+  // CHECK-NEXT:    store <27 x i32> %matins, <27 x i32>* %3, align 4
+  (*a)[4u][1u] = i;
+
+  // CHECK-NEXT:    %5 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %6 = load <6 x float>, <6 x float>* %0, align 4
+  // CHECK-NEXT:    %matins1 = insertelement <6 x float> %6, float %5, i32 3
+  // CHECK-NEXT:    store <6 x float> %matins1, <6 x float>* %0, align 4
+  b[1u][1u] = e;
+
+  // CHECK-NEXT:    %7 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %8 = load i16, i16* %j.addr, align 2
+  // CHECK-NEXT:    %9 = load i64, i64* %k.addr, align 8
+  // CHECK-NEXT:    %10 = zext i16 %8 to i64
+  // CHECK-NEXT:    %11 = mul i64 %9, 2
+  // CHECK-NEXT:    %12 = add i64 %11, %10
+  // CHECK-NEXT:    %13 = load <6 x float>, <6 x float>* %0, align 4
+  // CHECK-NEXT:    %matins2 = insertelement <6 x float> %13, float %7, i64 %12
+  // CHECK-NEXT:    store <6 x float> %matins2, <6 x float>* %0, align 4
+  // CHECK-NEXT:    ret void
+  b[j][k] = e;
+}
+
+// Check that we can use overloaded matrix index expressions on matrixes with
+// matching dimensions, but different element types.
+typedef double dx3x3_t __attribute__((matrix_type(3, 3)));
+typedef float fx3x3_t __attribute__((matrix_type(3, 3)));
+void insert_matching_dimensions(dx3x3_t a, double i, fx3x3_t b, float e, long int j, char k) {
+  // CHECK-LABEL: define void @insert_matching_dimensions(<9 x double> %a, double %i, <9 x float> %b, float %e, i64 %j, i8 signext %k) #3 {
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [9 x double], align 8
+  // CHECK-NEXT:    %i.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    %j.addr = alloca i64, align 8
+  // CHECK-NEXT:    %k.addr = alloca i8, align 1
+  // CHECK-NEXT:    %0 = bitcast [9 x double]* %a.addr to <9 x double>*
+  // CHECK-NEXT:    store <9 x double> %a, <9 x double>* %0, align 8
+  // CHECK-NEXT:    store double %i, double* %i.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    store i64 %j, i64* %j.addr, align 8
+  // CHECK-NEXT:    store i8 %k, i8* %k.addr, align 1
+  // CHECK-NEXT:    %2 = load double, double* %i.addr, align 8
+  // CHECK-NEXT:    %3 = load <9 x double>, <9 x double>* %0, align 8
+  // CHECK-NEXT:    %matins = insertelement <9 x double> %3, double %2, i32 5
+  // CHECK-NEXT:    store <9 x double> %matins, <9 x double>* %0, align 8
+  a[2u][1u] = i;
+
+  // CHECK-NEXT:    %4 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %5 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matins1 = insertelement <9 x float> %5, float %4, i32 7
+  // CHECK-NEXT:    store <9 x float> %matins1, <9 x float>* %1, align 4
+  b[1u][2u] = e;
+
+  // CHECK-NEXT:    %6 = load double, double* %i.addr, align 8
+  // CHECK-NEXT:    %conv = fptrunc double %6 to float
+  // CHECK-NEXT:    %7 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %8 = load i8, i8* %k.addr, align 1
+  // CHECK-NEXT:    %9 = zext i8 %8 to i64
+  // CHECK-NEXT:    %10 = mul i64 %9, 3
+  // CHECK-NEXT:    %11 = add i64 %10, %7
+  // CHECK-NEXT:    %12 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matins2 = insertelement <9 x float> %12, float %conv, i64 %11
+  // CHECK-NEXT:    store <9 x float> %matins2, <9 x float>* %1, align 4
+  // CHECK-NEXT:    ret void
+  b[j][k] = i;
+}
+
+void extract1(dx5x5_t a, fx3x3_t b, ix9x3_t c, unsigned long j) {
+  // CHECK-LABEL: @extract1(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %c.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %j.addr = alloca i64, align 8
+  // CHECK-NEXT:    %v1 = alloca double, align 8
+  // CHECK-NEXT:    %v2 = alloca float, align 4
+  // CHECK-NEXT:    %v3 = alloca i32, align 4
+  // CHECK-NEXT:    %v4 = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  double v1 = a[2][3];
+
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %c.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %c, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    store i64 %j, i64* %j.addr, align 8
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %matext = extractelement <25 x double> %3, i32 17
+  // CHECK-NEXT:    store double %matext, double* %v1, align 8
+  float v2 = b[2][1];
+
+  // CHECK-NEXT:    %4 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matext1 = extractelement <9 x float> %4, i32 5
+  // CHECK-NEXT:    store float %matext1, float* %v2, align 4
+  // CHECK-NEXT:    %5 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %matext2 = extractelement <27 x i32> %5, i32 10
+  // CHECK-NEXT:    store i32 %matext2, i32* %v3, align 4
+  int v3 = c[1][1];
+
+  // CHECK-NEXT:    %6 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %7 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %8 = mul i64 %7, 9
+  // CHECK-NEXT:    %9 = add i64 %8, %6
+  // CHECK-NEXT:    %10 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %matext3 = extractelement <27 x i32> %10, i64 %9
+  // CHECK-NEXT:    store i32 %matext3, i32* %v4, align 4
+  // CHECK-NEXT:    ret void
+  int v4 = c[j][j];
+}
+
+typedef double dx3x2_t __attribute__((matrix_type(3, 2)));
+double test_extract_matrix_pointer(dx5x5_t *ptr, dx3x2_t **ptr2) {
+  // CHECK-LABEL: define double @test_extract_matrix_pointer([25 x double]* %ptr, [6 x double]** %ptr2)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %ptr.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT:    %ptr2.addr = alloca [6 x double]**, align 8
+  // CHECK-NEXT:    store [25 x double]* %ptr, [25 x double]** %ptr.addr, align 8
+  // CHECK-NEXT:    store [6 x double]** %ptr2, [6 x double]*** %ptr2.addr, align 8
+  // CHECK-NEXT:    %0 = load [25 x double]*, [25 x double]** %ptr.addr, align 8
+  // CHECK-NEXT:    %arrayidx = getelementptr inbounds [25 x double], [25 x double]* %0, i64 0
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %arrayidx to <25 x double>*
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %matext = extractelement <25 x double> %2, i32 17
+  // CHECK-NEXT:    %3 = load [6 x double]**, [6 x double]*** %ptr2.addr, align 8
+  // CHECK-NEXT:    %arrayidx1 = getelementptr inbounds [6 x double]*, [6 x double]** %3, i64 1
+  // CHECK-NEXT:    %4 = load [6 x double]*, [6 x double]** %arrayidx1, align 8
+  // CHECK-NEXT:    %arrayidx2 = getelementptr inbounds [6 x double], [6 x double]* %4, i64 2
+  // CHECK-NEXT:    %5 = bitcast [6 x double]* %arrayidx2 to <6 x double>*
+  // CHECK-NEXT:    %6 = load <6 x double>, <6 x double>* %5, align 8
+  // CHECK-NEXT:    %matext3 = extractelement <6 x double> %6, i32 3
+  // CHECK-NEXT:    %add = fadd double %matext, %matext3
+  // CHECK-NEXT:    %7 = load [6 x double]**, [6 x double]*** %ptr2.addr, align 8
+  // CHECK-NEXT:    %add.ptr = getelementptr inbounds [6 x double]*, [6 x double]** %7, i64 4
+  // CHECK-NEXT:    %8 = load [6 x double]*, [6 x double]** %add.ptr, align 8
+  // CHECK-NEXT:    %add.ptr4 = getelementptr inbounds [6 x double], [6 x double]* %8, i64 6
+  // CHECK-NEXT:    %9 = bitcast [6 x double]* %add.ptr4 to <6 x double>*
+  // CHECK-NEXT:    %10 = load <6 x double>, <6 x double>* %9, align 8
+  // CHECK-NEXT:    %matext5 = extractelement <6 x double> %10, i32 1
+  // CHECK-NEXT:    %add6 = fadd double %add, %matext5
+  // CHECK-NEXT:    ret double %add6
+
+  return (ptr[0])[2][3] + ptr2[1][2][0][1] + (*(*(ptr2 + 4) + 6))[1][0];
+}
+void insert_extract(dx5x5_t a, fx3x3_t b, unsigned long j, short k) {
+  // CHECK-LABEL: define void @insert_extract(<25 x double> %a, <9 x float> %b, i64 %j, i16 signext %k)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %j.addr = alloca i64, align 8
+  // CHECK-NEXT:    %k.addr = alloca i16, align 2
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    store i64 %j, i64* %j.addr, align 8
+  // CHECK-NEXT:    store i16 %k, i16* %k.addr, align 2
+  // CHECK-NEXT:    %2 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %3 = load i16, i16* %k.addr, align 2
+  // CHECK-NEXT:    %4 = zext i16 %3 to i64
+  // CHECK-NEXT:    %5 = mul i64 %4, 3
+  // CHECK-NEXT:    %6 = add i64 %5, %2
+  // CHECK-NEXT:    %7 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matext = extractelement <9 x float> %7, i64 %6
+  // CHECK-NEXT:    %conv = fpext float %matext to double
+  // CHECK-NEXT:    %8 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %matins = insertelement <25 x double> %8, double %conv, i32 17
+  // CHECK-NEXT:    store <25 x double> %matins, <25 x double>* %0, align 8
+  a[2][3] = b[j][k];
+
+  // CHECK-NEXT:    %9 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matext1 = extractelement <9 x float> %9, i32 3
+  // CHECK-NEXT:    %10 = load i16, i16* %k.addr, align 2
+  // CHECK-NEXT:    %11 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %12 = zext i16 %10 to i64
+  // CHECK-NEXT:    %13 = mul i64 %11, 3
+  // CHECK-NEXT:    %14 = add i64 %13, %12
+  // CHECK-NEXT:    %15 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matins2 = insertelement <9 x float> %15, float %matext1, i64 %14
+  // CHECK-NEXT:    store <9 x float> %matins2, <9 x float>* %1, align 4
+  b[k][j] = b[0][1];
+
+  // CHECK-NEXT:    %16 = load i16, i16* %k.addr, align 2
+  // CHECK-NEXT:    %17 = zext i16 %16 to i32
+  // CHECK-NEXT:    %18 = mul i32 %17, 3
+  // CHECK-NEXT:    %19 = add i32 %18, 0
+  // CHECK-NEXT:    %20 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matext3 = extractelement <9 x float> %20, i32 %19
+  // CHECK-NEXT:    %21 = load i64, i64* %j.addr, align 8
+  // CHECK-NEXT:    %22 = mul i64 %21, 3
+  // CHECK-NEXT:    %23 = add i64 %22, 2
+  // CHECK-NEXT:    %24 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matins4 = insertelement <9 x float> %24, float %matext3, i64 %23
+  // CHECK-NEXT:    store <9 x float> %matins4, <9 x float>* %1, align 4
+  // CHECK-NEXT:    ret void
+  b[2][j] = b[0][k];
+}
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -0,0 +1,211 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++11 | FileCheck %s
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5))); // 5 rows x 5 columns of double (25 elements)
+using fx2x3_t = float __attribute__((matrix_type(2, 3))); // 2 rows x 3 columns of float (6 elements)
+
+void insert_fp(dx5x5_t *a, double d, fx2x3_t *b, float e) { // Stores a scalar into one constant-indexed element of each pointed-to matrix.
+  (*a)[0u][1u] = d; // flattened element 5 = column 1 * 5 rows + row 0
+  (*b)[1u][0u] = e; // flattened element 1 = column 0 * 2 rows + row 1
+
+  // CHECK-LABEL: @_Z9insert_fpPU11matrix_typeLm5ELm5EddPU11matrix_typeLm2ELm3Eff(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT:    %d.addr = alloca double, align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    store double %d, double* %d.addr, align 8
+  // CHECK-NEXT:    store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load double, double* %d.addr, align 8
+  // CHECK-NEXT:    %1 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT:    %2 = bitcast [25 x double]* %1 to <25 x double>*
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %matins = insertelement <25 x double> %3, double %0, i32 5
+  // CHECK-NEXT:    store <25 x double> %matins, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %4 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %5 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT:    %6 = bitcast [6 x float]* %5 to <6 x float>*
+  // CHECK-NEXT:    %7 = load <6 x float>, <6 x float>* %6, align 4
+  // CHECK-NEXT:    %matins1 = insertelement <6 x float> %7, float %4, i32 1
+  // CHECK-NEXT:    store <6 x float> %matins1, <6 x float>* %6, align 4
+  // CHECK-NEXT:    ret void
+}
+
+typedef int ix9x3_t __attribute__((matrix_type(9, 3))); // 9 rows x 3 columns of int (27 elements)
+
+void insert_int(ix9x3_t *a, int i) { // Stores an int into one constant-indexed element of the pointed-to matrix.
+  (*a)[4u][1u] = i; // flattened element 13 = column 1 * 9 rows + row 4
+
+  // CHECK-LABEL: @_Z10insert_intPU11matrix_typeLm9ELm3Eii(
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32]*, align 8
+  // CHECK-NEXT:    %i.addr = alloca i32, align 4
+  // CHECK-NEXT:    store [27 x i32]* %a, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    store i32 %i, i32* %i.addr, align 4
+  // CHECK-NEXT:    %0 = load i32, i32* %i.addr, align 4
+  // CHECK-NEXT:    %1 = load [27 x i32]*, [27 x i32]** %a.addr, align 8
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %1 to <27 x i32>*
+  // CHECK-NEXT:    %3 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %matins = insertelement <27 x i32> %3, i32 %0, i32 13
+  // CHECK-NEXT:    store <27 x i32> %matins, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    ret void
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix { // Wraps a matrix whose element type and dimensions come from template parameters.
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns))); // matrix type with dependent element type and dimensions
+
+  matrix_t value; // the wrapped matrix; the tests read and write it directly
+};
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+void insert(MyMatrix<EltTy, Rows, Columns> &Mat, EltTy e) { // Writes e into row 1, column 0 of Mat.value.
+  Mat.value[1u][0u] = e; // subscripts applied to a matrix with dependent dimensions; both instantiations in test_template write flattened element 1
+}
+
+void test_template(unsigned *Ptr1, unsigned E1, float *Ptr2, float E2) { // Instantiates insert() for a 2x2 unsigned and a 3x8 float matrix.
+
+  // CHECK-LABEL: define void @_Z13test_templatePjjPff(i32* %Ptr1, i32 %E1, float* %Ptr2, float %E2)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Ptr1.addr = alloca i32*, align 8
+  // CHECK-NEXT:    %E1.addr = alloca i32, align 4
+  // CHECK-NEXT:    %Ptr2.addr = alloca float*, align 8
+  // CHECK-NEXT:    %E2.addr = alloca float, align 4
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix, align 4
+  // CHECK-NEXT:    %Mat2 = alloca %struct.MyMatrix.0, align 4
+  // CHECK-NEXT:    store i32* %Ptr1, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    store i32 %E1, i32* %E1.addr, align 4
+  // CHECK-NEXT:    store float* %Ptr2, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    store float %E2, float* %E2.addr, align 4
+  // CHECK-NEXT:    %0 = load i32*, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast i32* %0 to [4 x i32]*
+  // CHECK-NEXT:    %2 = bitcast [4 x i32]* %1 to <4 x i32>*
+  // CHECK-NEXT:    %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    store <4 x i32> %3, <4 x i32>* %4, align 4
+  // CHECK-NEXT:    %5 = load i32, i32* %E1.addr, align 4
+  // CHECK-NEXT:    call void @_Z6insertIjLj2ELj2EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix* dereferenceable(16) %Mat1, i32 %5)
+  // CHECK-NEXT:    %6 = load float*, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    %7 = bitcast float* %6 to [24 x float]*
+  // CHECK-NEXT:    %8 = bitcast [24 x float]* %7 to <24 x float>*
+  // CHECK-NEXT:    %9 = load <24 x float>, <24 x float>* %8, align 4
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix.0, %struct.MyMatrix.0* %Mat2, i32 0, i32 0
+  // CHECK-NEXT:    %10 = bitcast [24 x float]* %value1 to <24 x float>*
+  // CHECK-NEXT:    store <24 x float> %9, <24 x float>* %10, align 4
+  // CHECK-NEXT:    %11 = load float, float* %E2.addr, align 4
+  // CHECK-NEXT:    call void @_Z6insertIfLj3ELj8EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix.0* dereferenceable(96) %Mat2, float %11)
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr void @_Z6insertIjLj2ELj2EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix* dereferenceable(16) %Mat, i32 %e)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix*, align 8
+  // CHECK-NEXT:    %e.addr = alloca i32, align 4
+  // CHECK-NEXT:    store %struct.MyMatrix* %Mat, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    store i32 %e, i32* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load i32, i32* %e.addr, align 4
+  // CHECK-NEXT:    %1 = load %struct.MyMatrix*, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %1, i32 0, i32 0
+  // CHECK-NEXT:    %2 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    %matins = insertelement <4 x i32> %3, i32 %0, i32 1
+  // CHECK-NEXT:    store <4 x i32> %matins, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr void @_Z6insertIfLj3ELj8EEvR8MyMatrixIT_XT0_EXT1_EES1_(%struct.MyMatrix.0* dereferenceable(96) %Mat, float %e)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix.0*, align 8
+  // CHECK-NEXT:    %e.addr = alloca float, align 4
+  // CHECK-NEXT:    store %struct.MyMatrix.0* %Mat, %struct.MyMatrix.0** %Mat.addr, align 8
+  // CHECK-NEXT:    store float %e, float* %e.addr, align 4
+  // CHECK-NEXT:    %0 = load float, float* %e.addr, align 4
+  // CHECK-NEXT:    %1 = load %struct.MyMatrix.0*, %struct.MyMatrix.0** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.0, %struct.MyMatrix.0* %1, i32 0, i32 0
+  // CHECK-NEXT:    %2 = bitcast [24 x float]* %value to <24 x float>*
+  // CHECK-NEXT:    %3 = load <24 x float>, <24 x float>* %2, align 4
+  // CHECK-NEXT:    %matins = insertelement <24 x float> %3, float %0, i32 1
+  // CHECK-NEXT:    store <24 x float> %matins, <24 x float>* %2, align 4
+  // CHECK-NEXT:    ret void
+
+  MyMatrix<unsigned, 2, 2> Mat1; // 2x2 unsigned, handled as <4 x i32> in the IR above
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1); // reinterpret the raw pointer as a matrix and copy it into the wrapper
+  insert(Mat1, E1); // instantiates insert<unsigned, 2, 2>
+
+  MyMatrix<float, 3, 8> Mat2; // 3x8 float, handled as <24 x float> in the IR above
+  Mat2.value = *((decltype(Mat2)::matrix_t *)Ptr2); // same reinterpret-and-copy for the float matrix
+  insert(Mat2, E2); // instantiates insert<float, 3, 8>
+}
+
+typedef float fx3x3_t __attribute__((matrix_type(3, 3))); // 3 rows x 3 columns of float (9 elements)
+void extract1(dx5x5_t a, fx3x3_t b, ix9x3_t c) { // Reads one constant-indexed element from each by-value matrix argument.
+  // CHECK-LABEL: @_Z8extract1U11matrix_typeLm5ELm5EdU11matrix_typeLm3ELm3EfU11matrix_typeLm9ELm3Ei(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [9 x float], align 4
+  // CHECK-NEXT:    %c.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %v1 = alloca double, align 8
+  // CHECK-NEXT:    %v2 = alloca float, align 4
+  // CHECK-NEXT:    %v3 = alloca i32, align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [9 x float]* %b.addr to <9 x float>*
+  // CHECK-NEXT:    store <9 x float> %b, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %2 = bitcast [27 x i32]* %c.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %c, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %3 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %matext = extractelement <25 x double> %3, i32 17
+  // CHECK-NEXT:    store double %matext, double* %v1, align 8
+  // CHECK-NEXT:    %4 = load <9 x float>, <9 x float>* %1, align 4
+  // CHECK-NEXT:    %matext1 = extractelement <9 x float> %4, i32 5
+  // CHECK-NEXT:    store float %matext1, float* %v2, align 4
+  // CHECK-NEXT:    %5 = load <27 x i32>, <27 x i32>* %2, align 4
+  // CHECK-NEXT:    %matext2 = extractelement <27 x i32> %5, i32 10
+  // CHECK-NEXT:    store i32 %matext2, i32* %v3, align 4
+  // CHECK-NEXT:    ret void
+
+  double v1 = a[2][3]; // flattened element 17 = column 3 * 5 rows + row 2
+  float v2 = b[2][1]; // flattened element 5 = column 1 * 3 rows + row 2
+  int v3 = c[1][1]; // flattened element 10 = column 1 * 9 rows + row 1
+}
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+EltTy extract(MyMatrix<EltTy, Rows, Columns> &Mat) { // Returns the element at row 1, column 0 of Mat.value.
+  return Mat.value[1u][0u]; // subscripts applied to a matrix with dependent dimensions; the 2x2 instantiation reads flattened element 1
+}
+
+void test_extract_template(unsigned *Ptr1, float *Ptr2) { // Instantiates extract() for a 2x2 unsigned matrix; Ptr2 is only spilled to its stack slot and never used to build a matrix.
+  // CHECK-LABEL: define void @_Z21test_extract_templatePjPf(i32* %Ptr1, float* %Ptr2)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Ptr1.addr = alloca i32*, align 8
+  // CHECK-NEXT:    %Ptr2.addr = alloca float*, align 8
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix, align 4
+  // CHECK-NEXT:    %v1 = alloca i32, align 4
+  // CHECK-NEXT:    store i32* %Ptr1, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    store float* %Ptr2, float** %Ptr2.addr, align 8
+  // CHECK-NEXT:    %0 = load i32*, i32** %Ptr1.addr, align 8
+  // CHECK-NEXT:    %1 = bitcast i32* %0 to [4 x i32]*
+  // CHECK-NEXT:    %2 = bitcast [4 x i32]* %1 to <4 x i32>*
+  // CHECK-NEXT:    %3 = load <4 x i32>, <4 x i32>* %2, align 4
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    store <4 x i32> %3, <4 x i32>* %4, align 4
+  // CHECK-NEXT:    %call = call i32 @_Z7extractIjLj2ELj2EET_R8MyMatrixIS0_XT0_EXT1_EE(%struct.MyMatrix* dereferenceable(16) %Mat1)
+  // CHECK-NEXT:    store i32 %call, i32* %v1, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr i32 @_Z7extractIjLj2ELj2EET_R8MyMatrixIS0_XT0_EXT1_EE(%struct.MyMatrix* dereferenceable(16) %Mat)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat.addr = alloca %struct.MyMatrix*, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix* %Mat, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix*, %struct.MyMatrix** %Mat.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix, %struct.MyMatrix* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [4 x i32]* %value to <4 x i32>*
+  // CHECK-NEXT:    %2 = load <4 x i32>, <4 x i32>* %1, align 4
+  // CHECK-NEXT:    %matext = extractelement <4 x i32> %2, i32 1
+  // CHECK-NEXT:    ret i32 %matext
+
+  MyMatrix<unsigned, 2, 2> Mat1; // 2x2 unsigned, handled as <4 x i32> in the IR above
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1); // reinterpret the raw pointer as a matrix and copy it into the wrapper
+  unsigned v1 = extract(Mat1); // instantiates extract<unsigned, 2, 2>
+}
diff --git a/clang/test/Sema/matrix-type-operators.c b/clang/test/Sema/matrix-type-operators.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/matrix-type-operators.c
@@ -0,0 +1,69 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10))); // 5 rows x 10 columns: valid rows are [0, 5), valid columns are [0, 10)
+
+void insert(sx5x10_t a, float f) { // Diagnostics for invalid matrix element stores.
+  // Non-integer indexes.
+  a[3][f] = 0;
+  // expected-error@-1 {{matrix column index is not an integer}}
+  a[f][9] = 0;
+  // expected-error@-1 {{matrix row index is not an integer}}
+  a[f][f] = 0; // both indexes are float; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is not an integer}}
+  a[0][f] = 0;
+  // expected-error@-1 {{matrix column index is not an integer}}
+
+  // Invalid element type.
+  a[3][4] = &f;
+  // expected-error@-1 {{assigning to 'float' from incompatible type 'float *'; remove &}}
+
+  // Indexes outside allowed dimensions.
+  a[-1][3] = 10.0;
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[3][-1] = 10.0;
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[3][-1u] = 10.0; // unsigned variant of the negative-index case
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[-1u][3] = 10.0; // unsigned variant of the negative-index case
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[5][2] = 10.0; // first row index past the end
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[4][10] = 10.0; // first column index past the end
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[5][10.0] = f; // row out of range and column non-integer; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+
+  a[3] = 5.0; // a lone subscript on a matrix value
+  // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
+}
+
+void extract(sx5x10_t a, float f) { // Diagnostics for invalid matrix element reads.
+  // Non-integer indexes.
+  float v1 = a[3][f];
+  // expected-error@-1 {{matrix column index is not an integer}}
+  float v2 = a[f][9];
+  // expected-error@-1 {{matrix row index is not an integer}}
+  float v3 = a[f][f]; // both indexes are float; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is not an integer}}
+
+  // Invalid element type.
+  char *v4 = a[3][4];
+  // expected-error@-1 {{initializing 'char *' with an expression of incompatible type 'float'}}
+
+  // Indexes outside allowed dimensions.
+  float v5 = a[-1][3];
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v6 = a[3][-1];
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  float v8 = a[-1u][3]; // NOTE(review): numbering skips v7; insert() also covers a[3][-1u], which has no counterpart here
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v9 = a[5][2]; // first row index past the end
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v10 = a[4][10]; // first column index past the end
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  float v11 = a[5][10.0]; // row out of range and column non-integer; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+
+  float v12 = a[3]; // a lone subscript on a matrix value
+  // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-operators.cpp b/clang/test/SemaCXX/matrix-type-operators.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-operators.cpp
@@ -0,0 +1,70 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10))); // 5 rows x 10 columns: valid rows are [0, 5), valid columns are [0, 10)
+
+void insert(sx5x10_t a, float f) { // Diagnostics for invalid matrix element stores.
+  // Non-integer indexes.
+  a[3][f] = 0;
+  // expected-error@-1 {{matrix column index is not an integer}}
+  a[f][9] = 0;
+  // expected-error@-1 {{matrix row index is not an integer}}
+  a[f][f] = 0; // both indexes are float; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is not an integer}}
+  a[0][f] = 0;
+  // expected-error@-1 {{matrix column index is not an integer}}
+
+  // Invalid element type.
+  a[3][4] = &f;
+  // expected-error@-1 {{assigning to 'float' from incompatible type 'float *'; remove &}}
+
+  // Indexes outside allowed dimensions.
+  a[-1][3] = 10.0;
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[3][-1] = 10.0;
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[3][-1u] = 10.0; // unsigned variant of the negative-index case
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[-1u][3] = 10.0; // unsigned variant of the negative-index case
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[5][2] = 10.0; // first row index past the end
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  a[4][10] = 10.0; // first column index past the end
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  a[5][10.0] = f; // row out of range and column non-integer; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+}
+
+void extract(sx5x10_t a, float f) { // Diagnostics for invalid matrix element reads.
+  // Non-integer indexes.
+  float v1 = a[3][f];
+  // expected-error@-1 {{matrix column index is not an integer}}
+  float v2 = a[f][9];
+  // expected-error@-1 {{matrix row index is not an integer}}
+  float v3 = a[f][f]; // both indexes are float; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is not an integer}}
+
+  // Invalid element type.
+  char *v4 = a[3][4];
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an lvalue of type 'float'}}
+
+  // Indexes outside allowed dimensions.
+  float v5 = a[-1][3];
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v6 = a[3][-1];
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  float v8 = a[-1u][3]; // NOTE(review): numbering skips v7; insert() also covers a[3][-1u], which has no counterpart here
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v9 = a[5][2]; // first row index past the end
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+  float v10 = a[4][10]; // first column index past the end
+  // expected-error@-1 {{matrix column index is outside the allowed range [0, 10)}}
+  float v11 = a[5][10.0]; // row out of range and column non-integer; only the row diagnostic is listed
+  // expected-error@-1 {{matrix row index is outside the allowed range [0, 5)}}
+}
+
+void incomplete_matrix_index_expr(sx5x10_t a, float f) { // A matrix subscripted only once is rejected as both a read and a store target.
+  float x = a[3]; // lone subscript used as a value
+  // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
+  a[2] = f; // lone subscript used as an assignment target
+  // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
+}