diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -11139,6 +11139,11 @@
   QualType CheckVectorLogicalOperands(ExprResult &LHS, ExprResult &RHS,
                                       SourceLocation Loc);
 
+  /// Type checking for elementwise matrix binary operators (e.g. '+', '-').
+  QualType CheckMatrixElementwiseOperands(ExprResult &LHS, ExprResult &RHS,
+                                          SourceLocation Loc,
+                                          bool IsCompAssign);
+
   bool areLaxCompatibleVectorTypes(QualType srcType, QualType destType);
   bool isLaxVectorConversion(QualType srcType, QualType destType);
 
diff --git a/clang/lib/CodeGen/CGExprScalar.cpp b/clang/lib/CodeGen/CGExprScalar.cpp
--- a/clang/lib/CodeGen/CGExprScalar.cpp
+++ b/clang/lib/CodeGen/CGExprScalar.cpp
@@ -37,6 +37,7 @@
 #include "llvm/IR/GlobalVariable.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsPowerPC.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/IR/Module.h"
 #include <cstdarg>
 
@@ -3469,6 +3470,11 @@
     }
   }
 
+  if (op.Ty->isMatrixType()) {
+    llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+    return MB.CreateAdd(op.LHS, op.RHS);
+  }
+
   if (op.Ty->isUnsignedIntegerType() &&
       CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow) &&
       !CanElideOverflowCheck(CGF.getContext(), op))
@@ -3614,6 +3620,11 @@
       }
     }
 
+    if (op.Ty->isMatrixType()) {
+      llvm::MatrixBuilder<CGBuilderTy> MB(Builder);
+      return MB.CreateSub(op.LHS, op.RHS);
+    }
+
     if (op.Ty->isUnsignedIntegerType() &&
         CGF.SanOpts.has(SanitizerKind::UnsignedIntegerOverflow) &&
         !CanElideOverflowCheck(CGF.getContext(), op))
diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp
--- a/clang/lib/Sema/SemaExpr.cpp
+++ b/clang/lib/Sema/SemaExpr.cpp
@@ -9960,6 +9960,18 @@
     return compType;
   }
 
+  // Matrix operands are type-checked on their own path and do not go through
+  // the usual arithmetic conversions below. For a compound assignment, also
+  // report the computation type through CompLHSTy.
+  if (LHS.get()->getType()->isMatrixType() ||
+      RHS.get()->getType()->isMatrixType()) {
+    QualType compType =
+        CheckMatrixElementwiseOperands(LHS, RHS, Loc, CompLHSTy);
+    if (CompLHSTy)
+      *CompLHSTy = compType;
+    return compType;
+  }
+
   QualType compType = UsualArithmeticConversions(
       LHS, RHS, Loc, CompLHSTy ? ACK_CompAssign : ACK_Arithmetic);
   if (LHS.isInvalid() || RHS.isInvalid())
@@ -10055,6 +10067,18 @@
     return compType;
   }
 
+  // Matrix operands are type-checked on their own path and do not go through
+  // the usual arithmetic conversions below. For a compound assignment, also
+  // report the computation type through CompLHSTy.
+  if (LHS.get()->getType()->isMatrixType() ||
+      RHS.get()->getType()->isMatrixType()) {
+    QualType compType =
+        CheckMatrixElementwiseOperands(LHS, RHS, Loc, CompLHSTy);
+    if (CompLHSTy)
+      *CompLHSTy = compType;
+    return compType;
+  }
+
   QualType compType = UsualArithmeticConversions(
       LHS, RHS, Loc, CompLHSTy ? ACK_CompAssign : ACK_Arithmetic);
   if (LHS.isInvalid() || RHS.isInvalid())
@@ -11646,6 +11656,70 @@
   return GetSignedVectorType(LHS.get()->getType());
 }
 
+/// Try to cast \p Scalar to the matrix element type \p ElementType. Returns
+/// true and rewrites \p Scalar on success; false leaves \p Scalar unchanged.
+static bool tryConvertScalarToMatrixElementTy(Sema &S, QualType ElementType,
+                                              ExprResult *Scalar) {
+  QualType SrcTy = Scalar->get()->getType().getUnqualifiedType();
+  if (!SrcTy->isArithmeticType())
+    return false;
+
+  const bool SrcIsInt = SrcTy->isIntegralType(S.Context);
+  const bool DstIsFP = ElementType->isRealFloatingType();
+  CastKind Kind;
+  if (SrcIsInt && ElementType->isIntegralType(S.Context))
+    Kind = CK_IntegralCast;
+  else if (DstIsFP && SrcTy->isRealFloatingType())
+    Kind = CK_FloatingCast;
+  else if (DstIsFP && SrcIsInt)
+    Kind = CK_IntegralToFloating;
+  else
+    return false; // Remaining combinations are not converted.
+  *Scalar = S.ImpCastExprToType(Scalar->get(), ElementType, Kind);
+  return true;
+}
+
+QualType Sema::CheckMatrixElementwiseOperands(ExprResult &LHS, ExprResult &RHS,
+                                              SourceLocation Loc,
+                                              bool IsCompAssign) {
+  // The left operand of a compound assignment is left as written; every other
+  // operand undergoes the default function/array/lvalue conversions.
+  if (!IsCompAssign) {
+    LHS = DefaultFunctionArrayLvalueConversion(LHS.get());
+    if (LHS.isInvalid())
+      return QualType();
+  }
+  RHS = DefaultFunctionArrayLvalueConversion(RHS.get());
+  if (RHS.isInvalid())
+    return QualType();
+
+  // Qualifiers are irrelevant here: "const float" is treated like "float".
+  const QualType LTy = LHS.get()->getType().getUnqualifiedType();
+  const QualType RTy = RHS.get()->getType().getUnqualifiedType();
+  const MatrixType *LHSMatType = LTy->getAs<MatrixType>();
+  const MatrixType *RHSMatType = RTy->getAs<MatrixType>();
+  assert((LHSMatType || RHSMatType) && "At least one operand must be a matrix");
+
+  // Operands that already share a type need no conversion at all.
+  if (Context.hasSameType(LTy, RTy))
+    return LTy;
+
+  // From here on exactly one side must be a matrix; in particular, two
+  // matrices of different types cannot be combined.
+  if ((LHSMatType != nullptr) == (RHSMatType != nullptr))
+    return InvalidOperands(Loc, LHS, RHS);
+
+  // The non-matrix side has to convert to the matrix's element type; the
+  // result then has the matrix operand's (unqualified) type.
+  const bool MatrixOnLeft = LHSMatType != nullptr;
+  const MatrixType *MatTy = MatrixOnLeft ? LHSMatType : RHSMatType;
+  ExprResult &ScalarSide = MatrixOnLeft ? RHS : LHS;
+  if (!tryConvertScalarToMatrixElementTy(*this, MatTy->getElementType(),
+                                         &ScalarSide))
+    return InvalidOperands(Loc, LHS, RHS);
+  return MatrixOnLeft ? LTy : RTy;
+}
+
 inline QualType Sema::CheckBitwiseOperands(ExprResult &LHS, ExprResult &RHS,
                                            SourceLocation Loc,
                                            BinaryOperatorKind Opc) {
diff --git a/clang/test/CodeGen/matrix-type-operators.c b/clang/test/CodeGen/matrix-type-operators.c
--- a/clang/test/CodeGen/matrix-type-operators.c
+++ b/clang/test/CodeGen/matrix-type-operators.c
@@ -155,3 +155,311 @@
   // CHECK-NEXT:    store i32 %matext2, i32* %v3, align 4
   // CHECK-NEXT:    ret void
 }
+
+void add_matrix_matrix(dx5x5_t a, dx5x5_t b, dx5x5_t c, ix9x3_t ai, ix9x3_t bi, ix9x3_t ci) {
+  a = b + c; // double elements: emitted as one vector fadd
+  ai = bi + ci; // int elements: emitted as one vector add
+
+  // CHECK-LABEL: @add_matrix_matrix(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %c.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %ai.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %bi.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %ci.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %b.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %b, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %2 = bitcast [25 x double]* %c.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %c, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %3 = bitcast [27 x i32]* %ai.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %ai, <27 x i32>* %3, align 4
+  // CHECK-NEXT:    %4 = bitcast [27 x i32]* %bi.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %bi, <27 x i32>* %4, align 4
+  // CHECK-NEXT:    %5 = bitcast [27 x i32]* %ci.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %ci, <27 x i32>* %5, align 4
+  // CHECK-NEXT:    %6 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %7 = load <25 x double>, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %8 = fadd <25 x double> %6, %7
+  // CHECK-NEXT:    store <25 x double> %8, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %9 = load <27 x i32>, <27 x i32>* %4, align 4
+  // CHECK-NEXT:    %10 = load <27 x i32>, <27 x i32>* %5, align 4
+  // CHECK-NEXT:    %11 = add <27 x i32> %9, %10
+  // CHECK-NEXT:    store <27 x i32> %11, <27 x i32>* %3, align 4
+  // CHECK-NEXT:    ret void
+}
+
+void add_matrix_scalar_float(dx5x5_t a, fx2x3_t b, float vf, double vd) {
+  a = a + vf; // float scalar is extended to the double element type, then splatted
+  a = a + vd; // scalar already has the element type: splat only
+
+  // CHECK-LABEL: define void @add_matrix_scalar_float(<25 x double> %a, <6 x float> %b, float %vf, double %vd)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %vf.addr = alloca float, align 4
+  // CHECK-NEXT:    %vd.addr = alloca double, align 8
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %1, align 4
+  // CHECK-NEXT:    store float %vf, float* %vf.addr, align 4
+  // CHECK-NEXT:    store double %vd, double* %vd.addr, align 8
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %3 = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:    %conv = fpext float %3 to double
+  // CHECK-NEXT:    %scalar.splat.splatinsert = insertelement <25 x double> undef, double %conv, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat = shufflevector <25 x double> %scalar.splat.splatinsert, <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:    %4 = fadd <25 x double> %2, %scalar.splat.splat
+  // CHECK-NEXT:    store <25 x double> %4, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %5 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %6 = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:    %scalar.splat.splatinsert1 = insertelement <25 x double> undef, double %6, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat2 = shufflevector <25 x double> %scalar.splat.splatinsert1, <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:    %7 = fadd <25 x double> %5, %scalar.splat.splat2
+  // CHECK-NEXT:    store <25 x double> %7, <25 x double>* %0, align 8
+
+  b = b + vf; // scalar already has the element type: splat only
+  b = b + vd; // double scalar is truncated to the float element type, then splatted
+
+  // CHECK-NEXT:    %8 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %9 = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert3 = insertelement <6 x float> undef, float %9, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat4 = shufflevector <6 x float> %scalar.splat.splatinsert3, <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    %10 = fadd <6 x float> %8, %scalar.splat.splat4
+  // CHECK-NEXT:    store <6 x float> %10, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %11 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %12 = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:    %conv5 = fptrunc double %12 to float
+  // CHECK-NEXT:    %scalar.splat.splatinsert6 = insertelement <6 x float> undef, float %conv5, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat7 = shufflevector <6 x float> %scalar.splat.splatinsert6, <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    %13 = fadd <6 x float> %11, %scalar.splat.splat7
+  // CHECK-NEXT:    store <6 x float> %13, <6 x float>* %1, align 4
+  // CHECK-NEXT:    ret void
+}
+
+typedef int llix9x3_t __attribute__((matrix_type(9, 3))); // NOTE(review): the name reads as a 'long long int' 9x3 matrix, but the element type is int (the expectations below use <27 x i32>) - confirm intent
+
+void add_matrix_scalar_ints(ix9x3_t a, llix9x3_t b, short vs, long int vli, unsigned long long int vulli) {
+  a = a + vs; // i16 scalar is sign-extended to the i32 element type
+  a = a + vli; // i64 scalar is truncated to the i32 element type
+  a = a + vulli; // i64 scalar is truncated to the i32 element type
+
+  // CHECK-LABEL: define void @add_matrix_scalar_ints(<27 x i32> %a, <27 x i32> %b, i16 signext %vs, i64 %vli, i64 %vulli)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %b.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %vs.addr = alloca i16, align 2
+  // CHECK-NEXT:    %vli.addr = alloca i64, align 8
+  // CHECK-NEXT:    %vulli.addr = alloca i64, align 8
+  // CHECK-NEXT:    %0 = bitcast [27 x i32]* %a.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %a, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %1 = bitcast [27 x i32]* %b.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %b, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    store i16 %vs, i16* %vs.addr, align 2
+  // CHECK-NEXT:    store i64 %vli, i64* %vli.addr, align 8
+  // CHECK-NEXT:    store i64 %vulli, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %2 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %3 = load i16, i16* %vs.addr, align 2
+  // CHECK-NEXT:    %conv = sext i16 %3 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert = insertelement <27 x i32> undef, i32 %conv, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat = shufflevector <27 x i32> %scalar.splat.splatinsert, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %4 = add <27 x i32> %2, %scalar.splat.splat
+  // CHECK-NEXT:    store <27 x i32> %4, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %5 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %6 = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:    %conv1 = trunc i64 %6 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert2 = insertelement <27 x i32> undef, i32 %conv1, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat3 = shufflevector <27 x i32> %scalar.splat.splatinsert2, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %7 = add <27 x i32> %5, %scalar.splat.splat3
+  // CHECK-NEXT:    store <27 x i32> %7, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %8 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %9 = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %conv4 = trunc i64 %9 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert5 = insertelement <27 x i32> undef, i32 %conv4, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat6 = shufflevector <27 x i32> %scalar.splat.splatinsert5, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %10 = add <27 x i32> %8, %scalar.splat.splat6
+  // CHECK-NEXT:    store <27 x i32> %10, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %11 = load i16, i16* %vs.addr, align 2
+
+  b = vs + b; // scalar on the left: the splat becomes the first add operand
+  b = vli + b; // scalar on the left: the splat becomes the first add operand
+  b = vulli + b; // scalar on the left: the splat becomes the first add operand
+
+  // CHECK-NEXT:    %conv7 = sext i16 %11 to i32
+  // CHECK-NEXT:    %12 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert8 = insertelement <27 x i32> undef, i32 %conv7, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat9 = shufflevector <27 x i32> %scalar.splat.splatinsert8, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %13 = add <27 x i32> %scalar.splat.splat9, %12
+  // CHECK-NEXT:    store <27 x i32> %13, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %14 = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:    %conv10 = trunc i64 %14 to i32
+  // CHECK-NEXT:    %15 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert11 = insertelement <27 x i32> undef, i32 %conv10, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat12 = shufflevector <27 x i32> %scalar.splat.splatinsert11, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %16 = add <27 x i32> %scalar.splat.splat12, %15
+  // CHECK-NEXT:    store <27 x i32> %16, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %17 = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %conv13 = trunc i64 %17 to i32
+  // CHECK-NEXT:    %18 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert14 = insertelement <27 x i32> undef, i32 %conv13, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat15 = shufflevector <27 x i32> %scalar.splat.splatinsert14, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %19 = add <27 x i32> %scalar.splat.splat15, %18
+  // CHECK-NEXT:    store <27 x i32> %19, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    ret void
+}
+
+void sub_matrix_matrix(dx5x5_t a, dx5x5_t b, dx5x5_t c, ix9x3_t ai, ix9x3_t bi, ix9x3_t ci) {
+  a = b - c; // double elements: emitted as one vector fsub
+  ai = bi - ci; // int elements: emitted as one vector sub
+
+  // CHECK-LABEL: @sub_matrix_matrix(
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %c.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %ai.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %bi.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %ci.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [25 x double]* %b.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %b, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %2 = bitcast [25 x double]* %c.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %c, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %3 = bitcast [27 x i32]* %ai.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %ai, <27 x i32>* %3, align 4
+  // CHECK-NEXT:    %4 = bitcast [27 x i32]* %bi.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %bi, <27 x i32>* %4, align 4
+  // CHECK-NEXT:    %5 = bitcast [27 x i32]* %ci.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %ci, <27 x i32>* %5, align 4
+  // CHECK-NEXT:    %6 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT:    %7 = load <25 x double>, <25 x double>* %2, align 8
+  // CHECK-NEXT:    %8 = fsub <25 x double> %6, %7
+  // CHECK-NEXT:    store <25 x double> %8, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %9 = load <27 x i32>, <27 x i32>* %4, align 4
+  // CHECK-NEXT:    %10 = load <27 x i32>, <27 x i32>* %5, align 4
+  // CHECK-NEXT:    %11 = sub <27 x i32> %9, %10
+  // CHECK-NEXT:    store <27 x i32> %11, <27 x i32>* %3, align 4
+  // CHECK-NEXT:    ret void
+}
+
+void sub_matrix_scalar_float(dx5x5_t a, fx2x3_t b, float vf, double vd) {
+  a = a - vf; // float scalar is extended to the double element type, then splatted
+  a = a - vd; // scalar already has the element type: splat only
+
+  // CHECK-LABEL: define void @sub_matrix_scalar_float(<25 x double> %a, <6 x float> %b, float %vf, double %vd)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [25 x double], align 8
+  // CHECK-NEXT:    %b.addr = alloca [6 x float], align 4
+  // CHECK-NEXT:    %vf.addr = alloca float, align 4
+  // CHECK-NEXT:    %vd.addr = alloca double, align 8
+  // CHECK-NEXT:    %0 = bitcast [25 x double]* %a.addr to <25 x double>*
+  // CHECK-NEXT:    store <25 x double> %a, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %1 = bitcast [6 x float]* %b.addr to <6 x float>*
+  // CHECK-NEXT:    store <6 x float> %b, <6 x float>* %1, align 4
+  // CHECK-NEXT:    store float %vf, float* %vf.addr, align 4
+  // CHECK-NEXT:    store double %vd, double* %vd.addr, align 8
+  // CHECK-NEXT:    %2 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %3 = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:    %conv = fpext float %3 to double
+  // CHECK-NEXT:    %scalar.splat.splatinsert = insertelement <25 x double> undef, double %conv, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat = shufflevector <25 x double> %scalar.splat.splatinsert, <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:    %4 = fsub <25 x double> %2, %scalar.splat.splat
+  // CHECK-NEXT:    store <25 x double> %4, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %5 = load <25 x double>, <25 x double>* %0, align 8
+  // CHECK-NEXT:    %6 = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:    %scalar.splat.splatinsert1 = insertelement <25 x double> undef, double %6, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat2 = shufflevector <25 x double> %scalar.splat.splatinsert1, <25 x double> undef, <25 x i32> zeroinitializer
+  // CHECK-NEXT:    %7 = fsub <25 x double> %5, %scalar.splat.splat2
+  // CHECK-NEXT:    store <25 x double> %7, <25 x double>* %0, align 8
+
+  b = b - vf; // scalar already has the element type: splat only
+  b = b - vd; // double scalar is truncated to the float element type, then splatted
+
+  // CHECK-NEXT:    %8 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %9 = load float, float* %vf.addr, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert3 = insertelement <6 x float> undef, float %9, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat4 = shufflevector <6 x float> %scalar.splat.splatinsert3, <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    %10 = fsub <6 x float> %8, %scalar.splat.splat4
+  // CHECK-NEXT:    store <6 x float> %10, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %11 = load <6 x float>, <6 x float>* %1, align 4
+  // CHECK-NEXT:    %12 = load double, double* %vd.addr, align 8
+  // CHECK-NEXT:    %conv5 = fptrunc double %12 to float
+  // CHECK-NEXT:    %scalar.splat.splatinsert6 = insertelement <6 x float> undef, float %conv5, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat7 = shufflevector <6 x float> %scalar.splat.splatinsert6, <6 x float> undef, <6 x i32> zeroinitializer
+  // CHECK-NEXT:    %13 = fsub <6 x float> %11, %scalar.splat.splat7
+  // CHECK-NEXT:    store <6 x float> %13, <6 x float>* %1, align 4
+  // CHECK-NEXT:    ret void
+}
+
+void sub_matrix_scalar_ints(ix9x3_t a, llix9x3_t b, short vs, long int vli, unsigned long long int vulli) {
+  a = a - vs; // i16 scalar is sign-extended to the i32 element type
+  a = a - vli; // i64 scalar is truncated to the i32 element type
+  a = a - vulli; // i64 scalar is truncated to the i32 element type
+
+  // CHECK-LABEL: define void @sub_matrix_scalar_ints(<27 x i32> %a, <27 x i32> %b, i16 signext %vs, i64 %vli, i64 %vulli)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %a.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %b.addr = alloca [27 x i32], align 4
+  // CHECK-NEXT:    %vs.addr = alloca i16, align 2
+  // CHECK-NEXT:    %vli.addr = alloca i64, align 8
+  // CHECK-NEXT:    %vulli.addr = alloca i64, align 8
+  // CHECK-NEXT:    %0 = bitcast [27 x i32]* %a.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %a, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %1 = bitcast [27 x i32]* %b.addr to <27 x i32>*
+  // CHECK-NEXT:    store <27 x i32> %b, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    store i16 %vs, i16* %vs.addr, align 2
+  // CHECK-NEXT:    store i64 %vli, i64* %vli.addr, align 8
+  // CHECK-NEXT:    store i64 %vulli, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %2 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %3 = load i16, i16* %vs.addr, align 2
+  // CHECK-NEXT:    %conv = sext i16 %3 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert = insertelement <27 x i32> undef, i32 %conv, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat = shufflevector <27 x i32> %scalar.splat.splatinsert, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %4 = sub <27 x i32> %2, %scalar.splat.splat
+  // CHECK-NEXT:    store <27 x i32> %4, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %5 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %6 = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:    %conv1 = trunc i64 %6 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert2 = insertelement <27 x i32> undef, i32 %conv1, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat3 = shufflevector <27 x i32> %scalar.splat.splatinsert2, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %7 = sub <27 x i32> %5, %scalar.splat.splat3
+  // CHECK-NEXT:    store <27 x i32> %7, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %8 = load <27 x i32>, <27 x i32>* %0, align 4
+  // CHECK-NEXT:    %9 = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %conv4 = trunc i64 %9 to i32
+  // CHECK-NEXT:    %scalar.splat.splatinsert5 = insertelement <27 x i32> undef, i32 %conv4, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat6 = shufflevector <27 x i32> %scalar.splat.splatinsert5, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %10 = sub <27 x i32> %8, %scalar.splat.splat6
+  // CHECK-NEXT:    store <27 x i32> %10, <27 x i32>* %0, align 4
+
+  b = vs - b; // scalar on the left: the splat is the value subtracted from
+  b = vli - b; // scalar on the left: the splat is the value subtracted from
+  b = vulli - b; // scalar on the left: the splat is the value subtracted from
+
+  // CHECK-NEXT:    %11 = load i16, i16* %vs.addr, align 2
+  // CHECK-NEXT:    %conv7 = sext i16 %11 to i32
+  // CHECK-NEXT:    %12 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert8 = insertelement <27 x i32> undef, i32 %conv7, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat9 = shufflevector <27 x i32> %scalar.splat.splatinsert8, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %13 = sub <27 x i32> %scalar.splat.splat9, %12
+  // CHECK-NEXT:    store <27 x i32> %13, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %14 = load i64, i64* %vli.addr, align 8
+  // CHECK-NEXT:    %conv10 = trunc i64 %14 to i32
+  // CHECK-NEXT:    %15 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert11 = insertelement <27 x i32> undef, i32 %conv10, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat12 = shufflevector <27 x i32> %scalar.splat.splatinsert11, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %16 = sub <27 x i32> %scalar.splat.splat12, %15
+  // CHECK-NEXT:    store <27 x i32> %16, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %17 = load i64, i64* %vulli.addr, align 8
+  // CHECK-NEXT:    %conv13 = trunc i64 %17 to i32
+  // CHECK-NEXT:    %18 = load <27 x i32>, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    %scalar.splat.splatinsert14 = insertelement <27 x i32> undef, i32 %conv13, i32 0
+  // CHECK-NEXT:    %scalar.splat.splat15 = shufflevector <27 x i32> %scalar.splat.splatinsert14, <27 x i32> undef, <27 x i32> zeroinitializer
+  // CHECK-NEXT:    %19 = sub <27 x i32> %scalar.splat.splat15, %18
+  // CHECK-NEXT:    store <27 x i32> %19, <27 x i32>* %1, align 4
+  // CHECK-NEXT:    ret void
+}
diff --git a/clang/test/CodeGenCXX/matrix-type-operators.cpp b/clang/test/CodeGenCXX/matrix-type-operators.cpp
--- a/clang/test/CodeGenCXX/matrix-type-operators.cpp
+++ b/clang/test/CodeGenCXX/matrix-type-operators.cpp
@@ -209,3 +209,79 @@
   Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
   unsigned v1 = extract(Mat1);
 }
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t add(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy0, R0, C0> &B) {
+  return A.value + B.value; // both operands share one matrix type, so this is a plain matrix '+'
+}
+
+void test_add_template() {
+  // CHECK-LABEL:    define void @_Z17test_add_templatev()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix.1, align 4
+  // CHECK-NEXT:    %Mat2 = alloca %struct.MyMatrix.1, align 4
+  // CHECK-NEXT:    %call = call <10 x float> @_Z3addIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix.1* dereferenceable(40) %Mat1, %struct.MyMatrix.1* dereferenceable(40) %Mat2)
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %0 = bitcast [10 x float]* %value to <10 x float>*
+  // CHECK-NEXT:    store <10 x float> %call, <10 x float>* %0, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr <10 x float> @_Z3addIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix.1* dereferenceable(40) %A, %struct.MyMatrix.1* dereferenceable(40) %B)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %A.addr = alloca %struct.MyMatrix.1*, align 8
+  // CHECK-NEXT:    %B.addr = alloca %struct.MyMatrix.1*, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix.1* %A, %struct.MyMatrix.1** %A.addr, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix.1* %B, %struct.MyMatrix.1** %B.addr, align 8
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix.1*, %struct.MyMatrix.1** %A.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [10 x float]* %value to <10 x float>*
+  // CHECK-NEXT:    %2 = load <10 x float>, <10 x float>* %1, align 4
+  // CHECK-NEXT:    %3 = load %struct.MyMatrix.1*, %struct.MyMatrix.1** %B.addr, align 8
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %3, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [10 x float]* %value1 to <10 x float>*
+  // CHECK-NEXT:    %5 = load <10 x float>, <10 x float>* %4, align 4
+  // CHECK-NEXT:    %6 = fadd <10 x float> %2, %5
+  // CHECK-NEXT:    ret <10 x float> %6
+
+  MyMatrix<float, 2, 5> Mat1;
+  MyMatrix<float, 2, 5> Mat2;
+  Mat1.value = add(Mat1, Mat2); // instantiates add<float, 2, 5>, whose body is matched above
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0>
+typename MyMatrix<EltTy0, R0, C0>::matrix_t subtract(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy0, R0, C0> &B) {
+  return A.value - B.value; // both operands share one matrix type, so this is a plain matrix '-'
+}
+
+void test_subtract_template() {
+  // CHECK-LABEL: define void @_Z22test_subtract_templatev()
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %Mat1 = alloca %struct.MyMatrix.1, align 4
+  // CHECK-NEXT:    %Mat2 = alloca %struct.MyMatrix.1, align 4
+  // CHECK-NEXT:    %call = call <10 x float> @_Z8subtractIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix.1* dereferenceable(40) %Mat1, %struct.MyMatrix.1* dereferenceable(40) %Mat2)
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %Mat1, i32 0, i32 0
+  // CHECK-NEXT:    %0 = bitcast [10 x float]* %value to <10 x float>*
+  // CHECK-NEXT:    store <10 x float> %call, <10 x float>* %0, align 4
+  // CHECK-NEXT:    ret void
+
+  // CHECK-LABEL: define linkonce_odr <10 x float> @_Z8subtractIfLj2ELj5EEN8MyMatrixIT_XT0_EXT1_EE8matrix_tERS2_S4_(%struct.MyMatrix.1* dereferenceable(40) %A, %struct.MyMatrix.1* dereferenceable(40) %B)
+  // CHECK-NEXT:  entry:
+  // CHECK-NEXT:    %A.addr = alloca %struct.MyMatrix.1*, align 8
+  // CHECK-NEXT:    %B.addr = alloca %struct.MyMatrix.1*, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix.1* %A, %struct.MyMatrix.1** %A.addr, align 8
+  // CHECK-NEXT:    store %struct.MyMatrix.1* %B, %struct.MyMatrix.1** %B.addr, align 8
+  // CHECK-NEXT:    %0 = load %struct.MyMatrix.1*, %struct.MyMatrix.1** %A.addr, align 8
+  // CHECK-NEXT:    %value = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %0, i32 0, i32 0
+  // CHECK-NEXT:    %1 = bitcast [10 x float]* %value to <10 x float>*
+  // CHECK-NEXT:    %2 = load <10 x float>, <10 x float>* %1, align 4
+  // CHECK-NEXT:    %3 = load %struct.MyMatrix.1*, %struct.MyMatrix.1** %B.addr, align 8
+  // CHECK-NEXT:    %value1 = getelementptr inbounds %struct.MyMatrix.1, %struct.MyMatrix.1* %3, i32 0, i32 0
+  // CHECK-NEXT:    %4 = bitcast [10 x float]* %value1 to <10 x float>*
+  // CHECK-NEXT:    %5 = load <10 x float>, <10 x float>* %4, align 4
+  // CHECK-NEXT:    %6 = fsub <10 x float> %2, %5
+  // CHECK-NEXT:    ret <10 x float> %6
+
+  MyMatrix<float, 2, 5> Mat1;
+  MyMatrix<float, 2, 5> Mat2;
+  Mat1.value = subtract(Mat1, Mat2); // instantiates subtract<float, 2, 5>, whose body is matched above
+}
diff --git a/clang/test/Sema/matrix-type-operators.c b/clang/test/Sema/matrix-type-operators.c
--- a/clang/test/Sema/matrix-type-operators.c
+++ b/clang/test/Sema/matrix-type-operators.c
@@ -65,3 +65,32 @@
   float v12 = a[3];
   // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
 }
+
+typedef float sx10x5_t __attribute__((matrix_type(10, 5)));
+typedef float sx10x10_t __attribute__((matrix_type(10, 10)));
+
+void add(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
+  a = b + c; // 5x10 + 10x5: the two matrix operand types differ
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t' (aka 'float __attribute__((matrix_type(10, 5)))'))}}
+
+  a = b + b; // expected-error {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = 10 + b; // the scalar operand is accepted; only the assignment to the 10x10 'a' is diagnosed
+  // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = b + &c; // a pointer operand is rejected
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
+}
+
+void sub(sx10x10_t a, sx5x10_t b, sx10x5_t c) {
+  a = b - c; // 5x10 - 10x5: the two matrix operand types differ
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t' (aka 'float __attribute__((matrix_type(10, 5)))'))}}
+
+  a = b - b; // expected-error {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = 10 - b; // the scalar operand is accepted; only the assignment to the 10x10 'a' is diagnosed
+  // expected-error@-1 {{assigning to 'sx10x10_t' (aka 'float __attribute__((matrix_type(10, 10)))') from incompatible type 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))')}}
+
+  a = b - &c; // a pointer operand is rejected
+  // expected-error@-1 {{invalid operands to binary expression ('sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') and 'sx10x5_t *' (aka 'float  __attribute__((matrix_type(10, 5)))*'))}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-operators.cpp b/clang/test/SemaCXX/matrix-type-operators.cpp
--- a/clang/test/SemaCXX/matrix-type-operators.cpp
+++ b/clang/test/SemaCXX/matrix-type-operators.cpp
@@ -66,3 +66,66 @@
   a[2] = f;
   // expected-error@-1 {{single subscript expressions are not allowed for matrix values}}
 }
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix { // Wrapper whose matrix element type and dimensions come from the template arguments.
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+  // Sole data member; the function templates below use it as the operand of the matrix operators.
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t add(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy1, R1, C1> &B) {
+  char *v1 = A.value + B.value; // Diagnosed in every instantiation below: a matching-type sum cannot initialize a 'char *', mismatched operands are rejected outright.
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-3 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+  // The same sum is formed again for the return value, so the two mismatched instantiations are diagnosed here as well.
+  return A.value + B.value;
+  // expected-error@-1 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+}
+
+void test_add_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  unsigned v1 = add<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1); // Operand types agree (2x2 unsigned); the matrix result cannot initialize an 'unsigned'.
+  // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-note@-2 {{in instantiation of function template specialization 'add<unsigned int, 2, 2, unsigned int, 2, 2, unsigned int, 2, 2>' requested here}}
+  // 2x2 + 3x3 unsigned operands: the errors are reported inside add; this call site only receives the instantiation note.
+  Mat1.value = add<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'add<unsigned int, 2, 2, unsigned int, 3, 3, unsigned int, 2, 2>' requested here}}
+  // 3x3 unsigned + 2x2 float operands: again the errors are reported inside add and only the note lands here.
+  Mat1.value = add<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note@-1 {{in instantiation of function template specialization 'add<unsigned int, 3, 3, float, 2, 2, unsigned int, 2, 2>' requested here}}
+}
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1, typename EltTy2, unsigned R2, unsigned C2>
+typename MyMatrix<EltTy2, R2, C2>::matrix_t subtract(MyMatrix<EltTy0, R0, C0> &A, MyMatrix<EltTy1, R1, C1> &B) {
+  char *v1 = A.value - B.value; // Diagnosed in every instantiation below: a matching-type difference cannot initialize a 'char *', mismatched operands are rejected outright.
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-3 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+  // The same difference is formed again for the return value, so the two mismatched instantiations are diagnosed here as well.
+  return A.value - B.value;
+  // expected-error@-1 {{invalid operands to binary expression ('MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))') and 'MyMatrix<float, 2, 2>::matrix_t' (aka 'float __attribute__((matrix_type(2, 2)))'))}}
+  // expected-error@-2 {{invalid operands to binary expression ('MyMatrix<unsigned int, 2, 2>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))') and 'MyMatrix<unsigned int, 3, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(3, 3)))'))}}
+}
+
+void test_subtract_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 2> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  MyMatrix<float, 2, 2> Mat3;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  unsigned v1 = subtract<unsigned, 2, 2, unsigned, 2, 2, unsigned, 2, 2>(Mat1, Mat1); // Operand types agree (2x2 unsigned); the matrix result cannot initialize an 'unsigned'.
+  // expected-error@-1 {{cannot initialize a variable of type 'unsigned int' with an rvalue of type 'typename MyMatrix<unsigned int, 2U, 2U>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 2)))')}}
+  // expected-note@-2 {{in instantiation of function template specialization 'subtract<unsigned int, 2, 2, unsigned int, 2, 2, unsigned int, 2, 2>' requested here}}
+  // 2x2 - 3x3 unsigned operands: the errors are reported inside subtract; this call site only receives the instantiation note.
+  Mat1.value = subtract<unsigned, 2, 2, unsigned, 3, 3, unsigned, 2, 2>(Mat1, Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'subtract<unsigned int, 2, 2, unsigned int, 3, 3, unsigned int, 2, 2>' requested here}}
+  // 3x3 unsigned - 2x2 float operands: again the errors are reported inside subtract and only the note lands here.
+  Mat1.value = subtract<unsigned, 3, 3, float, 2, 2, unsigned, 2, 2>(Mat2, Mat3);
+  // expected-note@-1 {{in instantiation of function template specialization 'subtract<unsigned int, 3, 3, float, 2, 2, unsigned int, 2, 2>' requested here}}
+}
diff --git a/llvm/include/llvm/IR/MatrixBuilder.h b/llvm/include/llvm/IR/MatrixBuilder.h
--- a/llvm/include/llvm/IR/MatrixBuilder.h
+++ b/llvm/include/llvm/IR/MatrixBuilder.h
@@ -127,6 +127,16 @@
   /// Add matrixes \p LHS and \p RHS. Support both integer and floating point
   /// matrixes.
   Value *CreateAdd(Value *LHS, Value *RHS) {
+    assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy());
+    if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy())
+      RHS = B.CreateVectorSplat(
+          cast<VectorType>(LHS->getType())->getNumElements(), RHS,
+          "scalar.splat");
+    else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy())
+      LHS = B.CreateVectorSplat(
+          cast<VectorType>(RHS->getType())->getNumElements(), LHS,
+          "scalar.splat");
+
     return cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->isFloatingPointTy()
@@ -137,6 +147,16 @@
   /// Subtract matrixes \p LHS and \p RHS. Support both integer and floating
   /// point matrixes.
   Value *CreateSub(Value *LHS, Value *RHS) {
+    assert(LHS->getType()->isVectorTy() || RHS->getType()->isVectorTy());
+    if (LHS->getType()->isVectorTy() && !RHS->getType()->isVectorTy())
+      RHS = B.CreateVectorSplat(
+          cast<VectorType>(LHS->getType())->getNumElements(), RHS,
+          "scalar.splat");
+    else if (!LHS->getType()->isVectorTy() && RHS->getType()->isVectorTy())
+      LHS = B.CreateVectorSplat(
+          cast<VectorType>(RHS->getType())->getNumElements(), LHS,
+          "scalar.splat");
+
     return cast<VectorType>(LHS->getType())
                    ->getElementType()
                    ->isFloatingPointTy()