diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -575,6 +575,8 @@
 BUILTIN(__builtin_alloca_with_align, "v*zIz", "Fn")
 BUILTIN(__builtin_call_with_static_chain, "v.", "nt")
 
+BUILTIN(__builtin_matrix_transpose, "v.", "nFt")
+
 // "Overloaded" Atomic operator builtins. These are overloaded to support data
 // types of i8, i16, i32, i64, and i128. The front-end sees calls to the
 // non-suffixed version of these (which has a bogus type) and transforms them to
diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td
--- a/clang/include/clang/Basic/DiagnosticSemaKinds.td
+++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td
@@ -10754,6 +10754,9 @@
 def err_matrix_incomplete_idx: Error<
   "single subscript expressions are not allowed for matrix values">;
 
+def err_builtin_matrix_arg: Error<
+  "%select{first|second}0 argument must be a matrix">;
+
 def err_preserve_field_info_not_field : Error<
   "__builtin_preserve_field_info argument %0 not a field access">;
 def err_preserve_field_info_not_const: Error<
diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h
--- a/clang/include/clang/Sema/Sema.h
+++ b/clang/include/clang/Sema/Sema.h
@@ -12082,6 +12082,13 @@
                                 int ArgNum, unsigned ExpectedFieldNum,
                                 bool AllowName);
   bool SemaBuiltinARMMemoryTaggingCall(unsigned BuiltinID, CallExpr *TheCall);
+
+  // Matrix builtin handling.
+  /// Type-check a call to __builtin_matrix_transpose and set the call's type
+  /// to the transposed matrix type of its single matrix argument.
+  ExprResult SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                ExprResult CallResult);
+
 public:
   enum FormatStringType {
       FST_Scanf,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -44,6 +44,7 @@
 #include "llvm/IR/IntrinsicsWebAssembly.h"
 #include "llvm/IR/IntrinsicsX86.h"
 #include "llvm/IR/MDBuilder.h"
+#include "llvm/IR/MatrixBuilder.h"
 #include "llvm/Support/ConvertUTF.h"
 #include "llvm/Support/ScopedPrinter.h"
 #include "llvm/Support/TargetParser.h"
@@ -1634,6 +1635,12 @@
   return RValue::get(Builder.CreateCall(F, { Src, Src, ShiftAmt }));
 }
 
+/// Return the canonical type of \p Ty as a ConstantMatrixType. \p Ty must be
+/// a constant matrix type; cast<> asserts otherwise.
+static const ConstantMatrixType *getMatrixTy(QualType Ty) {
+  return cast<ConstantMatrixType>(Ty.getCanonicalType());
+}
+
 RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID,
                                         const CallExpr *E,
                                         ReturnValueSlot ReturnValue) {
@@ -2371,6 +2378,17 @@
     return RValue::get(Builder.CreateZExt(V, ConvertType(E->getType())));
   }
 
+  case Builtin::BI__builtin_matrix_transpose: {
+    // Emit a matrix transpose of the operand, passing the operand's row and
+    // column counts to the MatrixBuilder.
+    const ConstantMatrixType *MatrixTy = getMatrixTy(E->getArg(0)->getType());
+    Value *MatValue = EmitScalarExpr(E->getArg(0));
+    MatrixBuilder<CGBuilderTy> MB(Builder);
+    Value *Result = MB.CreateMatrixTranspose(MatValue, MatrixTy->getNumRows(),
+                                             MatrixTy->getNumColumns());
+    return RValue::get(Result);
+  }
+
   case Builtin::BIfinite:
   case Builtin::BI__finite:
   case Builtin::BIfinitef:
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -1895,7 +1895,7 @@
       return ExprError();
     break;
   case Builtin::BI__builtin_frame_address:
-  case Builtin::BI__builtin_return_address:
+  case Builtin::BI__builtin_return_address: {
     if (SemaBuiltinConstantArgRange(TheCall, 0, 0, 0xFFFF))
       return ExprError();
 
@@ -1912,6 +1912,21 @@
     break;
   }
 
+  case Builtin::BI__builtin_matrix_transpose:
+    // Matrix builtins are rejected unless the MatrixTypes language option is on.
+    if (!getLangOpts().MatrixTypes) {
+      Diag(TheCall->getBeginLoc(), diag::err_builtin_matrix_disabled);
+      return ExprError();
+    }
+
+    switch (BuiltinID) {
+    case Builtin::BI__builtin_matrix_transpose:
+      return SemaBuiltinMatrixTransposeOverload(TheCall, TheCallResult);
+    default:
+      llvm_unreachable("All matrix builtins should be handled here!");
+    }
+  }
+
   // Since the target specific builtins for each arch overlap, only check those
   // of the arch we are compiling for.
   if (Context.BuiltinInfo.isTSBuiltin(BuiltinID)) {
@@ -14769,3 +14784,42 @@
       rhs, std::bind(&Sema::AddPotentialMisalignedMembers, std::ref(*this), _1,
                      _2, _3, _4));
 }
+
+/// Type-check a call to __builtin_matrix_transpose and compute its result
+/// type.
+///
+/// The builtin takes exactly one argument of constant matrix type. On success
+/// the call's type is set to the argument's matrix type with rows and columns
+/// swapped and \p CallResult is returned; otherwise a diagnostic is emitted
+/// and ExprError() is returned.
+ExprResult Sema::SemaBuiltinMatrixTransposeOverload(CallExpr *TheCall,
+                                                    ExprResult CallResult) {
+  if (checkArgCount(*this, TheCall, 1))
+    return ExprError();
+
+  // Apply the standard lvalue conversion rather than hand-building a
+  // CK_LValueToRValue cast, so the argument gets the same checks and type
+  // adjustments as any other loaded operand.
+  ExprResult MatrixArg = DefaultLvalueConversion(TheCall->getArg(0));
+  if (MatrixArg.isInvalid())
+    return ExprError();
+  Expr *Matrix = MatrixArg.get();
+
+  const auto *MType = Matrix->getType()->getAs<ConstantMatrixType>();
+  if (!MType) {
+    Diag(Matrix->getBeginLoc(), diag::err_builtin_matrix_arg) << 0;
+    return ExprError();
+  }
+
+  // Create returned matrix type by swapping rows and columns of the argument
+  // matrix type.
+  QualType ResultType = Context.getConstantMatrixType(
+      MType->getElementType(), MType->getNumColumns(), MType->getNumRows());
+
+  // Change the return type to the type of the returned matrix.
+  TheCall->setType(ResultType);
+
+  // Update the call to use the (possibly converted) matrix argument.
+  TheCall->setArg(0, Matrix);
+  return CallResult;
+}
diff --git a/clang/test/CodeGen/matrix-type-builtins.c b/clang/test/CodeGen/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/matrix-type-builtins.c
@@ -0,0 +1,74 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+typedef double dx5x5_t __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+typedef float fx3x2_t __attribute__((matrix_type(3, 2)));
+typedef int ix20x4_t __attribute__((matrix_type(20, 4)));
+typedef int ix4x20_t __attribute__((matrix_type(4, 20)));
+typedef unsigned ux1x6_t __attribute__((matrix_type(1, 6)));
+
+void transpose1(dx5x5_t *a, fx3x2_t *b, ix20x4_t *c) {
+  // CHECK-LABEL: define void @transpose1([25 x double]* %a, [6 x float]* %b, [80 x i32]* %c)
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT: %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT: %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT: %c.addr = alloca [80 x i32]*, align 8
+  // CHECK-NEXT: %a_t = alloca [25 x double], align 8
+  // CHECK-NEXT: %b_t = alloca [6 x float], align 4
+  // CHECK-NEXT: %c_t = alloca [80 x i32], align 4
+  // CHECK-NEXT: store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT: store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT: store [80 x i32]* %c, [80 x i32]** %c.addr, align 8
+  // CHECK-NEXT: %0 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT: %1 = bitcast [25 x double]* %0 to <25 x double>*
+  // CHECK-NEXT: %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT: %3 = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> %2, i32 5, i32 5)
+  // CHECK-NEXT: %4 = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT: store <25 x double> %3, <25 x double>* %4, align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+
+  // CHECK-NEXT: %5 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT: %6 = bitcast [6 x float]* %5 to <6 x float>*
+  // CHECK-NEXT: %7 = load <6 x float>, <6 x float>* %6, align 4
+  // CHECK-NEXT: %8 = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %7, i32 3, i32 2)
+  // CHECK-NEXT: %9 = bitcast [6 x float]* %b_t to <6 x float>*
+  // CHECK-NEXT: store <6 x float> %8, <6 x float>* %9, align 4
+  fx2x3_t b_t = __builtin_matrix_transpose(*b);
+
+  // CHECK-NEXT: %10 = load [80 x i32]*, [80 x i32]** %c.addr, align 8
+  // CHECK-NEXT: %11 = bitcast [80 x i32]* %10 to <80 x i32>*
+  // CHECK-NEXT: %12 = load <80 x i32>, <80 x i32>* %11, align 4
+  // CHECK-NEXT: %13 = call <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32> %12, i32 20, i32 4)
+  // CHECK-NEXT: %14 = bitcast [80 x i32]* %c_t to <80 x i32>*
+  // CHECK-NEXT: store <80 x i32> %13, <80 x i32>* %14, align 4
+  // CHECK-NEXT: ret void
+  ix4x20_t c_t = __builtin_matrix_transpose(*c);
+}
+
+// CHECK: declare <25 x double> @llvm.matrix.transpose.v25f64(<25 x double>, i32 immarg, i32 immarg)
+// CHECK: declare <6 x float> @llvm.matrix.transpose.v6f32(<6 x float>, i32 immarg, i32 immarg)
+// CHECK: declare <80 x i32> @llvm.matrix.transpose.v80i32(<80 x i32>, i32 immarg, i32 immarg)
+
+struct Foo {
+  ux1x6_t matrix;
+};
+
+void transpose2(struct Foo F) {
+  // CHECK-LABEL: define void @transpose2(%struct.Foo* byval(%struct.Foo) align 8 %F) #0 {
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT: %Res = alloca [6 x i32], align 4
+  // CHECK-NEXT: %matrix = getelementptr inbounds %struct.Foo, %struct.Foo* %F, i32 0, i32 0
+  // CHECK-NEXT: %0 = bitcast [6 x i32]* %matrix to <6 x i32>*
+  // CHECK-NEXT: %1 = load <6 x i32>, <6 x i32>* %0, align 8
+  // CHECK-NEXT: %2 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %1, i32 1, i32 6)
+  // CHECK-NEXT: %3 = call <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32> %2, i32 6, i32 1)
+  // CHECK-NEXT: %4 = bitcast [6 x i32]* %Res to <6 x i32>*
+  // CHECK-NEXT: store <6 x i32> %3, <6 x i32>* %4, align 4
+  // CHECK-NEXT: ret void
+
+  ux1x6_t Res = __builtin_matrix_transpose(__builtin_matrix_transpose(F.matrix));
+}
+
+// CHECK: declare <6 x i32> @llvm.matrix.transpose.v6i32(<6 x i32>, i32 immarg, i32 immarg)
diff --git a/clang/test/CodeGenCXX/matrix-type-builtins.cpp b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGenCXX/matrix-type-builtins.cpp
@@ -0,0 +1,124 @@
+// RUN: %clang_cc1 -fenable-matrix -triple x86_64-apple-darwin %s -emit-llvm -disable-llvm-passes -o - -std=c++17 | FileCheck %s
+
+// Tests for the matrix type builtins.
+
+using dx5x5_t = double __attribute__((matrix_type(5, 5)));
+typedef float fx2x3_t __attribute__((matrix_type(2, 3)));
+
+void transpose1(dx5x5_t *a, fx2x3_t *b) {
+  // CHECK-LABEL: define void @_Z10transpose1PU11matrix_typeLm5ELm5EdPU11matrix_typeLm2ELm3Ef([25 x double]* %a, [6 x float]* %b)
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT: %a.addr = alloca [25 x double]*, align 8
+  // CHECK-NEXT: %b.addr = alloca [6 x float]*, align 8
+  // CHECK-NEXT: %a_t = alloca [25 x double], align 8
+  // CHECK-NEXT: %b_t = alloca [6 x float], align 4
+  // CHECK-NEXT: store [25 x double]* %a, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT: store [6 x float]* %b, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT: %0 = load [25 x double]*, [25 x double]** %a.addr, align 8
+  // CHECK-NEXT: %1 = bitcast [25 x double]* %0 to <25 x double>*
+  // CHECK-NEXT: %2 = load <25 x double>, <25 x double>* %1, align 8
+  // CHECK-NEXT: %3 = call <25 x double> @llvm.matrix.transpose.v25f64(<25 x double> %2, i32 5, i32 5)
+  // CHECK-NEXT: %4 = bitcast [25 x double]* %a_t to <25 x double>*
+  // CHECK-NEXT: store <25 x double> %3, <25 x double>* %4, align 8
+  dx5x5_t a_t = __builtin_matrix_transpose(*a);
+
+  // CHECK-NEXT: %5 = load [6 x float]*, [6 x float]** %b.addr, align 8
+  // CHECK-NEXT: %6 = bitcast [6 x float]* %5 to <6 x float>*
+  // CHECK-NEXT: %7 = load <6 x float>, <6 x float>* %6, align 4
+  // CHECK-NEXT: %8 = call <6 x float> @llvm.matrix.transpose.v6f32(<6 x float> %7, i32 2, i32 3)
+  // CHECK-NEXT: %9 = bitcast [6 x float]* %b_t to <6 x float>*
+  // CHECK-NEXT: store <6 x float> %8, <6 x float>* %9, align 4
+  // CHECK-NEXT: ret void
+  auto b_t = __builtin_matrix_transpose(*b);
+}
+
+template <typename T, unsigned R, unsigned C>
+using matrix_t = T __attribute__((matrix_type(R, C)));
+
+void transpose2(matrix_t<float, 7, 4> &M1, matrix_t<unsigned, 6, 9> &M2) {
+  // CHECK-LABEL: define void @_Z10transpose2RU11matrix_typeLm7ELm4EfRU11matrix_typeLm6ELm9Ej([28 x float]* dereferenceable(112) %M1, [54 x i32]* dereferenceable(216) %M2) #0 {
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT: %M1.addr = alloca [28 x float]*, align 8
+  // CHECK-NEXT: %M2.addr = alloca [54 x i32]*, align 8
+  // CHECK-NEXT: %M1_t = alloca [28 x float], align 4
+  // CHECK-NEXT: %M2_t = alloca [54 x i32], align 4
+  // CHECK-NEXT: store [28 x float]* %M1, [28 x float]** %M1.addr, align 8
+  // CHECK-NEXT: store [54 x i32]* %M2, [54 x i32]** %M2.addr, align 8
+  // CHECK-NEXT: %0 = load [28 x float]*, [28 x float]** %M1.addr, align 8
+  // CHECK-NEXT: %1 = bitcast [28 x float]* %0 to <28 x float>*
+  // CHECK-NEXT: %2 = load <28 x float>, <28 x float>* %1, align 4
+  // CHECK-NEXT: %3 = call <28 x float> @llvm.matrix.transpose.v28f32(<28 x float> %2, i32 7, i32 4)
+  // CHECK-NEXT: %4 = bitcast [28 x float]* %M1_t to <28 x float>*
+  // CHECK-NEXT: store <28 x float> %3, <28 x float>* %4, align 4
+  matrix_t<float, 4, 7> M1_t = __builtin_matrix_transpose(M1);
+
+  // CHECK-NEXT: %5 = load [54 x i32]*, [54 x i32]** %M2.addr, align 8
+  // CHECK-NEXT: %6 = bitcast [54 x i32]* %5 to <54 x i32>*
+  // CHECK-NEXT: %7 = load <54 x i32>, <54 x i32>* %6, align 4
+  // CHECK-NEXT: %8 = call <54 x i32> @llvm.matrix.transpose.v54i32(<54 x i32> %7, i32 6, i32 9)
+  // CHECK-NEXT: %9 = bitcast [54 x i32]* %M2_t to <54 x i32>*
+  // CHECK-NEXT: store <54 x i32> %8, <54 x i32>* %9, align 4
+  // CHECK-NEXT: ret void
+  auto M2_t = __builtin_matrix_transpose(M2);
+}
+
+template <typename T, unsigned R, unsigned C>
+matrix_t<T, C, R> transpose(matrix_t<T, R, C> M) {
+  return __builtin_matrix_transpose(M);
+}
+
+void test_transpose_template() {
+  // CHECK-LABEL: define void @_Z23test_transpose_templatev() #2 {
+  // CHECK-NEXT: entry:
+  // CHECK-NEXT: %M1 = alloca [40 x i32], align 4
+  // CHECK-NEXT: %M1_t = alloca [40 x i32], align 4
+  // CHECK-NEXT: %M2 = alloca [42 x double], align 8
+  // CHECK-NEXT: %M2_t = alloca [42 x double], align 8
+  // CHECK-NEXT: %0 = bitcast [40 x i32]* %M1 to <40 x i32>*
+  // CHECK-NEXT: %1 = load <40 x i32>, <40 x i32>* %0, align 4
+  // CHECK-NEXT: %call = call <40 x i32> @_Z9transposeIiLj4ELj10EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<40 x i32> %1)
+  // CHECK-NEXT: %2 = bitcast [40 x i32]* %M1_t to <40 x i32>*
+  // CHECK-NEXT: store <40 x i32> %call, <40 x i32>* %2, align 4
+
+  matrix_t<int, 4, 10> M1;
+  matrix_t<int, 10, 4> M1_t = transpose(M1);
+
+  // CHECK-NEXT: %3 = bitcast [42 x double]* %M2 to <42 x double>*
+  // CHECK-NEXT: %4 = load <42 x double>, <42 x double>* %3, align 8
+  // CHECK-NEXT: %call1 = call <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %4)
+  // CHECK-NEXT: %call2 = call <42 x double> @_Z9transposeIdLj6ELj7EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %call1)
+  // CHECK-NEXT: %call3 = call <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %call2)
+  // CHECK-NEXT: %5 = bitcast [42 x double]* %M2_t to <42 x double>*
+  // CHECK-NEXT: store <42 x double> %call3, <42 x double>* %5, align 8
+  // CHECK-NEXT: ret void
+  matrix_t<double, 7, 6> M2;
+  matrix_t<double, 6, 7> M2_t = transpose(transpose(transpose(M2)));
+}
+
+// CHECK-LABEL: define linkonce_odr <40 x i32> @_Z9transposeIiLj4ELj10EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<40 x i32> %M)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %M.addr = alloca [40 x i32], align 4
+// CHECK-NEXT: %0 = bitcast [40 x i32]* %M.addr to <40 x i32>*
+// CHECK-NEXT: store <40 x i32> %M, <40 x i32>* %0, align 4
+// CHECK-NEXT: %1 = load <40 x i32>, <40 x i32>* %0, align 4
+// CHECK-NEXT: %2 = call <40 x i32> @llvm.matrix.transpose.v40i32(<40 x i32> %1, i32 4, i32 10)
+// CHECK-NEXT: ret <40 x i32> %2
+// CHECK-NEXT: }
+
+// CHECK-LABEL: define linkonce_odr <42 x double> @_Z9transposeIdLj7ELj6EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %M)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %M.addr = alloca [42 x double], align 8
+// CHECK-NEXT: %0 = bitcast [42 x double]* %M.addr to <42 x double>*
+// CHECK-NEXT: store <42 x double> %M, <42 x double>* %0, align 8
+// CHECK-NEXT: %1 = load <42 x double>, <42 x double>* %0, align 8
+// CHECK-NEXT: %2 = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> %1, i32 7, i32 6)
+// CHECK-NEXT: ret <42 x double> %2
+
+// CHECK-LABEL: define linkonce_odr <42 x double> @_Z9transposeIdLj6ELj7EEU11matrix_typeXT1_EXT0_ET_U11matrix_typeXT0_EXT1_ES0_(<42 x double> %M)
+// CHECK-NEXT: entry:
+// CHECK-NEXT: %M.addr = alloca [42 x double], align 8
+// CHECK-NEXT: %0 = bitcast [42 x double]* %M.addr to <42 x double>*
+// CHECK-NEXT: store <42 x double> %M, <42 x double>* %0, align 8
+// CHECK-NEXT: %1 = load <42 x double>, <42 x double>* %0, align 8
+// CHECK-NEXT: %2 = call <42 x double> @llvm.matrix.transpose.v42f64(<42 x double> %1, i32 6, i32 7)
+// CHECK-NEXT: ret <42 x double> %2
diff --git a/clang/test/Sema/matrix-type-builtins.c b/clang/test/Sema/matrix-type-builtins.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/matrix-type-builtins.c
@@ -0,0 +1,17 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -verify -triple=x86_64-apple-darwin9
+
+typedef float sx5x10_t __attribute__((matrix_type(5, 10)));
+typedef int ix3x2_t __attribute__((matrix_type(3, 2)));
+
+void transpose(sx5x10_t a, ix3x2_t b, int *c, int d) {
+  a = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'sx5x10_t' (aka 'float __attribute__((matrix_type(5, 10)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  b = __builtin_matrix_transpose(b);
+  // expected-error@-1 {{assigning to 'ix3x2_t' (aka 'int __attribute__((matrix_type(3, 2)))') from incompatible type 'int __attribute__((matrix_type(2, 3)))'}}
+  __builtin_matrix_transpose(c);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose(d);
+  // expected-error@-1 {{first argument must be a matrix}}
+  __builtin_matrix_transpose("test");
+  // expected-error@-1 {{first argument must be a matrix}}
+}
diff --git a/clang/test/SemaCXX/matrix-type-builtins.cpp b/clang/test/SemaCXX/matrix-type-builtins.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/SemaCXX/matrix-type-builtins.cpp
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 %s -fenable-matrix -pedantic -std=c++11 -verify -triple=x86_64-apple-darwin9
+
+template <typename EltTy, unsigned Rows, unsigned Columns>
+struct MyMatrix {
+  using matrix_t = EltTy __attribute__((matrix_type(Rows, Columns)));
+
+  matrix_t value;
+};
+
+template <typename EltTy0, unsigned R0, unsigned C0, typename EltTy1, unsigned R1, unsigned C1>
+typename MyMatrix<EltTy1, R1, C1>::matrix_t transpose(MyMatrix<EltTy0, R0, C0> &A) {
+  char *v1 = __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize a variable of type 'char *' with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+
+  __builtin_matrix_transpose(A);
+  // expected-error@-1 {{first argument must be a matrix}}
+  // expected-error@-2 {{first argument must be a matrix}}
+
+  return __builtin_matrix_transpose(A.value);
+  // expected-error@-1 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 2)))'}}
+  // expected-error@-2 {{cannot initialize return object of type 'typename MyMatrix<unsigned int, 2, 3>::matrix_t' (aka 'unsigned int __attribute__((matrix_type(2, 3)))') with an rvalue of type 'unsigned int __attribute__((matrix_type(3, 3)))'}}
+}
+
+void test_transpose_template(unsigned *Ptr1, float *Ptr2) {
+  MyMatrix<unsigned, 2, 3> Mat1;
+  MyMatrix<unsigned, 3, 3> Mat2;
+  Mat1.value = *((decltype(Mat1)::matrix_t *)Ptr1);
+  Mat1.value = transpose<unsigned, 2, 3, unsigned, 2, 3>(Mat1);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 2, 3, unsigned int, 2, 3>' requested here}}
+
+  Mat1.value = transpose<unsigned, 3, 3, unsigned, 2, 3>(Mat2);
+  // expected-note@-1 {{in instantiation of function template specialization 'transpose<unsigned int, 3, 3, unsigned int, 2, 3>' requested here}}
+}