diff --git a/SingleSource/UnitTests/CMakeLists.txt b/SingleSource/UnitTests/CMakeLists.txt --- a/SingleSource/UnitTests/CMakeLists.txt +++ b/SingleSource/UnitTests/CMakeLists.txt @@ -2,6 +2,7 @@ add_subdirectory(C++11) add_subdirectory(Float) +add_subdirectory(Matrix) add_subdirectory(SignlessTypes) add_subdirectory(Threads) add_subdirectory(Vector) diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/CMakeLists.txt b/SingleSource/UnitTests/Matrix/AMXINT8/CMakeLists.txt new file mode 100644 --- /dev/null +++ b/SingleSource/UnitTests/Matrix/AMXINT8/CMakeLists.txt @@ -0,0 +1,5 @@ +list(APPEND CPPFLAGS -I ${CMAKE_SOURCE_DIR}/${MATRIX_MAIN_DIR}) +list(APPEND LDFLAGS -lm) +list(APPEND CFLAGS "-march=sapphirerapids") +list(APPEND CFLAGS -fms-extensions) +llvm_singlesource(PREFIX "Matrix-AMXINT8-") diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/Makefile b/SingleSource/UnitTests/Matrix/AMXINT8/Makefile new file mode 100644 --- /dev/null +++ b/SingleSource/UnitTests/Matrix/AMXINT8/Makefile @@ -0,0 +1,11 @@ +# SingleSource/UnitTests/Matrix/AMXINT8/Makefile + +DIRS = +LEVEL = ../../../.. +CFLAGS += -fms-extensions -march=native -mamx-int8 -mamx-bf16 -I${SourceDir}/.. 
+LDFLAGS += -lm
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+
+TARGET_FLAGS += -march=native -mamx-int8 -mamx-bf16
+LCCFLAGS += -march=native -mamx-int8 -mamx-bf16
diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.cpp b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.cpp
new file mode 100755
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.cpp
@@ -0,0 +1,156 @@
+#include <immintrin.h>
+#include <assert.h>
+#include <string.h>
+#include <iostream>
+
+// Edge length, in 32-bit elements, of the square tiles used by inner_product2.
+#define TILE_SZ 16
+__attribute__((noinline))
+void inner_product2(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // C (M x N) = A (M x K) x B (K x N), one accumulator tile per output block.
+  const int m = M / TILE_SZ;
+  const int n = N / TILE_SZ;
+  const int k = K / TILE_SZ;
+  // Every dimension must be a whole number of tiles.
+  assert(!(M % TILE_SZ) && !(N % TILE_SZ) && !(K % TILE_SZ));
+  for (int i = 0; i < m; i++)
+    for (int j = 0; j < n; j++) {
+      __tile1024i c = {TILE_SZ, TILE_SZ*sizeof(int)};
+      __tile_zero(&c);
+      for (int l = 0; l < k; l++) {
+        __tile1024i a = {TILE_SZ, TILE_SZ*sizeof(int)};
+        __tile1024i b = {TILE_SZ, TILE_SZ*sizeof(int)};
+        __tile_loadd(&a, A_mem+(i*TILE_SZ)*K+l*TILE_SZ, K*sizeof(int));
+        __tile_loadd(&b, B_mem+(l*TILE_SZ)*N+j*TILE_SZ, N*sizeof(int));
+        __tile_dpbf16ps(&c, a, b);
+      }
+      __tile_stored(C_mem+(i*TILE_SZ)*N+j*TILE_SZ, N*sizeof(int), c); // C has N elements per row
+    }
+}
+// Tile dimensions and accumulator-grid size used by inner_product.
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 16
+#define M_ACC 2
+#define N_ACC 2
+#define KPACK 1
+typedef int type_t;
+typedef int res_type_t;
+// Wraps a __tile1024i initialised as {Row, Col*sizeof(int)}.
+template <int Row, int Col>
+class Tile {
+public:
+  __tile1024i& getTile() {
+    return tile;
+  }
+private:
+  __tile1024i tile {Row, Col*sizeof(int)};
+};
+// Zeroes the wrapped tile.
+template <int Row, int Col>
+static void tilezero(Tile<Row, Col> &tile) {
+  __tile_zero(&tile.getTile());
+}
+// Loads the wrapped tile from base; stride is passed through to __tile_loadd.
+template <int Row, int Col>
+static void tileload(Tile<Row, Col> &dst, const void *base, size_t stride) {
+  __tile_loadd(&dst.getTile(), base, stride);
+}
+// Accumulates the product of src1 and src2 into dst via __tile_dpbf16ps.
+template <int M, int N, int K>
+static void tdp(Tile<M, N> &dst, Tile<M, K> src1, Tile<K, N> src2) {
+  __tile_dpbf16ps(&dst.getTile(), src1.getTile(), src2.getTile());
+}
+// Stores the wrapped tile to base; stride is passed through to __tile_stored.
+template <int Row, int Col>
+void tilestore(Tile<Row, Col> &src, void *base, size_t stride) {
+  __tile_stored(base, stride, src.getTile());
+}
+// Blocked kernel: an M_ACC x N_ACC grid of accumulator tiles per (m, n) step.
+__attribute__((noinline))
+void inner_product(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // M and N must cover whole accumulator grids and K whole tiles.
+  assert(M%(TILE_M*M_ACC) == 0 && N%(TILE_N*N_ACC) == 0 && K%TILE_K == 0);
+  for (int n = 0; n < N; n += N_ACC*TILE_N) {
+    for (int m = 0; m < M; m += M_ACC*TILE_M) {
+      Tile<TILE_M, TILE_N> tC[M_ACC][N_ACC];
+      Tile<TILE_M, TILE_K> tA[M_ACC];
+      Tile<TILE_K, TILE_N> tB;
+
+      for (int n_acc = 0; n_acc < N_ACC; ++n_acc)
+        for (int m_acc = 0; m_acc < M_ACC; ++m_acc)
+          tilezero(tC[m_acc][n_acc]);
+
+      for (int k = 0; k < K; k += TILE_K) {
+        for (int n_acc = 0; n_acc < N_ACC; ++n_acc) {
+          tileload(tB, B_mem+k*N+n+n_acc*TILE_N, N*sizeof(type_t)*KPACK);
+          for (int m_acc = 0; m_acc < M_ACC; ++m_acc) {
+            if (n_acc == 0) // A tiles are loaded once per k step and reused for later n_acc
+              tileload(tA[m_acc], A_mem+(m + m_acc*TILE_M)*K+k, K*sizeof(type_t));
+            tdp(tC[m_acc][n_acc], tA[m_acc], tB);
+            if (k == K - TILE_K) { // final k step: this accumulator is complete
+              int mc = m + m_acc*TILE_M, nc = n + n_acc*TILE_N;
+              tilestore(tC[m_acc][n_acc], C_mem+mc*N+nc, N*sizeof(res_type_t));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+// Widens a bf16 bit pattern to float by placing it in the upper 16 bits.
+float make_fp32(short x)
+{
+  unsigned int y = (unsigned int)x << 16;
+  float res;
+  memcpy(&res, &y, sizeof(res)); // well-defined type pun
+  return res;
+}
+// Truncates a float to bf16 by keeping its upper 16 bits (no rounding).
+unsigned short make_bf16(float x)
+{
+  unsigned int bits;
+  memcpy(&bits, &x, sizeof(bits)); // well-defined type pun
+  return (unsigned short)(bits >> 16);
+}
+// Scalar reference: each int of A/B holds two bf16 values, C holds floats.
+void inner_product_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // C is accumulated in place, so it must be zero on entry.
+  for (int m = 0; m < M; m++)
+    for (int n = 0; n < N; n++) {
+      for (int k = 0; k < K; k++) {
+        short *va = (short *)(A_mem + m*K + k);
+        short *vb = (short *)(B_mem + k*N + n);
+        float acc = *((float*)(C_mem + m*N + n));
+        for (int i = 0; i < 2; i++) {
+          acc += (make_fp32(va[i]) * make_fp32(vb[i]));
+        }
+        *((float*)(C_mem + m*N + n))= acc;
+      }
+    }
+}
+
+#define SIZE 128
+int gA[SIZE][SIZE], gB[SIZE][SIZE], gC[SIZE][SIZE], gD[SIZE][SIZE];
+int main() {
+  for (int i = 0; i < SIZE; i++)
+    for (int j = 0; j < SIZE; j++) { // both 16-bit halves of an element get the same bf16 value
+      unsigned short *pAbh = (unsigned short *)&gA[i][j];
+      *pAbh = make_bf16(0.1f * (i+j));
+      *(pAbh+1) = make_bf16(0.1f * (i+j));
+      unsigned short *pBbh = (unsigned short *)&gB[i][j];
+      *pBbh = make_bf16(0.1f * (i+j));
+      *(pBbh+1) = make_bf16(0.1f * (i+j));
+    }
+  // gC holds the scalar reference; gD receives each AMX kernel's result in turn.
+  inner_product_ref((int *)gA, (int *)gB, (int *)gC, SIZE, SIZE, SIZE);
+  inner_product((int *)gA, (int *)gB, (int *)gD, SIZE, SIZE, SIZE);
+  int err_num = memcmp(gC, gD, sizeof(gC))?1:0;
+  inner_product2((int *)gA, (int *)gB, (int *)gD, SIZE, SIZE, SIZE);
+  err_num = memcmp(gC, gD, sizeof(gC))?(1+err_num):err_num;
+  if (err_num == 0)
+    std::cout << "PASSED\n";
+  else
+    std::cout << "FAILED\n";
+}
+
diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.reference_output b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.reference_output
new file mode 100755
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_bf16.reference_output
@@ -0,0 +1 @@
+exit 132
diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.cpp b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.cpp
new file mode 100755
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.cpp
@@ -0,0 +1,137 @@
+#include <immintrin.h>
+#include <assert.h>
+#include <string.h>
+#include <iostream>
+
+// Edge length, in 32-bit elements, of the square tiles used by inner_product2.
+#define TILE_SZ 16
+__attribute__((noinline))
+void inner_product2(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // C (M x N) = A (M x K) x B (K x N), one accumulator tile per output block.
+  const int m = M / TILE_SZ;
+  const int n = N / TILE_SZ;
+  const int k = K / TILE_SZ;
+  // Every dimension must be a whole number of tiles.
+  assert(!(M % TILE_SZ) && !(N % TILE_SZ) && !(K % TILE_SZ));
+  for (int i = 0; i < m; i++)
+    for (int j = 0; j < n; j++) {
+      __tile1024i c = {TILE_SZ, TILE_SZ*sizeof(int)};
+      __tile_zero(&c);
+      for (int l = 0; l < k; l++) {
+        __tile1024i a = {TILE_SZ, TILE_SZ*sizeof(int)};
+        __tile1024i b = {TILE_SZ, TILE_SZ*sizeof(int)};
+        __tile_loadd(&a, A_mem+(i*TILE_SZ)*K+l*TILE_SZ, K*sizeof(int));
+        __tile_loadd(&b, B_mem+(l*TILE_SZ)*N+j*TILE_SZ, N*sizeof(int));
+        __tile_dpbssd(&c, a, b);
+      }
+      __tile_stored(C_mem+(i*TILE_SZ)*N+j*TILE_SZ, N*sizeof(int), c); // C has N elements per row
+    }
+}
+// Tile dimensions and accumulator-grid size used by inner_product.
+#define TILE_M 16
+#define TILE_N 16
+#define TILE_K 16
+#define M_ACC 2
+#define N_ACC 2
+#define KPACK 1
+typedef int type_t;
+typedef int res_type_t;
+// Wraps a __tile1024i initialised as {Row, Col*sizeof(int)}.
+template <int Row, int Col>
+class Tile {
+public:
+  __tile1024i& getTile() {
+    return tile;
+  }
+private:
+  __tile1024i tile {Row, Col*sizeof(int)};
+};
+// Zeroes the wrapped tile.
+template <int Row, int Col>
+static void tilezero(Tile<Row, Col> &tile) {
+  __tile_zero(&tile.getTile());
+}
+// Loads the wrapped tile from base; stride is passed through to __tile_loadd.
+template <int Row, int Col>
+static void tileload(Tile<Row, Col> &dst, const void *base, size_t stride) {
+  __tile_loadd(&dst.getTile(), base, stride);
+}
+// Accumulates the product of src1 and src2 into dst via __tile_dpbssd.
+template <int M, int N, int K>
+static void tdp(Tile<M, N> &dst, Tile<M, K> src1, Tile<K, N> src2) {
+  __tile_dpbssd(&dst.getTile(), src1.getTile(), src2.getTile());
+}
+// Stores the wrapped tile to base; stride is passed through to __tile_stored.
+template <int Row, int Col>
+void tilestore(Tile<Row, Col> &src, void *base, size_t stride) {
+  __tile_stored(base, stride, src.getTile());
+}
+// Blocked kernel: an M_ACC x N_ACC grid of accumulator tiles per (m, n) step.
+__attribute__((noinline))
+void inner_product(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // M and N must cover whole accumulator grids and K whole tiles.
+  assert(M%(TILE_M*M_ACC) == 0 && N%(TILE_N*N_ACC) == 0 && K%TILE_K == 0);
+  for (int n = 0; n < N; n += N_ACC*TILE_N) {
+    for (int m = 0; m < M; m += M_ACC*TILE_M) {
+      Tile<TILE_M, TILE_N> tC[M_ACC][N_ACC];
+      Tile<TILE_M, TILE_K> tA[M_ACC];
+      Tile<TILE_K, TILE_N> tB;
+
+      for (int n_acc = 0; n_acc < N_ACC; ++n_acc)
+        for (int m_acc = 0; m_acc < M_ACC; ++m_acc)
+          tilezero(tC[m_acc][n_acc]);
+
+      for (int k = 0; k < K; k += TILE_K) {
+        for (int n_acc = 0; n_acc < N_ACC; ++n_acc) {
+          tileload(tB, B_mem+k*N+n+n_acc*TILE_N, N*sizeof(type_t)*KPACK);
+          for (int m_acc = 0; m_acc < M_ACC; ++m_acc) {
+            if (n_acc == 0) // A tiles are loaded once per k step and reused for later n_acc
+              tileload(tA[m_acc], A_mem+(m + m_acc*TILE_M)*K+k, K*sizeof(type_t));
+            tdp(tC[m_acc][n_acc], tA[m_acc], tB);
+            if (k == K - TILE_K) { // final k step: this accumulator is complete
+              int mc = m + m_acc*TILE_M, nc = n + n_acc*TILE_N;
+              tilestore(tC[m_acc][n_acc], C_mem+mc*N+nc, N*sizeof(res_type_t));
+            }
+          }
+        }
+      }
+    }
+  }
+}
+// Scalar reference: each int of A/B is read as four signed 8-bit values.
+void inner_product_ref(int *A_mem, int *B_mem, int *C_mem, int M, int N, int K) {
+  // C is accumulated in place, so it must be zero on entry.
+  for (int m = 0; m < M; m++)
+    for (int n = 0; n < N; n++) {
+      for (int k = 0; k < K; k++) {
+        signed char *va = (signed char *)(A_mem + m*K + k); // plain char may be unsigned
+        signed char *vb = (signed char *)(B_mem + k*N + n);
+        int acc = *(C_mem + m*N + n);
+        for (int i = 0; i < 4; i++) {
+          acc += (va[i] * vb[i]);
+        }
+        *(C_mem + m*N + n) = acc;
+      }
+    }
+}
+
+#define SIZE 128
+int gA[SIZE][SIZE], gB[SIZE][SIZE], gC[SIZE][SIZE], gD[SIZE][SIZE];
+int main() {
+  for (int i = 0; i < SIZE; i++)
+    for (int j = 0; j < SIZE; j++) { // unsigned math: the product exceeds INT_MAX for large i*j
+      gA[i][j] = (int)(0x01010101u * i * j);
+      gB[i][j] = (int)(0x01010101u * i * j);
+    }
+  // gC holds the scalar reference; gD receives each AMX kernel's result in turn.
+  inner_product_ref((int *)gA, (int *)gB, (int *)gC, SIZE, SIZE, SIZE);
+  inner_product((int *)gA, (int *)gB, (int *)gD, SIZE, SIZE, SIZE);
+  int err_num = memcmp(gC, gD, sizeof(gC))?1:0;
+  inner_product2((int *)gA, (int *)gB, (int *)gD, SIZE, SIZE, SIZE);
+  err_num = memcmp(gC, gD, sizeof(gC))?(1+err_num):err_num;
+  if (err_num == 0)
+    std::cout << "PASSED\n";
+  else
+    std::cout << "FAILED\n";
+}
+
diff --git a/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.reference_output b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.reference_output
new file mode 100755
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/AMXINT8/t_gemm_int8.reference_output
@@ -0,0 +1 @@
+exit 132
diff --git a/SingleSource/UnitTests/Matrix/CMakeLists.txt b/SingleSource/UnitTests/Matrix/CMakeLists.txt
new file mode 100644
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/CMakeLists.txt
@@ -0,0 +1,10 @@
+set(MATRIX_MAIN_DIR SingleSource/UnitTests/Matrix)
+
+if(CMAKE_C_COMPILER_ID STREQUAL "Clang")
+  if(ARCH STREQUAL "x86")
+    if(X86CPU_ARCH STREQUAL "skylake-avx512")
+      add_subdirectory(AMXINT8)
+    endif()
+  endif()
+endif()
+llvm_singlesource(PREFIX "Matrix-")
diff --git a/SingleSource/UnitTests/Matrix/Makefile b/SingleSource/UnitTests/Matrix/Makefile
new file mode 100644
--- /dev/null
+++ b/SingleSource/UnitTests/Matrix/Makefile
@@ -0,0 +1,18 @@
+# SingleSource/UnitTests/Matrix/Makefile
+LEVEL = ../../..
+
+include $(LEVEL)/Makefile.config
+
+DIRS =
+
+ifeq ($(CC_UNDER_TEST_IS_CLANG), 1)
+# FIXME: Once testing infrastructure runs on sapphirerapids, we should change the following line.
+# Currently we only test compilation results of AMX cases.
+ifeq ($(HAVE_X86_AVX512DQ_INSTRUCTIONS), 1)
+DIRS += AMXINT8
+endif
+endif
+
+include $(LEVEL)/SingleSource/Makefile.singlesrc
+