diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def
--- a/clang/include/clang/Basic/AArch64SVEACLETypes.def
+++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def
@@ -66,7 +66,7 @@
 SVE_VECTOR_TYPE("__SVFloat32_t", "__SVFloat32_t", SveFloat32, SveFloat32Ty, 4, 32, true, true, false)
 SVE_VECTOR_TYPE("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty, 2, 64, true, true, false)
-SVE_VECTOR_TYPE("__SVBFloat16_t", "__SVBFloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, false, false, true)
+SVE_VECTOR_TYPE("__SVBFloat16_t", "__SVBFloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, true, false, true)
 
 //
 // x2
 //
@@ -85,6 +85,7 @@
 SVE_VECTOR_TYPE("__clang_svfloat32x2_t", "svfloat32x2_t", SveFloat32x2, SveFloat32x2Ty, 8, 32, true, true, false)
 SVE_VECTOR_TYPE("__clang_svfloat64x2_t", "svfloat64x2_t", SveFloat64x2, SveFloat64x2Ty, 4, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x2_t", "svbfloat16x2_t", SveBFloat16x2, SveBFloat16x2Ty, 16, 16, true, false, true)
 
 //
 // x3
 //
@@ -102,6 +103,7 @@
 SVE_VECTOR_TYPE("__clang_svfloat32x3_t", "svfloat32x3_t", SveFloat32x3, SveFloat32x3Ty, 12, 32, true, true, false)
 SVE_VECTOR_TYPE("__clang_svfloat64x3_t", "svfloat64x3_t", SveFloat64x3, SveFloat64x3Ty, 6, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x3_t", "svbfloat16x3_t", SveBFloat16x3, SveBFloat16x3Ty, 24, 16, true, false, true)
 
 //
 // x4
 //
@@ -119,6 +121,8 @@
 SVE_VECTOR_TYPE("__clang_svfloat32x4_t", "svfloat32x4_t", SveFloat32x4, SveFloat32x4Ty, 16, 32, true, true, false)
 SVE_VECTOR_TYPE("__clang_svfloat64x4_t", "svfloat64x4_t", SveFloat64x4, SveFloat64x4Ty, 8, 64, true, true, false)
+SVE_VECTOR_TYPE("__clang_svbfloat16x4_t", "svbfloat16x4_t", SveBFloat16x4, SveBFloat16x4Ty, 32, 16, true, false, true)
+
 SVE_PREDICATE_TYPE("__SVBool_t", "__SVBool_t", SveBool, SveBoolTy, 16)
 
 #undef SVE_VECTOR_TYPE
 #undef SVE_PREDICATE_TYPE
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -480,15 +480,22 @@
 // Load one quadword and replicate (scalar base)
 def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
 
+multiclass StructLoad<string name, string proto, string i> {
+  def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
+  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def: SInst<name, proto, "b", MergeNone, i, [IsStructLoad]>;
+  }
+}
+
 // Load N-element structure into N vectors (scalar base)
-def SVLD2 : SInst<"svld2[_{2}]", "2Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld2", [IsStructLoad]>;
-def SVLD3 : SInst<"svld3[_{2}]", "3Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld3", [IsStructLoad]>;
-def SVLD4 : SInst<"svld4[_{2}]", "4Pc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld4", [IsStructLoad]>;
+defm SVLD2 : StructLoad<"svld2[_{2}]", "2Pc", "aarch64_sve_ld2">;
+defm SVLD3 : StructLoad<"svld3[_{2}]", "3Pc", "aarch64_sve_ld3">;
+defm SVLD4 : StructLoad<"svld4[_{2}]", "4Pc", "aarch64_sve_ld4">;
 
 // Load N-element structure into N vectors (scalar base, VL displacement)
-def SVLD2_VNUM : SInst<"svld2_vnum[_{2}]", "2Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld2", [IsStructLoad]>;
-def SVLD3_VNUM : SInst<"svld3_vnum[_{2}]", "3Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld3", [IsStructLoad]>;
-def SVLD4_VNUM : SInst<"svld4_vnum[_{2}]", "4Pcl", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld4", [IsStructLoad]>;
+defm SVLD2_VNUM : StructLoad<"svld2_vnum[_{2}]", "2Pcl", "aarch64_sve_ld2">;
+defm SVLD3_VNUM : StructLoad<"svld3_vnum[_{2}]", "3Pcl", "aarch64_sve_ld3">;
+defm SVLD4_VNUM : StructLoad<"svld4_vnum[_{2}]", "4Pcl", "aarch64_sve_ld4">;
 
 // Load one octoword and replicate (scalar base)
 let ArchGuard = "defined(__ARM_FEATURE_SVE_MATMUL_FP64)" in {
@@ -601,15 +608,21 @@
 def SVST1H_SCATTER_INDEX_S : MInst<"svst1h_scatter[_{2}base]_index[_{d}]", "vPuld", "ilUiUl", [IsScatterStore], MemEltTyInt16, "aarch64_sve_st1_scatter_scalar_offset">;
 def SVST1W_SCATTER_INDEX_S : MInst<"svst1w_scatter[_{2}base]_index[_{d}]", "vPuld", "lUl", [IsScatterStore], MemEltTyInt32, "aarch64_sve_st1_scatter_scalar_offset">;
 
+multiclass StructStore<string name, string proto, string i> {
+  def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructStore]>;
+  let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+    def: SInst<name, proto, "b", MergeNone, i, [IsStructStore]>;
+  }
+}
 // Store N vectors into N-element structure (scalar base)
-def SVST2 : SInst<"svst2[_{d}]", "vPp2", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st2", [IsStructStore]>;
-def SVST3 : SInst<"svst3[_{d}]", "vPp3", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st3", [IsStructStore]>;
-def SVST4 : SInst<"svst4[_{d}]", "vPp4", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st4", [IsStructStore]>;
+defm SVST2 : StructStore<"svst2[_{d}]", "vPp2", "aarch64_sve_st2">;
+defm SVST3 : StructStore<"svst3[_{d}]", "vPp3", "aarch64_sve_st3">;
+defm SVST4 : StructStore<"svst4[_{d}]", "vPp4", "aarch64_sve_st4">;
 
 // Store N vectors into N-element structure (scalar base, VL displacement)
-def SVST2_VNUM : SInst<"svst2_vnum[_{d}]", "vPpl2", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st2", [IsStructStore]>;
-def SVST3_VNUM : SInst<"svst3_vnum[_{d}]", "vPpl3", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st3", [IsStructStore]>;
-def SVST4_VNUM : SInst<"svst4_vnum[_{d}]", "vPpl4", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_st4", [IsStructStore]>;
+defm SVST2_VNUM : StructStore<"svst2_vnum[_{d}]", "vPpl2", "aarch64_sve_st2">;
+defm SVST3_VNUM : StructStore<"svst3_vnum[_{d}]", "vPpl3", "aarch64_sve_st3">;
+defm SVST4_VNUM : StructStore<"svst4_vnum[_{d}]", "vPpl4", "aarch64_sve_st4">;
 
 // Store one vector, with no truncation, non-temporal (scalar base)
 def SVSTNT1 : MInst<"svstnt1[_{d}]", "vPpd", "csilUcUsUiUlhfd", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp
--- a/clang/lib/CodeGen/CodeGenTypes.cpp
+++ b/clang/lib/CodeGen/CodeGenTypes.cpp
@@ -619,6 +619,12 @@
     return GET_SVE_FP_VEC(DoubleTy, false, 8);
   case BuiltinType::SveBFloat16:
     return GET_SVE_FP_VEC(BFloat16Ty, false, 8);
+  case BuiltinType::SveBFloat16x2:
+    return GET_SVE_FP_VEC(BFloat16Ty, false, 16);
+  case BuiltinType::SveBFloat16x3:
+    return GET_SVE_FP_VEC(BFloat16Ty, false, 24);
+  case BuiltinType::SveBFloat16x4:
+    return GET_SVE_FP_VEC(BFloat16Ty, false, 32);
 #undef GET_SVE_FP_VEC
   case BuiltinType::Dependent:
 #define BUILTIN_TYPE(Id, SingletonId)
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld2-bfloat.c
@@ -0,0 +1,32 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16x2_t test_svld2_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svld2_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret <vscale x 16 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld2,_bf16,,)(pg, base);
+}
+
+
+svbfloat16x2_t test_svld2_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svld2_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+  // CHECK: %[[LOAD:.*]] = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret <vscale x 16 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld2_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld3-bfloat.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16x3_t test_svld3_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svld3_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret <vscale x 24 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld3,_bf16,,)(pg, base);
+}
+
+svbfloat16x3_t test_svld3_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svld3_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+  // CHECK: %[[LOAD:.*]] = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret <vscale x 24 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld3_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld4-bfloat.c
@@ -0,0 +1,31 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16x4_t test_svld4_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svld4_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret <vscale x 32 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld4,_bf16,,)(pg, base);
+}
+
+svbfloat16x4_t test_svld4_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svld4_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BASE:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BASE]], i64 %vnum, i64 0
+  // CHECK: %[[LOAD:.*]] = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret <vscale x 32 x bfloat> %[[LOAD]]
+  return SVE_ACLE_FUNC(svld4_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st2-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+void test_svst2_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x2_t data)
+{
+  // CHECK-LABEL: test_svst2_bf16
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst2,_bf16,,)(pg, base, data);
+}
+
+void test_svst2_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x2_t data)
+{
+  // CHECK-LABEL: test_svst2_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv16bf16(<vscale x 16 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst2_vnum,_bf16,,)(pg, base, vnum, data);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st3-bfloat.c
@@ -0,0 +1,37 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst3_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x3_t data)
+{
+  // CHECK-LABEL: test_svst3_bf16
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 2)
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst3,_bf16,,)(pg, base, data);
+}
+
+void test_svst3_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x3_t data)
+{
+  // CHECK-LABEL: test_svst3_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv24bf16(<vscale x 24 x bfloat> %data, i32 2)
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst3_vnum,_bf16,,)(pg, base, vnum, data);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_st4-bfloat.c
@@ -0,0 +1,39 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_SVE_BF16 -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+void test_svst4_bf16(svbool_t pg, bfloat16_t *base, svbfloat16x4_t data)
+{
+  // CHECK-LABEL: test_svst4_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 2)
+  // CHECK-DAG: %[[V3:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 3)
+  // CHECK: call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x bfloat> %[[V3]], <vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst4,_bf16,,)(pg, base, data);
+}
+
+void test_svst4_vnum_bf16(svbool_t pg, bfloat16_t *base, int64_t vnum, svbfloat16x4_t data)
+{
+  // CHECK-LABEL: test_svst4_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[V0:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 0)
+  // CHECK-DAG: %[[V1:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 1)
+  // CHECK-DAG: %[[V2:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 2)
+  // CHECK-DAG: %[[V3:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.tuple.get.nxv8bf16.nxv32bf16(<vscale x 32 x bfloat> %data, i32 3)
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %[[V0]], <vscale x 8 x bfloat> %[[V1]], <vscale x 8 x bfloat> %[[V2]], <vscale x 8 x bfloat> %[[V3]], <vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK-NEXT: ret
+  return SVE_ACLE_FUNC(svst4_vnum,_bf16,,)(pg, base, vnum, data);
+}
diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp
--- a/clang/utils/TableGen/SveEmitter.cpp
+++ b/clang/utils/TableGen/SveEmitter.cpp
@@ -539,6 +539,7 @@
   case 'b':
     Signed = false;
    Float = false;
+    BFloat = false;
     ElementBitwidth /= 4;
     break;
   case 'o':
@@ -568,18 +569,21 @@
   case '@':
     Signed = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth /= 4;
     NumVectors = 0;
     break;
   case 'K':
     Signed = true;
     Float = false;
+    BFloat = false;
     Bitwidth = ElementBitwidth;
     NumVectors = 0;
     break;
   case 'L':
     Signed = false;
     Float = false;
+    BFloat = false;
     Bitwidth = ElementBitwidth;
     NumVectors = 0;
     break;
@@ -587,15 +591,18 @@
     Predicate = false;
     Signed = false;
     Float = false;
+    BFloat = false;
     break;
   case 'x':
     Predicate = false;
     Signed = true;
     Float = false;
+    BFloat = false;
     break;
   case 'i':
     Predicate = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 64;
     NumVectors = 0;
     Signed = false;
@@ -604,6 +611,7 @@
   case 'I':
     Predicate = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 32;
     NumVectors = 0;
     Signed = true;
@@ -613,6 +621,7 @@
   case 'J':
     Predicate = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 32;
     NumVectors = 0;
     Signed = true;
@@ -623,6 +632,7 @@
     Predicate = false;
     Signed = true;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 32;
     NumVectors = 0;
     break;
@@ -630,6 +640,7 @@
     Predicate = false;
     Signed = true;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 64;
     NumVectors = 0;
     break;
@@ -637,6 +648,7 @@
     Predicate = false;
     Signed = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = Bitwidth = 32;
     NumVectors = 0;
     break;
@@ -663,16 +675,19 @@
   case 'g':
     Signed = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = 64;
     break;
   case 't':
     Signed = true;
     Float = false;
+    BFloat = false;
     ElementBitwidth = 32;
     break;
   case 'z':
     Signed = false;
     Float = false;
+    BFloat = false;
     ElementBitwidth = 32;
     break;
   case 'O':
@@ -1142,6 +1157,12 @@
  OS << "typedef __clang_svfloat64x4_t svfloat64x4_t;\n";
   OS << "typedef __SVBool_t svbool_t;\n\n";
 
+  OS << "#ifdef __ARM_FEATURE_SVE_BF16\n";
+  OS << "typedef __clang_svbfloat16x2_t svbfloat16x2_t;\n";
+  OS << "typedef __clang_svbfloat16x3_t svbfloat16x3_t;\n";
+  OS << "typedef __clang_svbfloat16x4_t svbfloat16x4_t;\n";
+  OS << "#endif\n";
+
   OS << "typedef enum\n";
   OS << "{\n";
   OS << "  SV_POW2 = 0,\n";
diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
--- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
+++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp
@@ -4853,6 +4853,9 @@
     case clang::BuiltinType::SveUint64x4:
     case clang::BuiltinType::SveFloat16:
     case clang::BuiltinType::SveBFloat16:
+    case clang::BuiltinType::SveBFloat16x2:
+    case clang::BuiltinType::SveBFloat16x3:
+    case clang::BuiltinType::SveBFloat16x4:
     case clang::BuiltinType::SveFloat16x2:
     case clang::BuiltinType::SveFloat16x3:
     case clang::BuiltinType::SveFloat16x4:
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -3992,7 +3992,8 @@
     if (VT == MVT::nxv16i8) {
       SelectPredicatedStore(Node, 2, AArch64::ST2B, AArch64::ST2B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
       SelectPredicatedStore(Node, 2, AArch64::ST2H, AArch64::ST2H_IMM);
       return;
@@ -4012,7 +4013,8 @@
     if (VT == MVT::nxv16i8) {
      SelectPredicatedStore(Node, 3, AArch64::ST3B, AArch64::ST3B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
       SelectPredicatedStore(Node, 3, AArch64::ST3H, AArch64::ST3H_IMM);
       return;
@@ -4032,7 +4034,8 @@
     if (VT == MVT::nxv16i8) {
       SelectPredicatedStore(Node, 4, AArch64::ST4B, AArch64::ST4B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
       SelectPredicatedStore(Node, 4, AArch64::ST4H, AArch64::ST4H_IMM);
       return;
@@ -4644,7 +4647,8 @@
     if (VT == MVT::nxv16i8) {
       SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
       SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
       return;
     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4660,7 +4664,8 @@
     if (VT == MVT::nxv16i8) {
       SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
       SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
       return;
     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
@@ -4676,7 +4681,8 @@
     if (VT == MVT::nxv16i8) {
       SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
       return;
-    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16) {
+    } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+               (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
      SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
       return;
     } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 -asm-verbose=0 < %s | FileCheck %s
 
 ;
 ; LD1RQB
@@ -284,6 +284,14 @@
   ret <vscale x 16 x half> %res
 }
 
+define <vscale x 16 x bfloat> @ld2h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld2h_bf16:
+; CHECK: ld2h { z0.h, z1.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+  ret <vscale x 16 x bfloat> %res
+}
+
 ;
 ; LD2W
 ;
@@ -356,6 +364,14 @@
   ret <vscale x 24 x half> %res
 }
 
+define <vscale x 24 x bfloat> @ld3h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld3h_bf16:
+; CHECK: ld3h { z0.h, z1.h, z2.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+  ret <vscale x 24 x bfloat> %res
+}
+
 ;
 ; LD3W
 ;
@@ -428,6 +444,14 @@
   ret <vscale x 32 x half> %res
 }
 
+define <vscale x 32 x bfloat> @ld4h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld4h_bf16:
+; CHECK: ld4h { z0.h, z1.h, z2.h, z3.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+  ret <vscale x 32 x bfloat> %res
+}
+
 ;
 ; LD4W
 ;
@@ -490,6 +514,7 @@
 declare <vscale x 8 x i32> @llvm.aarch64.sve.ld2.nxv8i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
 declare <vscale x 4 x i64> @llvm.aarch64.sve.ld2.nxv4i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 16 x half> @llvm.aarch64.sve.ld2.nxv16f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 16 x bfloat> @llvm.aarch64.sve.ld2.nxv16bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 8 x float> @llvm.aarch64.sve.ld2.nxv8f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
 declare <vscale x 4 x double> @llvm.aarch64.sve.ld2.nxv4f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
 
@@ -498,6 +523,7 @@
 declare <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
 declare <vscale x 6 x i64> @llvm.aarch64.sve.ld3.nxv6i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 24 x half> @llvm.aarch64.sve.ld3.nxv24f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 24 x bfloat> @llvm.aarch64.sve.ld3.nxv24bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 12 x float> @llvm.aarch64.sve.ld3.nxv12f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
 declare <vscale x 6 x double> @llvm.aarch64.sve.ld3.nxv6f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
 
@@ -506,5 +532,6 @@
 declare <vscale x 16 x i32> @llvm.aarch64.sve.ld4.nxv16i32.nxv4i1.p0i32(<vscale x 4 x i1>, i32*)
 declare <vscale x 8 x i64> @llvm.aarch64.sve.ld4.nxv8i64.nxv2i1.p0i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 32 x half> @llvm.aarch64.sve.ld4.nxv32f16.nxv8i1.p0f16(<vscale x 8 x i1>, half*)
+declare <vscale x 32 x bfloat> @llvm.aarch64.sve.ld4.nxv32bf16.nxv8i1.p0bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 16 x float> @llvm.aarch64.sve.ld4.nxv16f32.nxv4i1.p0f32(<vscale x 4 x i1>, float*)
 declare <vscale x 8 x double> @llvm.aarch64.sve.ld4.nxv8f64.nxv2i1.p0f64(<vscale x 2 x i1>, double*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-stores.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve < %s 2>%t | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve,+bf16 < %s 2>%t | FileCheck %s
 ; RUN: FileCheck --check-prefix=WARN --allow-empty %s <%t
 
 ; WARN-NOT: warning
@@ -44,6 +44,17 @@
   ret void
 }
 
+define void @st2h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st2h_bf16:
+; CHECK: st2h { z0.h, z1.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat> %v0,
+                                           <vscale x 8 x bfloat> %v1,
+                                           <vscale x 8 x i1> %pred,
+                                           bfloat* %addr)
+  ret void
+}
+
 ;
 ; ST2W
 ;
@@ -140,6 +151,18 @@
   ret void
 }
 
+define void @st3h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st3h_bf16:
+; CHECK: st3h { z0.h, z1.h, z2.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat> %v0,
+                                           <vscale x 8 x bfloat> %v1,
+                                           <vscale x 8 x bfloat> %v2,
+                                           <vscale x 8 x i1> %pred,
+                                           bfloat* %addr)
+  ret void
+}
+
 ;
 ; ST3W
 ;
@@ -243,6 +266,19 @@
   ret void
 }
 
+define void @st4h_bf16(<vscale x 8 x bfloat> %v0, <vscale x 8 x bfloat> %v1, <vscale x 8 x bfloat> %v2, <vscale x 8 x bfloat> %v3, <vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: st4h_bf16:
+; CHECK: st4h { z0.h, z1.h, z2.h, z3.h }, p0, [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat> %v0,
+                                           <vscale x 8 x bfloat> %v1,
+                                           <vscale x 8 x bfloat> %v2,
+                                           <vscale x 8 x bfloat> %v3,
+                                           <vscale x 8 x i1> %pred,
+                                           bfloat* %addr)
+  ret void
+}
+
 ;
 ; ST4W
 ;
@@ -395,6 +431,7 @@
 declare void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.st2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 declare void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
 declare void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
 
@@ -403,6 +440,7 @@
 declare void @llvm.aarch64.sve.st3.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.st3.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.st3.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st3.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
 declare void @llvm.aarch64.sve.st3.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
 declare void @llvm.aarch64.sve.st3.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)
 
@@ -411,6 +449,7 @@
 declare void @llvm.aarch64.sve.st4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i1>, i32*)
 declare void @llvm.aarch64.sve.st4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i1>, i64*)
 declare void @llvm.aarch64.sve.st4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x i1>, half*)
+declare void @llvm.aarch64.sve.st4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x i1>, bfloat*)
declare void @llvm.aarch64.sve.st4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x i1>, float*)
 declare void @llvm.aarch64.sve.st4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x i1>, double*)