diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -271,6 +271,11 @@
 def SVLD1SW : MInst<"svld1sw_{d}", "dPU", "lUl", [IsLoad],               MemEltTyInt32, "aarch64_sve_ld1">;
 def SVLD1UW : MInst<"svld1uw_{d}", "dPY", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ld1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVLD1_BF      : MInst<"svld1[_{2}]",      "dPc",  "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
+  def SVLD1_VNUM_BF : MInst<"svld1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
+}
+
 // Load one vector (scalar base, VL displacement)
 def SVLD1_VNUM   : MInst<"svld1_vnum[_{2}]",   "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ld1">;
 def SVLD1SB_VNUM : MInst<"svld1sb_vnum_{d}",   "dPSl", "silUsUiUl",       [IsLoad], MemEltTyInt8,    "aarch64_sve_ld1">;
@@ -376,6 +381,11 @@
 def SVLDFF1SW_VNUM : MInst<"svldff1sw_vnum_{d}", "dPUl", "lUl", [IsLoad],               MemEltTyInt32, "aarch64_sve_ldff1">;
 def SVLDFF1UW_VNUM : MInst<"svldff1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldff1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVLDFF1_BF      : MInst<"svldff1[_{2}]",      "dPc",  "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
+  def SVLDFF1_VNUM_BF : MInst<"svldff1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldff1">;
+}
+
 // First-faulting load one vector (vector base)
 def SVLDFF1_GATHER_BASES_U   : MInst<"svldff1_gather[_{2}base]_{d}",   "dPu", "ilUiUlfd", [IsGatherLoad], MemEltTyDefault, "aarch64_sve_ldff1_gather_scalar_offset">;
 def SVLDFF1SB_GATHER_BASES_U : MInst<"svldff1sb_gather[_{2}base]_{d}", "dPu", "ilUiUl",   [IsGatherLoad], MemEltTyInt8,    "aarch64_sve_ldff1_gather_scalar_offset">;
@@ -471,15 +481,29 @@
 def SVLDNF1SW_VNUM : MInst<"svldnf1sw_vnum_{d}", "dPUl", "lUl", [IsLoad],               MemEltTyInt32, "aarch64_sve_ldnf1">;
 def SVLDNF1UW_VNUM : MInst<"svldnf1uw_vnum_{d}", "dPYl", "lUl", [IsLoad, IsZExtReturn], MemEltTyInt32, "aarch64_sve_ldnf1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVLDNF1_BF      : MInst<"svldnf1[_{2}]",      "dPc",  "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
+  def SVLDNF1_VNUM_BF : MInst<"svldnf1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnf1">;
+}
+
 // Load one vector, unextended load, non-temporal (scalar base)
 def SVLDNT1 : MInst<"svldnt1[_{2}]", "dPc", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
 
 // Load one vector, unextended load, non-temporal (scalar base, VL displacement)
 def SVLDNT1_VNUM : MInst<"svldnt1_vnum[_{2}]", "dPcl", "csilUcUsUiUlhfd", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVLDNT1_BF      : MInst<"svldnt1[_{2}]",      "dPc",  "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+  def SVLDNT1_VNUM_BF : MInst<"svldnt1_vnum[_{2}]", "dPcl", "b", [IsLoad], MemEltTyDefault, "aarch64_sve_ldnt1">;
+}
+
 // Load one quadword and replicate (scalar base)
 def SVLD1RQ : SInst<"svld1rq[_{2}]", "dPc", "csilUcUsUiUlhfd", MergeNone, "aarch64_sve_ld1rq">;
 
+let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
+  def SVLD1RQ_BF : SInst<"svld1rq[_{2}]", "dPc", "b", MergeNone, "aarch64_sve_ld1rq">;
+}
+
 multiclass StructLoad<string name, string proto, string i> {
   def : SInst<name, proto, "csilUcUsUiUlhfd", MergeNone, i, [IsStructLoad]>;
   let ArchGuard = "defined(__ARM_FEATURE_SVE_BF16)" in {
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svld1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svld1_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svld1_bf16'}}
+  return SVE_ACLE_FUNC(svld1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svld1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svld1_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svld1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svld1_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ld1rq-bfloat.c
@@ -0,0 +1,22 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
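+// For example, SVE_ACLE_FUNC(svld1rq,_bf16,,) below expands to svld1rq_bf16 by
+// default, and to the overloaded name svld1rq when SVE_OVERLOADED_FORMS is set.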
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svld1rq_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svld1rq_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[INTRINSIC:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret <vscale x 8 x bfloat> %[[INTRINSIC]]
+  // expected-warning@+1 {{implicit declaration of function 'svld1rq_bf16'}}
+  return SVE_ACLE_FUNC(svld1rq,_bf16,,)(pg, base);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldff1-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
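+// LDFF1 is SVE's first-faulting load: only the first active element can raise
+// a fault; later elements that would fault instead clear their FFR bits.
+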
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldff1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svldff1_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldff1_bf16'}}
+  return SVE_ACLE_FUNC(svldff1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldff1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svldff1_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldff1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svldff1_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnf1-bfloat.c
@@ -0,0 +1,33 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+#include <arm_sve.h>
+
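+// LDNF1 is SVE's non-faulting load: no element raises a fault; the FFR records
+// which leading elements were actually loaded.
+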
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldnf1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svldnf1_bf16
+  // CHECK: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldnf1_bf16'}}
+  return SVE_ACLE_FUNC(svldnf1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldnf1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svldnf1_vnum_bf16
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldnf1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svldnf1_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_ldnt1-bfloat.c
@@ -0,0 +1,34 @@
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -D__ARM_FEATURE_SVE_BF16 -DSVE_OVERLOADED_FORMS -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -emit-llvm -o - %s | FileCheck %s
+// RUN: %clang_cc1 -D__ARM_FEATURE_SVE -D__ARM_FEATURE_BF16_SCALAR_ARITHMETIC -triple aarch64-none-linux-gnu -target-feature +sve -target-feature +bf16 -fallow-half-arguments-and-returns -fsyntax-only -verify -verify-ignore-unexpected=error -verify-ignore-unexpected=note %s
+
+#include <arm_sve.h>
+
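+// LDNT1 is SVE's non-temporal load: it hints that the loaded data is unlikely
+// to be reused soon, so implementations can avoid displacing cache contents.
+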
+#ifdef SVE_OVERLOADED_FORMS
+// A simple used,unused... macro, long enough to represent any SVE builtin.
+#define SVE_ACLE_FUNC(A1,A2_UNUSED,A3,A4_UNUSED) A1##A3
+#else
+#define SVE_ACLE_FUNC(A1,A2,A3,A4) A1##A2##A3##A4
+#endif
+
+svbfloat16_t test_svldnt1_bf16(svbool_t pg, const bfloat16_t *base)
+{
+  // CHECK-LABEL: test_svldnt1_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %base)
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldnt1_bf16'}}
+  return SVE_ACLE_FUNC(svldnt1,_bf16,,)(pg, base);
+}
+
+svbfloat16_t test_svldnt1_vnum_bf16(svbool_t pg, const bfloat16_t *base, int64_t vnum)
+{
+  // CHECK-LABEL: test_svldnt1_vnum_bf16
+  // CHECK-DAG: %[[PG:.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %pg)
+  // CHECK-DAG: %[[BITCAST:.*]] = bitcast bfloat* %base to <vscale x 8 x bfloat>*
+  // CHECK-DAG: %[[GEP:.*]] = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %[[BITCAST]], i64 %vnum, i64 0
+  // CHECK: %[[LOAD:.*]] = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %[[PG]], bfloat* %[[GEP]])
+  // CHECK: ret <vscale x 8 x bfloat> %[[LOAD]]
+  // expected-warning@+1 {{implicit declaration of function 'svldnt1_vnum_bf16'}}
+  return SVE_ACLE_FUNC(svldnt1_vnum,_bf16,,)(pg, base, vnum);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1518,10 +1518,11 @@
   defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
 
   // 8-element contiguous loads
-  defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8,  LD1B_H,  LD1B_H_IMM,  am_sve_regreg_lsl0>;
-  defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
-  defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load,   LD1H,    LD1H_IMM,    am_sve_regreg_lsl1>;
-  defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load,   LD1H,    LD1H_IMM,    am_sve_regreg_lsl1>;
+  defm : pred_load<nxv8i16,  nxv8i1, zext_masked_load_i8,  LD1B_H,  LD1B_H_IMM,  am_sve_regreg_lsl0>;
+  defm : pred_load<nxv8i16,  nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+  defm : pred_load<nxv8i16,  nxv8i1, nonext_masked_load,   LD1H,    LD1H_IMM,    am_sve_regreg_lsl1>;
+  defm : pred_load<nxv8f16,  nxv8i1, nonext_masked_load,   LD1H,    LD1H_IMM,    am_sve_regreg_lsl1>;
+  defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load,   LD1H,    LD1H_IMM,    am_sve_regreg_lsl1>;
 
   // 16-element contiguous loads
   defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
@@ -1704,10 +1705,11 @@
   defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
 
   // 8-element contiguous loads
-  defm : ld1<LD1B_H,  LD1B_H_IMM,  nxv8i16, AArch64ld1,  nxv8i1, nxv8i8,  am_sve_regreg_lsl0>;
-  defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s, nxv8i1, nxv8i8,  am_sve_regreg_lsl0>;
-  defm : ld1<LD1H,    LD1H_IMM,    nxv8i16, AArch64ld1,  nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
-  defm : ld1<LD1H,    LD1H_IMM,    nxv8f16, AArch64ld1,  nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+  defm : ld1<LD1B_H,  LD1B_H_IMM,  nxv8i16,  AArch64ld1,  nxv8i1, nxv8i8,   am_sve_regreg_lsl0>;
+  defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16,  AArch64ld1s, nxv8i1, nxv8i8,   am_sve_regreg_lsl0>;
+  defm : ld1<LD1H,    LD1H_IMM,    nxv8i16,  AArch64ld1,  nxv8i1, nxv8i16,  am_sve_regreg_lsl1>;
+  defm : ld1<LD1H,    LD1H_IMM,    nxv8f16,  AArch64ld1,  nxv8i1, nxv8f16,  am_sve_regreg_lsl1>;
+  defm : ld1<LD1H,    LD1H_IMM,    nxv8bf16, AArch64ld1,  nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
 
   // 16-element contiguous loads
   defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
@@ -1725,31 +1727,32 @@
 }
 
   // 2-element contiguous non-faulting loads
-  defm : ldnf1<LDNF1B_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i8>;
-  defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i8>;
-  defm : ldnf1<LDNF1H_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i16>;
-  defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i16>;
-  defm : ldnf1<LDNF1W_D_IMM,  nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i32>;
-  defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s, nxv2i1, nxv2i32>;
-  defm : ldnf1<LDNF1D_IMM,    nxv2i64, AArch64ldnf1,  nxv2i1, nxv2i64>;
-  defm : ldnf1<LDNF1D_IMM,    nxv2f64, AArch64ldnf1,  nxv2i1, nxv2f64>;
+  defm : ldnf1<LDNF1B_D_IMM,  nxv2i64,  AArch64ldnf1,  nxv2i1, nxv2i8>;
+  defm : ldnf1<LDNF1SB_D_IMM, nxv2i64,  AArch64ldnf1s, nxv2i1, nxv2i8>;
+  defm : ldnf1<LDNF1H_D_IMM,  nxv2i64,  AArch64ldnf1,  nxv2i1, nxv2i16>;
+  defm : ldnf1<LDNF1SH_D_IMM, nxv2i64,  AArch64ldnf1s, nxv2i1, nxv2i16>;
+  defm : ldnf1<LDNF1W_D_IMM,  nxv2i64,  AArch64ldnf1,  nxv2i1, nxv2i32>;
+  defm : ldnf1<LDNF1SW_D_IMM, nxv2i64,  AArch64ldnf1s, nxv2i1, nxv2i32>;
+  defm : ldnf1<LDNF1D_IMM,    nxv2i64,  AArch64ldnf1,  nxv2i1, nxv2i64>;
+  defm : ldnf1<LDNF1D_IMM,    nxv2f64,  AArch64ldnf1,  nxv2i1, nxv2f64>;
 
   // 4-element contiguous non-faulting loads
-  defm : ldnf1<LDNF1B_S_IMM,  nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i8>;
-  defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i8>;
-  defm : ldnf1<LDNF1H_S_IMM,  nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i16>;
-  defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s, nxv4i1, nxv4i16>;
-  defm : ldnf1<LDNF1W_IMM,    nxv4i32, AArch64ldnf1,  nxv4i1, nxv4i32>;
-  defm : ldnf1<LDNF1W_IMM,    nxv4f32, AArch64ldnf1,  nxv4i1, nxv4f32>;
+  defm : ldnf1<LDNF1B_S_IMM,  nxv4i32,  AArch64ldnf1,  nxv4i1, nxv4i8>;
+  defm : ldnf1<LDNF1SB_S_IMM, nxv4i32,  AArch64ldnf1s, nxv4i1, nxv4i8>;
+  defm : ldnf1<LDNF1H_S_IMM,  nxv4i32,  AArch64ldnf1,  nxv4i1, nxv4i16>;
+  defm : ldnf1<LDNF1SH_S_IMM, nxv4i32,  AArch64ldnf1s, nxv4i1, nxv4i16>;
+  defm : ldnf1<LDNF1W_IMM,    nxv4i32,  AArch64ldnf1,  nxv4i1, nxv4i32>;
+  defm : ldnf1<LDNF1W_IMM,    nxv4f32,  AArch64ldnf1,  nxv4i1, nxv4f32>;
 
   // 8-element contiguous non-faulting loads
-  defm : ldnf1<LDNF1B_H_IMM,  nxv8i16, AArch64ldnf1,  nxv8i1, nxv8i8>;
-  defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s, nxv8i1, nxv8i8>;
-  defm : ldnf1<LDNF1H_IMM,    nxv8i16, AArch64ldnf1,  nxv8i1, nxv8i16>;
-  defm : ldnf1<LDNF1H_IMM,    nxv8f16, AArch64ldnf1,  nxv8i1, nxv8f16>;
+  defm : ldnf1<LDNF1B_H_IMM,  nxv8i16,  AArch64ldnf1,  nxv8i1, nxv8i8>;
+  defm : ldnf1<LDNF1SB_H_IMM, nxv8i16,  AArch64ldnf1s, nxv8i1, nxv8i8>;
+  defm : ldnf1<LDNF1H_IMM,    nxv8i16,  AArch64ldnf1,  nxv8i1, nxv8i16>;
+  defm : ldnf1<LDNF1H_IMM,    nxv8f16,  AArch64ldnf1,  nxv8i1, nxv8f16>;
+  defm : ldnf1<LDNF1H_IMM,    nxv8bf16, AArch64ldnf1,  nxv8i1, nxv8bf16>;
 
   // 16-element contiguous non-faulting loads
-  defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1, nxv16i1, nxv16i8>;
+  defm : ldnf1<LDNF1B_IMM,    nxv16i8,  AArch64ldnf1, nxv16i1, nxv16i8>;
 
 multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load,
                  ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
   // reg + reg
@@ -1764,29 +1767,30 @@
   // reg + imm
 }
 
   // 2-element contiguous first faulting loads
-  defm : ldff1<LDFF1B_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1H_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
-  defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
-  defm : ldff1<LDFF1W_D,  nxv2i64, AArch64ldff1,  nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
-  defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
-  defm : ldff1<LDFF1D,    nxv2i64, AArch64ldff1,  nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
-  defm : ldff1<LDFF1W_D,  nxv2f32, AArch64ldff1,  nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
-  defm : ldff1<LDFF1D,    nxv2f64, AArch64ldff1,  nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+  defm : ldff1<LDFF1B_D,  nxv2i64,  AArch64ldff1,  nxv2i1, nxv2i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1SB_D, nxv2i64,  AArch64ldff1s, nxv2i1, nxv2i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1H_D,  nxv2i64,  AArch64ldff1,  nxv2i1, nxv2i16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1SH_D, nxv2i64,  AArch64ldff1s, nxv2i1, nxv2i16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1W_D,  nxv2i64,  AArch64ldff1,  nxv2i1, nxv2i32,  am_sve_regreg_lsl2>;
+  defm : ldff1<LDFF1SW_D, nxv2i64,  AArch64ldff1s, nxv2i1, nxv2i32,  am_sve_regreg_lsl2>;
+  defm : ldff1<LDFF1D,    nxv2i64,  AArch64ldff1,  nxv2i1, nxv2i64,  am_sve_regreg_lsl3>;
+  defm : ldff1<LDFF1W_D,  nxv2f32,  AArch64ldff1,  nxv2i1, nxv2f32,  am_sve_regreg_lsl2>;
+  defm : ldff1<LDFF1D,    nxv2f64,  AArch64ldff1,  nxv2i1, nxv2f64,  am_sve_regreg_lsl3>;
 
   // 4-element contiguous first faulting loads
-  defm : ldff1<LDFF1B_S,  nxv4i32, AArch64ldff1,  nxv4i1, nxv4i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1H_S,  nxv4i32, AArch64ldff1,  nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
-  defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
-  defm : ldff1<LDFF1W,    nxv4i32, AArch64ldff1,  nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
-  defm : ldff1<LDFF1W,    nxv4f32, AArch64ldff1,  nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+  defm : ldff1<LDFF1B_S,  nxv4i32,  AArch64ldff1,  nxv4i1, nxv4i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1SB_S, nxv4i32,  AArch64ldff1s, nxv4i1, nxv4i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1H_S,  nxv4i32,  AArch64ldff1,  nxv4i1, nxv4i16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1SH_S, nxv4i32,  AArch64ldff1s, nxv4i1, nxv4i16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1W,    nxv4i32,  AArch64ldff1,  nxv4i1, nxv4i32,  am_sve_regreg_lsl2>;
+  defm : ldff1<LDFF1W,    nxv4f32,  AArch64ldff1,  nxv4i1, nxv4f32,  am_sve_regreg_lsl2>;
 
   // 8-element contiguous first faulting loads
-  defm : ldff1<LDFF1B_H,  nxv8i16, AArch64ldff1,  nxv8i1, nxv8i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s, nxv8i1, nxv8i8,  am_sve_regreg_lsl0>;
-  defm : ldff1<LDFF1H,    nxv8i16, AArch64ldff1,  nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
-  defm : ldff1<LDFF1H,    nxv8f16, AArch64ldff1,  nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1B_H,  nxv8i16,  AArch64ldff1,  nxv8i1, nxv8i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1SB_H, nxv8i16,  AArch64ldff1s, nxv8i1, nxv8i8,   am_sve_regreg_lsl0>;
+  defm : ldff1<LDFF1H,    nxv8i16,  AArch64ldff1,  nxv8i1, nxv8i16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1H,    nxv8f16,  AArch64ldff1,  nxv8i1, nxv8f16,  am_sve_regreg_lsl1>;
+  defm : ldff1<LDFF1H,    nxv8bf16, AArch64ldff1,  nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
 
   // 16-element contiguous first faulting loads
   defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -161,7 +161,8 @@
       return false;
 
     Type *Ty = cast<VectorType>(DataType)->getElementType();
-    if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
+    if (Ty->isBFloatTy() || Ty->isHalfTy() ||
+        Ty->isFloatTy() || Ty->isDoubleTy())
       return true;
 
     if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-imm.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-imm.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-imm.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-imm.ll
@@ -207,6 +207,17 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ld1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ld1h_bf16_inbound:
+; CHECK: ld1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
+  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
+  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
+  ret <vscale x 8 x bfloat> %load
+}
+
 ;
 ; LD1W
 ;
@@ -288,6 +299,7 @@
 declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
 declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-reg.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-reg.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-reg.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1-addressing-mode-reg-reg.ll
@@ -95,6 +95,15 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a, i64 %index) {
+; CHECK-LABEL: ld1h_bf16:
+; CHECK: ld1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %base = getelementptr bfloat, bfloat* %a, i64 %index
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
+  ret <vscale x 8 x bfloat> %load
+}
+
 define <vscale x 4 x i16> @ld1h_s(<vscale x 4 x i1> %pred, i16* %a, i64 %index) {
 ; CHECK-LABEL: ld1h_s:
 ; CHECK: ld1h { z0.s }, p0/z, [x0, x1, lsl #1]
@@ -204,6 +213,7 @@
 declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
 declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-ld1.ll
@@ -87,6 +87,14 @@
   ret <vscale x 8 x half> %res
 }
 
+define <vscale x 8 x bfloat> @ld1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1h_bf16:
+; CHECK: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+  ret <vscale x 8 x bfloat> %res
+}
+
 define <vscale x 4 x i16> @ld1h_s(<vscale x 4 x i1> %pred, i16* %addr) {
 ; CHECK-LABEL: ld1h_s:
 ; CHECK: ld1h { z0.s }, p0/z, [x0]
@@ -188,6 +196,7 @@
 declare <vscale x 8 x i8> @llvm.aarch64.sve.ld1.nxv8i8(<vscale x 8 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ld1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ld1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 declare <vscale x 4 x i8> @llvm.aarch64.sve.ld1.nxv4i8(<vscale x 4 x i1>, i8*)
 declare <vscale x 4 x i16> @llvm.aarch64.sve.ld1.nxv4i16(<vscale x 4 x i1>, i16*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-ff.ll
@@ -206,6 +206,14 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ldff1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldff1h_bf16:
+; CHECK: ldff1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
+  ret <vscale x 8 x bfloat> %load
+}
+
 define <vscale x 8 x half> @ldff1h_f16_reg(<vscale x 8 x i1> %pg, half* %a, i64 %offset) {
 ; CHECK-LABEL: ldff1h_f16_reg:
 ; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
@@ -215,6 +223,15 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ldff1h_bf16_reg(<vscale x 8 x i1> %pg, bfloat* %a, i64 %offset) {
+; CHECK-LABEL: ldff1h_bf16_reg:
+; CHECK: ldff1h { z0.h }, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT: ret
+  %base = getelementptr bfloat, bfloat* %a, i64 %offset
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base)
+  ret <vscale x 8 x bfloat> %load
+}
+
 ;
 ; LDFF1SH
 ;
@@ -398,6 +415,7 @@
 declare <vscale x 8 x i8> @llvm.aarch64.sve.ldff1.nxv8i8(<vscale x 8 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldff1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldff1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldff1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 declare <vscale x 4 x i8> @llvm.aarch64.sve.ldff1.nxv4i8(<vscale x 4 x i1>, i8*)
 declare <vscale x 4 x i16> @llvm.aarch64.sve.ldff1.nxv4i16(<vscale x 4 x i1>, i16*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads-nf.ll
@@ -140,6 +140,14 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ldnf1h_bf16(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldnf1h_bf16:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %a)
+  ret <vscale x 8 x bfloat> %load
+}
+
 define <vscale x 8 x half> @ldnf1h_f16_inbound(<vscale x 8 x i1> %pg, half* %a) {
 ; CHECK-LABEL: ldnf1h_f16_inbound:
 ; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
@@ -151,6 +159,17 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @ldnf1h_bf16_inbound(<vscale x 8 x i1> %pg, bfloat* %a) {
+; CHECK-LABEL: ldnf1h_bf16_inbound:
+; CHECK: ldnf1h { z0.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+  %base_scalable = bitcast bfloat* %a to <vscale x 8 x bfloat>*
+  %base = getelementptr <vscale x 8 x bfloat>, <vscale x 8 x bfloat>* %base_scalable, i64 1
+  %base_scalar = bitcast <vscale x 8 x bfloat>* %base to bfloat*
+  %load = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1> %pg, bfloat* %base_scalar)
+  ret <vscale x 8 x bfloat> %load
+}
+
 define <vscale x 4 x i8> @ldnf1b_s(<vscale x 4 x i1> %pg, i8* %a) {
 ; CHECK-LABEL: ldnf1b_s:
 ; CHECK: ldnf1b { z0.s }, p0/z, [x0]
@@ -442,6 +461,7 @@
 declare <vscale x 8 x i8> @llvm.aarch64.sve.ldnf1.nxv8i8(<vscale x 8 x i1>, i8*)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.ldnf1.nxv8i16(<vscale x 8 x i1>, i16*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnf1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnf1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 
 declare <vscale x 4 x i8> @llvm.aarch64.sve.ldnf1.nxv4i8(<vscale x 4 x i1>, i8*)
 declare <vscale x 4 x i16> @llvm.aarch64.sve.ldnf1.nxv4i16(<vscale x 4 x i1>, i16*)
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
--- a/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsics-loads.ll
@@ -97,6 +97,23 @@
   ret <vscale x 8 x half> %res
 }
 
+define <vscale x 8 x bfloat> @ld1rqh_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1rqh_bf16:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %addr)
+  ret <vscale x 8 x bfloat> %res
+}
+
+define <vscale x 8 x bfloat> @ld1rqh_bf16_imm(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ld1rqh_bf16_imm:
+; CHECK: ld1rqh { z0.h }, p0/z, [x0, #-16]
+; CHECK-NEXT: ret
+  %ptr = getelementptr inbounds bfloat, bfloat* %addr, i16 -8
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1> %pred, bfloat* %ptr)
+  ret <vscale x 8 x bfloat> %res
+}
+
 ;
 ; LD1RQW
 ;
@@ -208,6 +225,15 @@
   ret <vscale x 8 x half> %res
 }
 
+define <vscale x 8 x bfloat> @ldnt1h_bf16(<vscale x 8 x i1> %pred, bfloat* %addr) {
+; CHECK-LABEL: ldnt1h_bf16:
+; CHECK: ldnt1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1> %pred,
+                                                                     bfloat* %addr)
+  ret <vscale x 8 x bfloat> %res
+}
+
 ;
 ; LDNT1W
 ;
@@ -498,6 +524,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ld1rq.nxv4i32(<vscale x 4 x i1>, i32*)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ld1rq.nxv2i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ld1rq.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ld1rq.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 4 x float> @llvm.aarch64.sve.ld1rq.nxv4f32(<vscale x 4 x i1>, float*)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ld1rq.nxv2f64(<vscale x 2 x i1>, double*)
@@ -506,6 +533,7 @@
 declare <vscale x 4 x i32> @llvm.aarch64.sve.ldnt1.nxv4i32(<vscale x 4 x i1>, i32*)
 declare <vscale x 2 x i64> @llvm.aarch64.sve.ldnt1.nxv2i64(<vscale x 2 x i1>, i64*)
 declare <vscale x 8 x half> @llvm.aarch64.sve.ldnt1.nxv8f16(<vscale x 8 x i1>, half*)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.ldnt1.nxv8bf16(<vscale x 8 x i1>, bfloat*)
 declare <vscale x 4 x float> @llvm.aarch64.sve.ldnt1.nxv4f32(<vscale x 4 x i1>, float*)
 declare <vscale x 2 x double> @llvm.aarch64.sve.ldnt1.nxv2f64(<vscale x 2 x i1>, double*)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-ldst-nonext.ll
@@ -87,6 +87,14 @@
   ret <vscale x 8 x half> %load
 }
 
+define <vscale x 8 x bfloat> @masked_load_nxv8bf16(<vscale x 8 x bfloat> *%a, <vscale x 8 x i1> %mask) nounwind {
+; CHECK-LABEL: masked_load_nxv8bf16:
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+  %load = call <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat> *%a, i32 2, <vscale x 8 x i1> %mask, <vscale x 8 x bfloat> undef)
+  ret <vscale x 8 x bfloat> %load
+}
+
 ;
 ; Masked Stores
 ;
@@ -182,6 +190,7 @@
 declare <vscale x 4 x float> @llvm.masked.load.nxv4f32(<vscale x 4 x float>*, i32, <vscale x 4 x i1>, <vscale x 4 x float>)
 declare <vscale x 4 x half> @llvm.masked.load.nxv4f16(<vscale x 4 x half>*, i32, <vscale x 4 x i1>, <vscale x 4 x half>)
 declare <vscale x 8 x half> @llvm.masked.load.nxv8f16(<vscale x 8 x half>*, i32, <vscale x 8 x i1>, <vscale x 8 x half>)
+declare <vscale x 8 x bfloat> @llvm.masked.load.nxv8bf16(<vscale x 8 x bfloat>*, i32, <vscale x 8 x i1>, <vscale x 8 x bfloat>)
 
 declare void @llvm.masked.store.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>*, i32, <vscale x 2 x i1>)
 declare void @llvm.masked.store.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>*, i32, <vscale x 4 x i1>)
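
Usage sketch (reviewer note, not part of the patch): with a compiler providing
these intrinsics and __ARM_FEATURE_SVE_BF16 defined, a predicated bfloat16 copy
loop could be written as below. svld1_bf16 and its signature come straight from
the tests above; svcnth() and svwhilelt_b16_s64() are existing ACLE helpers, and
svst1_bf16() is assumed available from the companion bfloat16 store support.

  #include <arm_sve.h>

  void copy_bf16(bfloat16_t *dst, const bfloat16_t *src, int64_t n) {
    // Process svcnth() elements (the number of 16-bit lanes in one vector)
    // per iteration; svwhilelt builds the partial predicate for the tail.
    for (int64_t i = 0; i < n; i += svcnth()) {
      svbool_t pg = svwhilelt_b16_s64(i, n);
      svbfloat16_t v = svld1_bf16(pg, src + i);  // new predicated bf16 load
      svst1_bf16(pg, dst + i, v);                // matching bf16 store
    }
  }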