diff --git a/clang/include/clang/Basic/BuiltinsAArch64.def b/clang/include/clang/Basic/BuiltinsAArch64.def
--- a/clang/include/clang/Basic/BuiltinsAArch64.def
+++ b/clang/include/clang/Basic/BuiltinsAArch64.def
@@ -99,6 +99,12 @@
 BUILTIN(__builtin_arm_tcancel, "vWUIi", "n")
 BUILTIN(__builtin_arm_ttest, "WUi", "nc")
 
+// Armv8.7-A load/store 64-byte intrinsics
+BUILTIN(__builtin_arm_ld64b, "vvC*WUi*", "n")
+BUILTIN(__builtin_arm_st64b, "vv*WUiC*", "n")
+BUILTIN(__builtin_arm_st64bv, "WUiv*WUiC*", "n")
+BUILTIN(__builtin_arm_st64bv0, "WUiv*WUiC*", "n")
+
 TARGET_HEADER_BUILTIN(_BitScanForward, "UcUNi*UNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_BitScanReverse, "UcUNi*UNi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
 TARGET_HEADER_BUILTIN(_BitScanForward64, "UcUNi*ULLi", "nh", "intrin.h", ALL_MS_LANGUAGES, "")
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -356,6 +356,9 @@
   if (Opts.BranchTargetEnforcement)
     Builder.defineMacro("__ARM_FEATURE_BTI_DEFAULT", "1");
 
+  if (HasLS64)
+    Builder.defineMacro("__ARM_FEATURE_LS64", "1");
+
   switch (ArchKind) {
   default:
     break;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -8979,6 +8979,46 @@
         CGM.getIntrinsic(Intrinsic::aarch64_fjcvtzs), Arg);
   }
 
+  if (BuiltinID == AArch64::BI__builtin_arm_ld64b ||
+      BuiltinID == AArch64::BI__builtin_arm_st64b ||
+      BuiltinID == AArch64::BI__builtin_arm_st64bv ||
+      BuiltinID == AArch64::BI__builtin_arm_st64bv0) {
+    llvm::Value *MemAddr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *ValPtr = EmitScalarExpr(E->getArg(1));
+
+    if (BuiltinID == AArch64::BI__builtin_arm_ld64b) {
+      // Load from the address via an LLVM intrinsic, receiving a
+      // tuple of 8 i64 words, and store each one to ValPtr.
+      Function *F = CGM.getIntrinsic(Intrinsic::aarch64_ld64b);
+      llvm::Value *Val = Builder.CreateCall(F, MemAddr);
+      llvm::Value *ToRet = nullptr;
+      for (size_t i = 0; i < 8; i++) {
+        llvm::Value *ValOffsetPtr = Builder.CreateGEP(ValPtr, Builder.getInt32(i));
+        Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8));
+        ToRet = Builder.CreateStore(Builder.CreateExtractValue(Val, i), Addr);
+      }
+      return ToRet;
+    } else {
+      // Load 8 i64 words from ValPtr, and store them to the address
+      // via an LLVM intrinsic (one address operand followed by 8 data words).
+      SmallVector<llvm::Value *, 9> Args;
+      Args.push_back(MemAddr);
+      for (size_t i = 0; i < 8; i++) {
+        llvm::Value *ValOffsetPtr = Builder.CreateGEP(ValPtr, Builder.getInt32(i));
+        Address Addr(ValOffsetPtr, CharUnits::fromQuantity(8));
+        Args.push_back(Builder.CreateLoad(Addr));
+      }
+
+      auto Intr = (BuiltinID == AArch64::BI__builtin_arm_st64b
+                       ? Intrinsic::aarch64_st64b
+                       : BuiltinID == AArch64::BI__builtin_arm_st64bv
+                             ? Intrinsic::aarch64_st64bv
+                             : Intrinsic::aarch64_st64bv0);
+      Function *F = CGM.getIntrinsic(Intr);
+      return Builder.CreateCall(F, Args);
+    }
+  }
+
   if (BuiltinID == AArch64::BI__clear_cache) {
     assert(E->getNumArgs() == 2 && "__clear_cache takes 2 arguments");
     const FunctionDecl *FD = E->getDirectCallee();
diff --git a/clang/lib/Headers/arm_acle.h b/clang/lib/Headers/arm_acle.h
--- a/clang/lib/Headers/arm_acle.h
+++ b/clang/lib/Headers/arm_acle.h
@@ -639,6 +639,32 @@
 }
 #endif
 
+/* Armv8.7-A load/store 64-byte intrinsics */
+#if __ARM_64BIT_STATE && defined(__ARM_FEATURE_LS64)
+typedef struct {
+  uint64_t val[8];
+} data512_t;
+
+static __inline__ data512_t __attribute__((__always_inline__, __nodebug__))
+__arm_ld64b(const void *__addr) {
+  data512_t __value;
+  __builtin_arm_ld64b(__addr, __value.val);
+  return __value;
+}
+static __inline__ void __attribute__((__always_inline__, __nodebug__))
+__arm_st64b(void *__addr, data512_t __value) {
+  __builtin_arm_st64b(__addr, __value.val);
+}
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__arm_st64bv(void *__addr, data512_t __value) {
+  return __builtin_arm_st64bv(__addr, __value.val);
+}
+static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
+__arm_st64bv0(void *__addr, data512_t __value) {
+  return __builtin_arm_st64bv0(__addr, __value.val);
+}
+#endif
+
 /* 10.1 Special register intrinsics */
 #define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
 #define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
diff --git a/clang/test/CodeGen/aarch64-ls64.c b/clang/test/CodeGen/aarch64-ls64.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-ls64.c
@@ -0,0 +1,163 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c %s -o - | FileCheck %s
+// RUN: %clang_cc1 -triple aarch64_be-eabi -target-feature +ls64 -S -emit-llvm -x c++ %s -o - | FileCheck %s
+
+#include <arm_acle.h>
+
+#ifdef __cplusplus
+#define EXTERN_C extern "C"
+#else
+#define EXTERN_C
+#endif
+
+data512_t val;
+void *addr;
+uint64_t status;
+
+// CHECK-LABEL: @test_ld64b(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8
+// CHECK-NEXT: [[REF_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8
+// CHECK-NEXT: [[TMP0:%.*]] = load i8*, i8** @addr, align 8
+// CHECK-NEXT: store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-NEXT: [[TMP1:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8, !noalias !6
+// CHECK-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[REF_TMP]], i32 0, i32 0
+// CHECK-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0
+// CHECK-NEXT: [[TMP2:%.*]] = call { i64, i64, i64, i64, i64, i64, i64, i64 } 
@llvm.aarch64.ld64b(i8* [[TMP1]]) [[ATTR2:#.*]], !noalias !6 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 0 +// CHECK-NEXT: store i64 [[TMP3]], i64* [[ARRAYDECAY_I]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 1 +// CHECK-NEXT: store i64 [[TMP5]], i64* [[TMP4]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 2 +// CHECK-NEXT: store i64 [[TMP7]], i64* [[TMP6]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3 +// CHECK-NEXT: [[TMP9:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 3 +// CHECK-NEXT: store i64 [[TMP9]], i64* [[TMP8]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4 +// CHECK-NEXT: [[TMP11:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 4 +// CHECK-NEXT: store i64 [[TMP11]], i64* [[TMP10]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 5 +// CHECK-NEXT: store i64 [[TMP13]], i64* [[TMP12]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 6 +// CHECK-NEXT: store i64 [[TMP15]], i64* [[TMP14]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } [[TMP2]], 7 +// CHECK-NEXT: store i64 [[TMP17]], 
i64* [[TMP16]], align 8, !alias.scope !6 +// CHECK-NEXT: [[TMP18:%.*]] = bitcast %struct.data512_t* [[REF_TMP]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 bitcast (%struct.data512_t* @val to i8*), i8* align 8 [[TMP18]], i64 64, i1 false) +// CHECK-NEXT: ret void +// +EXTERN_C void test_ld64b(void) +{ + val = __arm_ld64b(addr); +} + +// CHECK-LABEL: @test_st64b( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8*, i8** @addr, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false) +// CHECK-NEXT: store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3 +// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4 +// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 
+// CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: call void @llvm.aarch64.st64b(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]] +// CHECK-NEXT: ret void +// +EXTERN_C void test_st64b(void) +{ + __arm_st64b(addr, val); +} + +// CHECK-LABEL: @test_st64bv( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8*, i8** @addr, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false) +// CHECK-NEXT: store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3 +// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4 
+// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]] +// CHECK-NEXT: store i64 [[TMP18]], i64* @status, align 8 +// CHECK-NEXT: ret void +// +EXTERN_C void test_st64bv(void) +{ + status = __arm_st64bv(addr, val); +} + +// CHECK-LABEL: @test_st64bv0( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[__ADDR_ADDR_I:%.*]] = alloca i8*, align 8 +// CHECK-NEXT: [[AGG_TMP:%.*]] = alloca [[STRUCT_DATA512_T:%.*]], align 8 +// CHECK-NEXT: [[TMP0:%.*]] = load i8*, i8** @addr, align 8 +// CHECK-NEXT: [[TMP1:%.*]] = bitcast %struct.data512_t* [[AGG_TMP]] to i8* +// CHECK-NEXT: call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 8 [[TMP1]], i8* align 8 bitcast (%struct.data512_t* @val to i8*), i64 64, i1 false) +// CHECK-NEXT: store i8* [[TMP0]], i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[TMP2:%.*]] = load i8*, i8** [[__ADDR_ADDR_I]], align 8 +// CHECK-NEXT: [[VAL_I:%.*]] = getelementptr inbounds [[STRUCT_DATA512_T]], %struct.data512_t* [[AGG_TMP]], i32 0, i32 0 +// CHECK-NEXT: [[ARRAYDECAY_I:%.*]] = getelementptr inbounds [8 x i64], [8 x i64]* [[VAL_I]], i64 0, i64 0 +// CHECK-NEXT: [[TMP3:%.*]] = load i64, i64* [[ARRAYDECAY_I]], align 8 +// CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 1 +// CHECK-NEXT: [[TMP5:%.*]] = load i64, i64* [[TMP4]], align 8 +// CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* 
[[ARRAYDECAY_I]], i32 2 +// CHECK-NEXT: [[TMP7:%.*]] = load i64, i64* [[TMP6]], align 8 +// CHECK-NEXT: [[TMP8:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 3 +// CHECK-NEXT: [[TMP9:%.*]] = load i64, i64* [[TMP8]], align 8 +// CHECK-NEXT: [[TMP10:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 4 +// CHECK-NEXT: [[TMP11:%.*]] = load i64, i64* [[TMP10]], align 8 +// CHECK-NEXT: [[TMP12:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 5 +// CHECK-NEXT: [[TMP13:%.*]] = load i64, i64* [[TMP12]], align 8 +// CHECK-NEXT: [[TMP14:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 6 +// CHECK-NEXT: [[TMP15:%.*]] = load i64, i64* [[TMP14]], align 8 +// CHECK-NEXT: [[TMP16:%.*]] = getelementptr i64, i64* [[ARRAYDECAY_I]], i32 7 +// CHECK-NEXT: [[TMP17:%.*]] = load i64, i64* [[TMP16]], align 8 +// CHECK-NEXT: [[TMP18:%.*]] = call i64 @llvm.aarch64.st64bv0(i8* [[TMP2]], i64 [[TMP3]], i64 [[TMP5]], i64 [[TMP7]], i64 [[TMP9]], i64 [[TMP11]], i64 [[TMP13]], i64 [[TMP15]], i64 [[TMP17]]) [[ATTR2]] +// CHECK-NEXT: store i64 [[TMP18]], i64* @status, align 8 +// CHECK-NEXT: ret void +// +EXTERN_C void test_st64bv0(void) +{ + status = __arm_st64bv0(addr, val); +} diff --git a/clang/test/Preprocessor/aarch64-target-features.c b/clang/test/Preprocessor/aarch64-target-features.c --- a/clang/test/Preprocessor/aarch64-target-features.c +++ b/clang/test/Preprocessor/aarch64-target-features.c @@ -441,6 +441,12 @@ // CHECK-BFLOAT: __ARM_FEATURE_BF16 1 // CHECK-BFLOAT: __ARM_FEATURE_BF16_VECTOR_ARITHMETIC 1 +// ================== Check Armv8.7-A LS64 extension. +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.7-a+ls64 -x c -E -dM %s -o - 2>&1 | FileCheck -check-prefix=CHECK-LS64 %s +// RUN: %clang -target aarch64-arm-none-eabi -march=armv8.7-a -x c -E -dM %s -o - 2>&1 | FileCheck -check-prefix=CHECK-NO-LS64 %s +// CHECK-LS64: __ARM_FEATURE_LS64 1 +// CHECK-NO-LS64-NOT: __ARM_FEATURE_LS64 1 + // ================== Check sve-vector-bits flag. 
// RUN: %clang -target aarch64-arm-none-eabi -march=armv8-a+sve -msve-vector-bits=128 -x c -E -dM %s -o - 2>&1 | FileCheck -check-prefix=CHECK-SVE-VECTOR-BITS -D#VBITS=128 %s
// RUN: %clang -target aarch64-arm-none-eabi -march=armv8-a+sve -msve-vector-bits=256 -x c -E -dM %s -o - 2>&1 | FileCheck -check-prefix=CHECK-SVE-VECTOR-BITS -D#VBITS=256 %s
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -810,6 +810,14 @@
 def int_aarch64_ttest : GCCBuiltin<"__builtin_arm_ttest">,
     Intrinsic<[llvm_i64_ty], [],
               [IntrNoMem, IntrHasSideEffects]>;
+
+// Armv8.7-A load/store 64-byte intrinsics
+defvar data512 = !listsplat(llvm_i64_ty, 8);
+def int_aarch64_ld64b: Intrinsic<data512, [llvm_ptr_ty]>;
+def int_aarch64_st64b: Intrinsic<[], !listconcat([llvm_ptr_ty], data512)>;
+def int_aarch64_st64bv: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
+def int_aarch64_st64bv0: Intrinsic<[llvm_i64_ty], !listconcat([llvm_ptr_ty], data512)>;
+
 }
 
 def llvm_nxv2i1_ty : LLVMType<nxv2i1>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -1377,9 +1377,12 @@
 
   ReplaceUses(SDValue(N, NumVecs), SDValue(Ld, 1));
 
-  // Transfer memoperands.
-  MachineMemOperand *MemOp = cast<MemIntrinsicSDNode>(N)->getMemOperand();
-  CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
+  // Transfer memoperands. In the case of AArch64::LD64B, there won't be one,
+  // because it's too simple to have needed special treatment during lowering.
+  if (auto *MemIntr = dyn_cast<MemIntrinsicSDNode>(N)) {
+    MachineMemOperand *MemOp = MemIntr->getMemOperand();
+    CurDAG->setNodeMemRefs(cast<MachineSDNode>(Ld), {MemOp});
+  }
 
   CurDAG->RemoveDeadNode(N);
 }
@@ -3830,6 +3833,9 @@
         return;
       }
       break;
+    case Intrinsic::aarch64_ld64b:
+      SelectLoad(Node, 8, AArch64::LD64B, AArch64::x8sub_0);
+      return;
     }
   } break;
   case ISD::INTRINSIC_WO_CHAIN: {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -7809,10 +7809,18 @@
 let Predicates = [HasLS64] in {
   def LD64B: LoadStore64B<0b101, "ld64b", (ins GPR64sp:$Rn),
                                           (outs GPR64x8:$Rt)>;
-  def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64sp:$Rn, GPR64x8:$Rt),
+  def ST64B: LoadStore64B<0b001, "st64b", (ins GPR64x8:$Rt, GPR64sp:$Rn),
                                           (outs)>;
   def ST64BV: Store64BV<0b011, "st64bv">;
   def ST64BV0: Store64BV<0b010, "st64bv0">;
+
+  class ST64BPattern<Intrinsic intrinsic, Instruction instruction>
+    : Pat<(intrinsic GPR64sp:$addr, GPR64:$x0, GPR64:$x1, GPR64:$x2, GPR64:$x3, GPR64:$x4, GPR64:$x5, GPR64:$x6, GPR64:$x7),
+          (instruction (REG_SEQUENCE GPR64x8Class, $x0, x8sub_0, $x1, x8sub_1, $x2, x8sub_2, $x3, x8sub_3, $x4, x8sub_4, $x5, x8sub_5, $x6, x8sub_6, $x7, x8sub_7), $addr)>;
+
+  def : ST64BPattern<int_aarch64_st64b, ST64B>;
+  def : ST64BPattern<int_aarch64_st64bv, ST64BV>;
+  def : ST64BPattern<int_aarch64_st64bv0, ST64BV0>;
 }
 
 include "AArch64InstrAtomics.td"
diff --git a/llvm/test/CodeGen/AArch64/ls64-intrinsics.ll b/llvm/test/CodeGen/AArch64/ls64-intrinsics.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ls64-intrinsics.ll
@@ -0,0 +1,92 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+ls64 -verify-machineinstrs -o - %s | FileCheck %s
+; RUN: llc -mtriple=aarch64_be -mattr=+ls64 -verify-machineinstrs -o - %s | FileCheck %s
+
+define void @test_ld64b({ i64, i64, i64, i64, i64, i64, i64, i64 }* %out, i8* %addr) {
+; CHECK-LABEL: test_ld64b:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: ld64b x2, [x1]
+; 
CHECK-NEXT: stp x8, x9, [x0, #48] +; CHECK-NEXT: stp x6, x7, [x0, #32] +; CHECK-NEXT: stp x4, x5, [x0, #16] +; CHECK-NEXT: stp x2, x3, [x0] +; CHECK-NEXT: ret +entry: + %val = tail call { i64, i64, i64, i64, i64, i64, i64, i64 } @llvm.aarch64.ld64b(i8* %addr) + store { i64, i64, i64, i64, i64, i64, i64, i64 } %val, { i64, i64, i64, i64, i64, i64, i64, i64 }* %out, align 8 + ret void +} + +define void @test_st64b({ i64, i64, i64, i64, i64, i64, i64, i64 }* %in, i8* %addr) { +; CHECK-LABEL: test_st64b: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [x0, #48] +; CHECK-NEXT: ldp x6, x7, [x0, #32] +; CHECK-NEXT: ldp x4, x5, [x0, #16] +; CHECK-NEXT: ldp x2, x3, [x0] +; CHECK-NEXT: st64b x2, [x1] +; CHECK-NEXT: ret +entry: + %val = load { i64, i64, i64, i64, i64, i64, i64, i64 }, { i64, i64, i64, i64, i64, i64, i64, i64 }* %in, align 8 + %v0 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 0 + %v1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 1 + %v2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 2 + %v3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 3 + %v4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 4 + %v5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 5 + %v6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 6 + %v7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 7 + tail call void @llvm.aarch64.st64b(i8* %addr, i64 %v0, i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7) + ret void +} + +define i64 @test_st64bv({ i64, i64, i64, i64, i64, i64, i64, i64 }* %in, i8* %addr) { +; CHECK-LABEL: test_st64bv: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [x0, #48] +; CHECK-NEXT: ldp x6, x7, [x0, #32] +; CHECK-NEXT: ldp x4, x5, [x0, #16] +; CHECK-NEXT: ldp x2, x3, [x0] +; CHECK-NEXT: st64bv x0, x2, [x1] +; CHECK-NEXT: ret +entry: + %val = load { i64, i64, i64, i64, i64, i64, i64, i64 }, { i64, i64, i64, i64, i64, 
i64, i64, i64 }* %in, align 8 + %v0 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 0 + %v1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 1 + %v2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 2 + %v3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 3 + %v4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 4 + %v5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 5 + %v6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 6 + %v7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 7 + %status = tail call i64 @llvm.aarch64.st64bv(i8* %addr, i64 %v0, i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7) + ret i64 %status +} + +define i64 @test_st64bv0({ i64, i64, i64, i64, i64, i64, i64, i64 }* %in, i8* %addr) { +; CHECK-LABEL: test_st64bv0: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: ldp x8, x9, [x0, #48] +; CHECK-NEXT: ldp x6, x7, [x0, #32] +; CHECK-NEXT: ldp x4, x5, [x0, #16] +; CHECK-NEXT: ldp x2, x3, [x0] +; CHECK-NEXT: st64bv0 x0, x2, [x1] +; CHECK-NEXT: ret +entry: + %val = load { i64, i64, i64, i64, i64, i64, i64, i64 }, { i64, i64, i64, i64, i64, i64, i64, i64 }* %in, align 8 + %v0 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 0 + %v1 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 1 + %v2 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 2 + %v3 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 3 + %v4 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 4 + %v5 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 5 + %v6 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 6 + %v7 = extractvalue { i64, i64, i64, i64, i64, i64, i64, i64 } %val, 7 + %status = tail call i64 @llvm.aarch64.st64bv0(i8* %addr, i64 %v0, i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6, i64 %v7) + ret i64 %status +} + +declare { i64, i64, i64, i64, i64, i64, 
i64, i64 } @llvm.aarch64.ld64b(i8*) +declare void @llvm.aarch64.st64b(i8*, i64, i64, i64, i64, i64, i64, i64, i64) +declare i64 @llvm.aarch64.st64bv(i8*, i64, i64, i64, i64, i64, i64, i64, i64) +declare i64 @llvm.aarch64.st64bv0(i8*, i64, i64, i64, i64, i64, i64, i64, i64)