diff --git a/clang/include/clang/Basic/TargetBuiltins.h b/clang/include/clang/Basic/TargetBuiltins.h
--- a/clang/include/clang/Basic/TargetBuiltins.h
+++ b/clang/include/clang/Basic/TargetBuiltins.h
@@ -281,6 +281,8 @@
     bool isTupleCreate() const { return Flags & IsTupleCreate; }
     bool isTupleGet() const { return Flags & IsTupleGet; }
     bool isTupleSet() const { return Flags & IsTupleSet; }
+    bool isSME() const { return Flags & IsSME; }
+    bool isSMELoadStore() const { return Flags & IsSMELoadStore; }
 
     uint64_t getBits() const { return Flags; }
     bool isFlagSet(uint64_t Flag) const { return Flags & Flag; }
diff --git a/clang/include/clang/Basic/arm_sve.td b/clang/include/clang/Basic/arm_sve.td
--- a/clang/include/clang/Basic/arm_sve.td
+++ b/clang/include/clang/Basic/arm_sve.td
@@ -98,6 +98,7 @@
 // N: svfloat64_t
 
 // J: Prefetch type (sv_prfop)
+// %: pointer to void
 // A: pointer to int8_t
 // B: pointer to int16_t
 // C: pointer to int32_t
@@ -205,6 +206,8 @@
 def IsTupleCreate : FlagType<0x100000000>;
 def IsTupleGet : FlagType<0x200000000>;
 def IsTupleSet : FlagType<0x400000000>;
+def IsSME : FlagType<0x800000000>;
+def IsSMELoadStore : FlagType<0x1000000000>;
 
 // These must be kept in sync with the flags in include/clang/Basic/TargetBuiltins.h
 class ImmCheckType<int val> {
@@ -542,6 +545,17 @@
   def SVBFMLALT_LANE : SInst<"svbfmlalt_lane[_{0}]", "MMddn", "b", MergeNone, "aarch64_sve_bfmlalt_lane", [IsOverloadNone], [ImmCheck<3, ImmCheck0_7>]>;
 }
 
+def SVLD1_HOR_ZA8 : MInst<"svld1_hor_za8", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_horiz">;
+def SVLD1_HOR_ZA16 : MInst<"svld1_hor_za16", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_horiz">;
+def SVLD1_HOR_ZA32 : MInst<"svld1_hor_za32", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_horiz">;
+def SVLD1_HOR_ZA64 : MInst<"svld1_hor_za64", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_horiz">;
+def SVLD1_HOR_ZA128 : MInst<"svld1_hor_za128", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_horiz">;
+def SVLD1_VER_ZA8 : MInst<"svld1_ver_za8", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1b_vert">;
+def SVLD1_VER_ZA16 : MInst<"svld1_ver_za16", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1h_vert">;
+def SVLD1_VER_ZA32 : MInst<"svld1_ver_za32", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1w_vert">;
+def SVLD1_VER_ZA64 : MInst<"svld1_ver_za64", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1d_vert">;
+def SVLD1_VER_ZA128 : MInst<"svld1_ver_za128", "vnmnPQ", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_ld1q_vert">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Stores
 
@@ -664,6 +678,17 @@
   def SVSTNT1_VNUM_BF : MInst<"svstnt1_vnum[_{d}]", "vPpld", "b", [IsStore], MemEltTyDefault, "aarch64_sve_stnt1">;
 }
 
+def SVST1_HOR_ZA8 : MInst<"svst1_hor_za8", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_horiz">;
+def SVST1_HOR_ZA16 : MInst<"svst1_hor_za16", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_horiz">;
+def SVST1_HOR_ZA32 : MInst<"svst1_hor_za32", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_horiz">;
+def SVST1_HOR_ZA64 : MInst<"svst1_hor_za64", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_horiz">;
+def SVST1_HOR_ZA128 : MInst<"svst1_hor_za128", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_horiz">;
+def SVST1_VER_ZA8 : MInst<"svst1_ver_za8", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1b_vert">;
+def SVST1_VER_ZA16 : MInst<"svst1_ver_za16", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1h_vert">;
+def SVST1_VER_ZA32 : MInst<"svst1_ver_za32", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1w_vert">;
+def SVST1_VER_ZA64 : MInst<"svst1_ver_za64", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1d_vert">;
+def SVST1_VER_ZA128 : MInst<"svst1_ver_za128", "vnmnP%", "", [IsOverloadNone, IsSME, IsSMELoadStore], MemEltTyDefault, "aarch64_sme_st1q_vert">;
+
 ////////////////////////////////////////////////////////////////////////////////
 // Prefetches
 
diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -49,6 +49,7 @@
   bool HasSVE2SHA3;
   bool HasSVE2SM4;
   bool HasSVE2BitPerm;
+  bool HasSME;
   bool HasMatmulFP64;
   bool HasMatmulFP32;
   bool HasLSE;
diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -338,6 +338,9 @@
   if (HasSVE2 && HasSVE2SM4)
     Builder.defineMacro("__ARM_FEATURE_SVE2_SM4", "1");
 
+  if (HasSME)
+    Builder.defineMacro("__ARM_FEATURE_SME", "1");
+
   if (HasCRC)
     Builder.defineMacro("__ARM_FEATURE_CRC32", "1");
 
@@ -540,6 +543,7 @@
   HasSVE2SHA3 = false;
   HasSVE2SM4 = false;
   HasSVE2BitPerm = false;
+  HasSME = false;
   HasMatmulFP64 = false;
   HasMatmulFP32 = false;
   HasLSE = false;
@@ -583,6 +587,9 @@
       HasSVE2 = true;
       HasSVE2BitPerm = true;
     }
+    if (Feature == "+sme") {
+      HasSME = true;
+    }
     if (Feature == "+f32mm") {
       FPU |= SveMode;
       HasMatmulFP32 = true;
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9019,6 +9019,64 @@
   return Store;
 }
 
+/// Emit a call to the SME ld1/st1 intrinsic \p IntID.
+///
+/// The builtin operands in \p Ops are reordered for the intrinsic: the call
+/// receives Ops[3], then Ops[4] cast to a pointer to the accessed element
+/// type, then Ops[0], then the i32 sum of Ops[1] and Ops[2].
+/// \p TypeFlags is not consulted; the element type is derived from \p IntID.
+Value *CodeGenFunction::EmitSMELoadStore(SVETypeFlags TypeFlags,
+                                         SmallVectorImpl<Value *> &Ops,
+                                         unsigned IntID) {
+  assert(Ops.size() >= 5 && "SME ld1/st1 builtins need five operands");
+
+  // Element type the base pointer is cast to. Left null for intrinsics not
+  // listed below (e.g. the ld1b/st1b ones), whose pointer is passed unchanged.
+  llvm::Type *MemEltTy = nullptr;
+  switch (IntID) {
+  case Intrinsic::aarch64_sme_ld1h_horiz:
+  case Intrinsic::aarch64_sme_ld1h_vert:
+  case Intrinsic::aarch64_sme_st1h_horiz:
+  case Intrinsic::aarch64_sme_st1h_vert:
+    MemEltTy = Int16Ty;
+    break;
+  case Intrinsic::aarch64_sme_ld1w_horiz:
+  case Intrinsic::aarch64_sme_ld1w_vert:
+  case Intrinsic::aarch64_sme_st1w_horiz:
+  case Intrinsic::aarch64_sme_st1w_vert:
+    MemEltTy = Int32Ty;
+    break;
+  case Intrinsic::aarch64_sme_ld1d_horiz:
+  case Intrinsic::aarch64_sme_ld1d_vert:
+  case Intrinsic::aarch64_sme_st1d_horiz:
+  case Intrinsic::aarch64_sme_st1d_vert:
+    MemEltTy = Int64Ty;
+    break;
+  case Intrinsic::aarch64_sme_ld1q_horiz:
+  case Intrinsic::aarch64_sme_ld1q_vert:
+  case Intrinsic::aarch64_sme_st1q_horiz:
+  case Intrinsic::aarch64_sme_st1q_vert:
+    MemEltTy = llvm::IntegerType::get(getLLVMContext(), 128);
+    break;
+  default:
+    break;
+  }
+
+  llvm::Value *BasePtr = Ops[4];
+  if (MemEltTy)
+    BasePtr = Builder.CreatePointerCast(BasePtr,
+                                        llvm::PointerType::get(MemEltTy, 0));
+
+  // The slice index is Ops[1] plus Ops[2], with Ops[2] first converted to
+  // i32 as an unsigned value.
+  llvm::Value *CastOffset = Builder.CreateIntCast(Ops[2], Int32Ty, false);
+  llvm::Value *TileSlice = Builder.CreateAdd(Ops[1], CastOffset, "tileslice");
+
+  SmallVector<llvm::Value *, 4> NewOps = {Ops[3], BasePtr, Ops[0], TileSlice};
+  Function *F = CGM.getIntrinsic(IntID, {});
+  return Builder.CreateCall(F, NewOps);
+}
+
 // Limit the usage of scalable llvm IR generated by the ACLE by using the
 // sve dup.x intrinsic instead of IRBuilder::CreateVectorSplat.
Value *CodeGenFunction::EmitSVEDupX(Value *Scalar, llvm::Type *Ty) { @@ -9119,6 +9167,8 @@ TypeFlags.isZExtReturn()); else if (TypeFlags.isStore()) return EmitSVEMaskedStore(E, Ops, Builtin->LLVMIntrinsic); + else if (TypeFlags.isSMELoadStore()) + return EmitSMELoadStore(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isGatherLoad()) return EmitSVEGatherLoad(TypeFlags, Ops, Builtin->LLVMIntrinsic); else if (TypeFlags.isScatterStore()) diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h --- a/clang/lib/CodeGen/CodeGenFunction.h +++ b/clang/lib/CodeGen/CodeGenFunction.h @@ -4222,6 +4222,9 @@ llvm::Value *EmitSVEMaskedStore(const CallExpr *, SmallVectorImpl &Ops, unsigned BuiltinID); + llvm::Value *EmitSMELoadStore(SVETypeFlags TypeFlags, + llvm::SmallVectorImpl &Ops, + unsigned IntID); llvm::Value *EmitSVEPrefetchLoad(const SVETypeFlags &TypeFlags, SmallVectorImpl &Ops, unsigned BuiltinID); diff --git a/clang/lib/Headers/CMakeLists.txt b/clang/lib/Headers/CMakeLists.txt --- a/clang/lib/Headers/CMakeLists.txt +++ b/clang/lib/Headers/CMakeLists.txt @@ -306,6 +306,8 @@ clang_generate_header(-gen-arm-fp16 arm_fp16.td arm_fp16.h) # Generate arm_sve.h clang_generate_header(-gen-arm-sve-header arm_sve.td arm_sve.h) + # Generate arm_sme.h + clang_generate_header(-gen-arm-sme-header arm_sve.td arm_sme.h) # Generate arm_bf16.h clang_generate_header(-gen-arm-bf16 arm_bf16.td arm_bf16.h) # Generate arm_mve.h @@ -330,6 +332,7 @@ list(APPEND aarch64_only_generated_files "${CMAKE_CURRENT_BINARY_DIR}/arm_sve.h" + "${CMAKE_CURRENT_BINARY_DIR}/arm_sme.h" "${CMAKE_CURRENT_BINARY_DIR}/arm_bf16.h" "${output_dir}/arm_neon_sve_bridge.h" ) diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_ld1.c @@ -0,0 +1,711 @@ +// NOTE: Assertions have been autogenerated by 
utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svld1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* 
[[PTR]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_hor_za8ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: 
[[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CPP-CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CPP-CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CPP-CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CPP-CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CPP-CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CPP-CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// 
CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CPP-CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_hor_za8(0, slice_base, 0, pg, ptr); + svld1_hor_za8(0, slice_base, 1, pg, ptr); + svld1_hor_za8(0, slice_base, 2, pg, ptr); + svld1_hor_za8(0, slice_base, 3, pg, ptr); + svld1_hor_za8(0, slice_base, 4, pg, ptr); + svld1_hor_za8(0, slice_base, 5, pg, ptr); + svld1_hor_za8(0, slice_base, 6, pg, ptr); + svld1_hor_za8(0, slice_base, 7, pg, ptr); + svld1_hor_za8(0, slice_base, 8, pg, ptr); + svld1_hor_za8(0, slice_base, 9, pg, ptr); + svld1_hor_za8(0, slice_base, 10, pg, ptr); + svld1_hor_za8(0, slice_base, 11, pg, ptr); + svld1_hor_za8(0, slice_base, 12, pg, ptr); + svld1_hor_za8(0, slice_base, 13, pg, ptr); + svld1_hor_za8(0, slice_base, 14, pg, ptr); + svld1_hor_za8(0, slice_base, 15, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], 
i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za16ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 
3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_hor_za16(0, slice_base, 0, pg, ptr); + svld1_hor_za16(0, slice_base, 1, pg, ptr); + svld1_hor_za16(0, slice_base, 2, pg, ptr); + svld1_hor_za16(0, slice_base, 3, pg, ptr); + svld1_hor_za16(0, slice_base, 4, pg, ptr); + svld1_hor_za16(0, slice_base, 5, pg, ptr); + svld1_hor_za16(0, 
slice_base, 6, pg, ptr); + svld1_hor_za16(0, slice_base, 7, pg, ptr); + svld1_hor_za16(1, slice_base, 1, pg, ptr); + svld1_hor_za16(1, slice_base, 2, pg, ptr); + svld1_hor_za16(1, slice_base, 3, pg, ptr); + svld1_hor_za16(1, slice_base, 4, pg, ptr); + svld1_hor_za16(1, slice_base, 5, pg, ptr); + svld1_hor_za16(1, slice_base, 6, pg, ptr); + svld1_hor_za16(1, slice_base, 7, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CHECK-NEXT: 
call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za32ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 
[[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_hor_za32(0, slice_base, 0, pg, ptr); + svld1_hor_za32(0, slice_base, 1, pg, ptr); + svld1_hor_za32(0, slice_base, 2, pg, ptr); + svld1_hor_za32(0, slice_base, 3, pg, ptr); + svld1_hor_za32(1, slice_base, 0, pg, ptr); + svld1_hor_za32(1, slice_base, 1, pg, ptr); + svld1_hor_za32(1, slice_base, 2, pg, ptr); + svld1_hor_za32(1, slice_base, 3, pg, ptr); + svld1_hor_za32(2, slice_base, 0, pg, ptr); + svld1_hor_za32(2, slice_base, 1, pg, ptr); + svld1_hor_za32(2, slice_base, 2, pg, ptr); + svld1_hor_za32(2, slice_base, 3, pg, ptr); + svld1_hor_za32(3, slice_base, 0, pg, ptr); + svld1_hor_za32(3, slice_base, 1, pg, ptr); + svld1_hor_za32(3, slice_base, 2, pg, ptr); + svld1_hor_za32(3, slice_base, 3, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 
1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_hor_za64ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) 
+// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_hor_za64(0, slice_base, 0, pg, ptr); + svld1_hor_za64(0, slice_base, 1, pg, ptr); + svld1_hor_za64(1, slice_base, 0, pg, ptr); + svld1_hor_za64(1, slice_base, 1, pg, ptr); + svld1_hor_za64(2, slice_base, 0, pg, ptr); + svld1_hor_za64(2, slice_base, 1, pg, ptr); + svld1_hor_za64(3, slice_base, 0, pg, ptr); + svld1_hor_za64(3, slice_base, 1, pg, ptr); + svld1_hor_za64(4, slice_base, 0, pg, ptr); + svld1_hor_za64(4, slice_base, 1, pg, ptr); + svld1_hor_za64(5, slice_base, 0, pg, ptr); + svld1_hor_za64(5, slice_base, 1, pg, ptr); + svld1_hor_za64(6, 
slice_base, 0, pg, ptr); + svld1_hor_za64(6, slice_base, 1, pg, ptr); + svld1_hor_za64(7, slice_base, 0, pg, ptr); + svld1_hor_za64(7, slice_base, 1, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 15, 
i32 [[SLICE_BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_hor_za128ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.horiz( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) 
+// CPP-CHECK-NEXT: ret void +// +void test_svld1_hor_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_hor_za128(0, slice_base, 0, pg, ptr); + svld1_hor_za128(1, slice_base, 0, pg, ptr); + svld1_hor_za128(2, slice_base, 0, pg, ptr); + svld1_hor_za128(3, slice_base, 0, pg, ptr); + svld1_hor_za128(4, slice_base, 0, pg, ptr); + svld1_hor_za128(5, slice_base, 0, pg, ptr); + svld1_hor_za128(6, slice_base, 0, pg, ptr); + svld1_hor_za128(7, slice_base, 0, pg, ptr); + svld1_hor_za128(8, slice_base, 0, pg, ptr); + svld1_hor_za128(9, slice_base, 0, pg, ptr); + svld1_hor_za128(10, slice_base, 0, pg, ptr); + svld1_hor_za128(11, slice_base, 0, pg, ptr); + svld1_hor_za128(12, slice_base, 0, pg, ptr); + svld1_hor_za128(13, slice_base, 0, pg, ptr); + svld1_hor_za128(14, slice_base, 0, pg, ptr); + svld1_hor_za128(15, slice_base, 0, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* 
[[PTR]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svld1_ver_za8ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] 
= add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CPP-CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CPP-CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CPP-CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CPP-CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CPP-CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CPP-CHECK-NEXT: call void 
@llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CPP-CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CPP-CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za8(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_ver_za8(0, slice_base, 0, pg, ptr); + svld1_ver_za8(0, slice_base, 1, pg, ptr); + svld1_ver_za8(0, slice_base, 2, pg, ptr); + svld1_ver_za8(0, slice_base, 3, pg, ptr); + svld1_ver_za8(0, slice_base, 4, pg, ptr); + svld1_ver_za8(0, slice_base, 5, pg, ptr); + svld1_ver_za8(0, slice_base, 6, pg, ptr); + svld1_ver_za8(0, slice_base, 7, pg, ptr); + svld1_ver_za8(0, slice_base, 8, pg, ptr); + svld1_ver_za8(0, slice_base, 9, pg, ptr); + svld1_ver_za8(0, slice_base, 10, pg, ptr); + svld1_ver_za8(0, slice_base, 11, pg, ptr); + svld1_ver_za8(0, slice_base, 12, pg, ptr); + svld1_ver_za8(0, slice_base, 13, pg, ptr); + svld1_ver_za8(0, slice_base, 14, pg, ptr); + svld1_ver_za8(0, slice_base, 15, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) 
+// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za16ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void 
@llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za16(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_ver_za16(0, slice_base, 0, pg, ptr); + svld1_ver_za16(0, slice_base, 1, pg, ptr); + svld1_ver_za16(0, slice_base, 2, pg, ptr); + svld1_ver_za16(0, 
slice_base, 3, pg, ptr); + svld1_ver_za16(0, slice_base, 4, pg, ptr); + svld1_ver_za16(0, slice_base, 5, pg, ptr); + svld1_ver_za16(0, slice_base, 6, pg, ptr); + svld1_ver_za16(0, slice_base, 7, pg, ptr); + svld1_ver_za16(1, slice_base, 1, pg, ptr); + svld1_ver_za16(1, slice_base, 2, pg, ptr); + svld1_ver_za16(1, slice_base, 3, pg, ptr); + svld1_ver_za16(1, slice_base, 4, pg, ptr); + svld1_ver_za16(1, slice_base, 5, pg, ptr); + svld1_ver_za16(1, slice_base, 6, pg, ptr); + svld1_ver_za16(1, slice_base, 7, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 
[[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za32ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], 
i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za32(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_ver_za32(0, slice_base, 0, pg, ptr); + svld1_ver_za32(0, slice_base, 1, pg, ptr); + svld1_ver_za32(0, slice_base, 2, pg, ptr); + svld1_ver_za32(0, slice_base, 3, pg, ptr); + svld1_ver_za32(1, slice_base, 0, pg, ptr); + svld1_ver_za32(1, slice_base, 1, pg, ptr); + svld1_ver_za32(1, slice_base, 2, pg, ptr); + svld1_ver_za32(1, slice_base, 3, pg, ptr); + svld1_ver_za32(2, slice_base, 0, pg, ptr); + svld1_ver_za32(2, slice_base, 1, pg, ptr); + svld1_ver_za32(2, slice_base, 2, pg, ptr); + svld1_ver_za32(2, slice_base, 3, pg, ptr); + svld1_ver_za32(3, slice_base, 0, pg, ptr); + svld1_ver_za32(3, slice_base, 1, pg, ptr); + svld1_ver_za32(3, slice_base, 2, pg, ptr); + svld1_ver_za32(3, slice_base, 3, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* 
[[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svld1_ver_za64ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// 
CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za64(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_ver_za64(0, slice_base, 0, pg, ptr); + svld1_ver_za64(0, slice_base, 1, pg, ptr); + svld1_ver_za64(1, slice_base, 0, pg, ptr); + svld1_ver_za64(1, slice_base, 1, pg, ptr); + svld1_ver_za64(2, slice_base, 0, pg, ptr); + svld1_ver_za64(2, slice_base, 1, pg, ptr); + svld1_ver_za64(3, slice_base, 0, pg, ptr); + svld1_ver_za64(3, slice_base, 1, pg, ptr); + svld1_ver_za64(4, slice_base, 0, pg, ptr); + svld1_ver_za64(4, slice_base, 1, pg, ptr); + svld1_ver_za64(5, 
slice_base, 0, pg, ptr); + svld1_ver_za64(5, slice_base, 1, pg, ptr); + svld1_ver_za64(6, slice_base, 0, pg, ptr); + svld1_ver_za64(6, slice_base, 1, pg, ptr); + svld1_ver_za64(7, slice_base, 0, pg, ptr); + svld1_ver_za64(7, slice_base, 1, pg, ptr); +} + +// CHECK-LABEL: @test_svld1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// 
CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svld1_ver_za128ju10__SVBool_tPKv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void 
@llvm.aarch64.sme.ld1q.vert( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svld1_ver_za128(uint32_t slice_base, svbool_t pg, const void *ptr) { + svld1_ver_za128(0, slice_base, 0, pg, ptr); + svld1_ver_za128(1, slice_base, 0, pg, ptr); + svld1_ver_za128(2, slice_base, 0, pg, ptr); + svld1_ver_za128(3, slice_base, 0, pg, ptr); + svld1_ver_za128(4, slice_base, 0, pg, ptr); + svld1_ver_za128(5, slice_base, 0, pg, ptr); + svld1_ver_za128(6, slice_base, 0, pg, ptr); + svld1_ver_za128(7, slice_base, 0, pg, ptr); + svld1_ver_za128(8, slice_base, 0, pg, ptr); + svld1_ver_za128(9, slice_base, 0, pg, ptr); + svld1_ver_za128(10, slice_base, 0, pg, ptr); + svld1_ver_za128(11, slice_base, 0, pg, ptr); + svld1_ver_za128(12, slice_base, 0, pg, ptr); + svld1_ver_za128(13, slice_base, 0, pg, ptr); + svld1_ver_za128(14, slice_base, 0, pg, ptr); + svld1_ver_za128(15, slice_base, 0, pg, ptr); +} diff --git a/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/aarch64-sme-intrinsics/acle_sme_st1.c @@ -0,0 +1,711 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// REQUIRES: aarch64-registered-target +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - %s | FileCheck %s +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -emit-llvm -o - -x c++ %s | FileCheck %s -check-prefix=CPP-CHECK +// RUN: %clang_cc1 -no-opaque-pointers -triple aarch64-none-linux-gnu -target-feature +sme -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -o /dev/null %s + +#include + +// CHECK-LABEL: @test_svst1_hor_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 
[[SLICE_BASE]], 12 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_hor_za8ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// 
CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CPP-CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CPP-CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CPP-CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CPP-CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CPP-CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CPP-CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CPP-CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.horiz( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za8(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_hor_za8(0, slice_base, 0, pg, ptr); + svst1_hor_za8(0, slice_base, 1, pg, ptr); + svst1_hor_za8(0, slice_base, 2, pg, ptr); + svst1_hor_za8(0, slice_base, 3, pg, ptr); + svst1_hor_za8(0, slice_base, 4, pg, ptr); + svst1_hor_za8(0, slice_base, 5, pg, ptr); + svst1_hor_za8(0, slice_base, 6, pg, ptr); + svst1_hor_za8(0, slice_base, 7, pg, ptr); + svst1_hor_za8(0, 
slice_base, 8, pg, ptr); + svst1_hor_za8(0, slice_base, 9, pg, ptr); + svst1_hor_za8(0, slice_base, 10, pg, ptr); + svst1_hor_za8(0, slice_base, 11, pg, ptr); + svst1_hor_za8(0, slice_base, 12, pg, ptr); + svst1_hor_za8(0, slice_base, 13, pg, ptr); + svst1_hor_za8(0, slice_base, 14, pg, ptr); + svst1_hor_za8(0, slice_base, 15, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_hor_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call
void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za16ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void
@llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.horiz( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: ret void +// +// Exercises both tile indices (first argument 0 and 1) with every slice offset (third argument 0-7), including offset 0 on tile 1. +void test_svst1_hor_za16(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_hor_za16(0, slice_base, 0, pg, ptr); + svst1_hor_za16(0, slice_base, 1, pg, ptr); + svst1_hor_za16(0, slice_base, 2, pg, ptr); + svst1_hor_za16(0, slice_base, 3, pg, ptr); + svst1_hor_za16(0, slice_base, 4, pg, ptr); + svst1_hor_za16(0, slice_base, 5, pg, ptr); + svst1_hor_za16(0, slice_base, 6, pg, ptr); + svst1_hor_za16(0, slice_base, 7, pg, ptr); + svst1_hor_za16(1, slice_base, 0, pg, ptr); + svst1_hor_za16(1, slice_base, 1, pg, ptr); + svst1_hor_za16(1, slice_base, 2, pg, ptr); + svst1_hor_za16(1, slice_base, 3, pg, ptr); + svst1_hor_za16(1, slice_base, 4, pg, ptr); + svst1_hor_za16(1, slice_base, 5, pg, ptr); + svst1_hor_za16(1, slice_base, 6, pg, ptr); + svst1_hor_za16(1, slice_base, 7, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_hor_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz(
[[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za32ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: 
[[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.horiz( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za32(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_hor_za32(0, slice_base, 0, pg, ptr); + 
svst1_hor_za32(0, slice_base, 1, pg, ptr); + svst1_hor_za32(0, slice_base, 2, pg, ptr); + svst1_hor_za32(0, slice_base, 3, pg, ptr); + svst1_hor_za32(1, slice_base, 0, pg, ptr); + svst1_hor_za32(1, slice_base, 1, pg, ptr); + svst1_hor_za32(1, slice_base, 2, pg, ptr); + svst1_hor_za32(1, slice_base, 3, pg, ptr); + svst1_hor_za32(2, slice_base, 0, pg, ptr); + svst1_hor_za32(2, slice_base, 1, pg, ptr); + svst1_hor_za32(2, slice_base, 2, pg, ptr); + svst1_hor_za32(2, slice_base, 3, pg, ptr); + svst1_hor_za32(3, slice_base, 0, pg, ptr); + svst1_hor_za32(3, slice_base, 1, pg, ptr); + svst1_hor_za32(3, slice_base, 2, pg, ptr); + svst1_hor_za32(3, slice_base, 3, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_hor_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], 
i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_hor_za64ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( 
[[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.horiz( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za64(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_hor_za64(0, slice_base, 0, pg, ptr); + svst1_hor_za64(0, slice_base, 1, pg, ptr); + svst1_hor_za64(1, slice_base, 0, pg, ptr); + svst1_hor_za64(1, slice_base, 1, pg, ptr); + svst1_hor_za64(2, slice_base, 0, pg, ptr); + svst1_hor_za64(2, slice_base, 1, pg, ptr); + svst1_hor_za64(3, slice_base, 0, pg, ptr); + svst1_hor_za64(3, slice_base, 1, pg, ptr); + svst1_hor_za64(4, slice_base, 0, pg, ptr); + svst1_hor_za64(4, slice_base, 1, pg, ptr); + svst1_hor_za64(5, slice_base, 0, pg, ptr); + svst1_hor_za64(5, slice_base, 1, pg, ptr); + svst1_hor_za64(6, slice_base, 0, pg, ptr); + svst1_hor_za64(6, slice_base, 1, pg, ptr); + svst1_hor_za64(7, slice_base, 0, pg, ptr); + svst1_hor_za64(7, slice_base, 1, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_hor_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// 
CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_hor_za128ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* 
[[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.horiz( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_hor_za128(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_hor_za128(0, slice_base, 0, pg, ptr); + svst1_hor_za128(1, slice_base, 0, pg, ptr); + svst1_hor_za128(2, slice_base, 0, pg, ptr); + svst1_hor_za128(3, slice_base, 0, pg, ptr); + svst1_hor_za128(4, slice_base, 0, pg, ptr); + svst1_hor_za128(5, slice_base, 0, pg, ptr); + svst1_hor_za128(6, slice_base, 0, pg, ptr); + svst1_hor_za128(7, slice_base, 0, pg, ptr); + svst1_hor_za128(8, slice_base, 0, pg, ptr); + svst1_hor_za128(9, slice_base, 0, pg, ptr); + svst1_hor_za128(10, slice_base, 0, pg, ptr); + svst1_hor_za128(11, slice_base, 0, pg, ptr); + svst1_hor_za128(12, slice_base, 0, pg, ptr); + svst1_hor_za128(13, 
slice_base, 0, pg, ptr); + svst1_hor_za128(14, slice_base, 0, pg, ptr); + svst1_hor_za128(15, slice_base, 0, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_ver_za8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CHECK-NEXT: [[TILESLICE11:%.*]] 
= add i32 [[SLICE_BASE]], 11 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z18test_svst1_ver_za8ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG:%.*]], i8* [[PTR:%.*]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// 
CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: [[TILESLICE8:%.*]] = add i32 [[SLICE_BASE]], 8 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE8]]) +// CPP-CHECK-NEXT: [[TILESLICE9:%.*]] = add i32 [[SLICE_BASE]], 9 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE9]]) +// CPP-CHECK-NEXT: [[TILESLICE10:%.*]] = add i32 [[SLICE_BASE]], 10 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE10]]) +// CPP-CHECK-NEXT: [[TILESLICE11:%.*]] = add i32 [[SLICE_BASE]], 11 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE11]]) +// CPP-CHECK-NEXT: [[TILESLICE12:%.*]] = add i32 [[SLICE_BASE]], 12 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE12]]) +// CPP-CHECK-NEXT: [[TILESLICE13:%.*]] = add i32 [[SLICE_BASE]], 13 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE13]]) +// CPP-CHECK-NEXT: [[TILESLICE14:%.*]] = add i32 [[SLICE_BASE]], 14 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE14]]) +// CPP-CHECK-NEXT: [[TILESLICE15:%.*]] = add i32 [[SLICE_BASE]], 15 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1b.vert( [[PG]], i8* [[PTR]], i64 0, i32 [[TILESLICE15]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za8(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_ver_za8(0, slice_base, 0, pg, ptr); + svst1_ver_za8(0, slice_base, 1, pg, ptr); + svst1_ver_za8(0, slice_base, 2, pg, ptr); + svst1_ver_za8(0, slice_base, 3, pg, ptr); + svst1_ver_za8(0, slice_base, 
4, pg, ptr); + svst1_ver_za8(0, slice_base, 5, pg, ptr); + svst1_ver_za8(0, slice_base, 6, pg, ptr); + svst1_ver_za8(0, slice_base, 7, pg, ptr); + svst1_ver_za8(0, slice_base, 8, pg, ptr); + svst1_ver_za8(0, slice_base, 9, pg, ptr); + svst1_ver_za8(0, slice_base, 10, pg, ptr); + svst1_ver_za8(0, slice_base, 11, pg, ptr); + svst1_ver_za8(0, slice_base, 12, pg, ptr); + svst1_ver_za8(0, slice_base, 13, pg, ptr); + svst1_ver_za8(0, slice_base, 14, pg, ptr); + svst1_ver_za8(0, slice_base, 15, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_ver_za16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE6]]) +// CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]],
i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za16ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i16* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG:%.*]], i16* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: [[TILESLICE4:%.*]] = add i32 [[SLICE_BASE]], 4 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: [[TILESLICE5:%.*]] = add i32 [[SLICE_BASE]], 5 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: [[TILESLICE6:%.*]] = add i32 [[SLICE_BASE]], 6 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32
[[TILESLICE6]]) +// CPP-CHECK-NEXT: [[TILESLICE7:%.*]] = add i32 [[SLICE_BASE]], 7 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 0, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE4]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE5]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE6]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1h.vert( [[PG]], i16* [[TMP0]], i64 1, i32 [[TILESLICE7]]) +// CPP-CHECK-NEXT: ret void +// +// Exercises both tile indices (first argument 0 and 1) with every slice offset (third argument 0-7), including offset 0 on tile 1. +void test_svst1_ver_za16(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_ver_za16(0, slice_base, 0, pg, ptr); + svst1_ver_za16(0, slice_base, 1, pg, ptr); + svst1_ver_za16(0, slice_base, 2, pg, ptr); + svst1_ver_za16(0, slice_base, 3, pg, ptr); + svst1_ver_za16(0, slice_base, 4, pg, ptr); + svst1_ver_za16(0, slice_base, 5, pg, ptr); + svst1_ver_za16(0, slice_base, 6, pg, ptr); + svst1_ver_za16(0, slice_base, 7, pg, ptr); + svst1_ver_za16(1, slice_base, 0, pg, ptr); + svst1_ver_za16(1, slice_base, 1, pg, ptr); + svst1_ver_za16(1, slice_base, 2, pg, ptr); + svst1_ver_za16(1, slice_base, 3, pg, ptr); + svst1_ver_za16(1, slice_base, 4, pg, ptr); + svst1_ver_za16(1, slice_base, 5, pg, ptr); + svst1_ver_za16(1, slice_base, 6, pg, ptr); + svst1_ver_za16(1, slice_base, 7, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_ver_za32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT:
[[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za32ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i32* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( 
[[PG:%.*]], i32* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: [[TILESLICE2:%.*]] = add i32 [[SLICE_BASE]], 2 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: [[TILESLICE3:%.*]] = add i32 [[SLICE_BASE]], 3 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 0, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 1, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 2, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE2]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1w.vert( [[PG]], i32* [[TMP0]], i64 3, i32 [[TILESLICE3]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za32(uint32_t slice_base, svbool_t pg, void *ptr) { 
+ svst1_ver_za32(0, slice_base, 0, pg, ptr); + svst1_ver_za32(0, slice_base, 1, pg, ptr); + svst1_ver_za32(0, slice_base, 2, pg, ptr); + svst1_ver_za32(0, slice_base, 3, pg, ptr); + svst1_ver_za32(1, slice_base, 0, pg, ptr); + svst1_ver_za32(1, slice_base, 1, pg, ptr); + svst1_ver_za32(1, slice_base, 2, pg, ptr); + svst1_ver_za32(1, slice_base, 3, pg, ptr); + svst1_ver_za32(2, slice_base, 0, pg, ptr); + svst1_ver_za32(2, slice_base, 1, pg, ptr); + svst1_ver_za32(2, slice_base, 2, pg, ptr); + svst1_ver_za32(2, slice_base, 3, pg, ptr); + svst1_ver_za32(3, slice_base, 0, pg, ptr); + svst1_ver_za32(3, slice_base, 1, pg, ptr); + svst1_ver_za32(3, slice_base, 2, pg, ptr); + svst1_ver_za32(3, slice_base, 3, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_ver_za64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void 
@llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z19test_svst1_ver_za64ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i64* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG:%.*]], i64* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: [[TILESLICE1:%.*]] = add i32 [[SLICE_BASE]], 1 +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 0, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 1, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 2, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 3, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 4, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void 
@llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 5, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 6, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1d.vert( [[PG]], i64* [[TMP0]], i64 7, i32 [[TILESLICE1]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za64(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_ver_za64(0, slice_base, 0, pg, ptr); + svst1_ver_za64(0, slice_base, 1, pg, ptr); + svst1_ver_za64(1, slice_base, 0, pg, ptr); + svst1_ver_za64(1, slice_base, 1, pg, ptr); + svst1_ver_za64(2, slice_base, 0, pg, ptr); + svst1_ver_za64(2, slice_base, 1, pg, ptr); + svst1_ver_za64(3, slice_base, 0, pg, ptr); + svst1_ver_za64(3, slice_base, 1, pg, ptr); + svst1_ver_za64(4, slice_base, 0, pg, ptr); + svst1_ver_za64(4, slice_base, 1, pg, ptr); + svst1_ver_za64(5, slice_base, 0, pg, ptr); + svst1_ver_za64(5, slice_base, 1, pg, ptr); + svst1_ver_za64(6, slice_base, 0, pg, ptr); + svst1_ver_za64(6, slice_base, 1, pg, ptr); + svst1_ver_za64(7, slice_base, 0, pg, ptr); + svst1_ver_za64(7, slice_base, 1, pg, ptr); +} + +// CHECK-LABEL: @test_svst1_ver_za128( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 3, i32 
[[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CHECK-NEXT: ret void +// +// CPP-CHECK-LABEL: @_Z20test_svst1_ver_za128ju10__SVBool_tPv( +// CPP-CHECK-NEXT: entry: +// CPP-CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[PTR:%.*]] to i128* +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG:%.*]], i128* [[TMP0]], i64 0, i32 [[SLICE_BASE:%.*]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 1, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 2, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 3, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], 
i128* [[TMP0]], i64 4, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 5, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 6, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 7, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 8, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 9, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 10, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 11, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 12, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 13, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 14, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: call void @llvm.aarch64.sme.st1q.vert( [[PG]], i128* [[TMP0]], i64 15, i32 [[SLICE_BASE]]) +// CPP-CHECK-NEXT: ret void +// +void test_svst1_ver_za128(uint32_t slice_base, svbool_t pg, void *ptr) { + svst1_ver_za128(0, slice_base, 0, pg, ptr); + svst1_ver_za128(1, slice_base, 0, pg, ptr); + svst1_ver_za128(2, slice_base, 0, pg, ptr); + svst1_ver_za128(3, slice_base, 0, pg, ptr); + svst1_ver_za128(4, slice_base, 0, pg, ptr); + svst1_ver_za128(5, slice_base, 0, pg, ptr); + svst1_ver_za128(6, slice_base, 0, pg, ptr); + svst1_ver_za128(7, slice_base, 0, pg, ptr); + svst1_ver_za128(8, slice_base, 0, pg, ptr); + svst1_ver_za128(9, slice_base, 0, pg, ptr); + svst1_ver_za128(10, slice_base, 0, pg, ptr); + svst1_ver_za128(11, slice_base, 0, pg, ptr); + svst1_ver_za128(12, slice_base, 0, pg, ptr); + svst1_ver_za128(13, 
slice_base, 0, pg, ptr); + svst1_ver_za128(14, slice_base, 0, pg, ptr); + svst1_ver_za128(15, slice_base, 0, pg, ptr); +} diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -334,6 +334,9 @@ /// Emit arm_sve.h. void createHeader(raw_ostream &o); + /// Emit arm_sme.h. + void createSMEHeader(raw_ostream &o); + /// Emit all the __builtin prototypes and code needed by Sema. void createBuiltins(raw_ostream &o); @@ -347,7 +350,9 @@ void createTypeFlags(raw_ostream &o); /// Create intrinsic and add it to \p Out - void createIntrinsic(Record *R, SmallVectorImpl> &Out); + void createIntrinsic(Record *R, + SmallVectorImpl> &Out, + bool IsSME = false); }; } // end anonymous namespace @@ -757,6 +762,11 @@ NumVectors = 0; Signed = true; break; + case '%': + Pointer = true; + Void = true; + NumVectors = 0; + break; case 'A': Pointer = true; ElementBitwidth = Bitwidth = 8; @@ -989,7 +999,7 @@ } void SVEEmitter::createIntrinsic( - Record *R, SmallVectorImpl> &Out) { + Record *R, SmallVectorImpl> &Out, bool IsSME) { StringRef Name = R->getValueAsString("Name"); StringRef Proto = R->getValueAsString("Prototype"); StringRef Types = R->getValueAsString("Types"); @@ -1005,6 +1015,9 @@ for (auto FlagRec : FlagsList) Flags |= FlagRec->getValueAsInt("Value"); + bool SMEFlag = Flags & getEnumValueForFlag("IsSME"); + if (SMEFlag != IsSME) + return; // Create a dummy TypeSpec for non-overloaded builtins. 
if (Types.empty()) { assert((Flags & getEnumValueForFlag("IsOverloadNone")) && @@ -1286,11 +1299,85 @@ OS << "#endif /* __ARM_SVE_H */\n"; } +void SVEEmitter::createSMEHeader(raw_ostream &OS) { + OS << "/*===---- arm_sme.h - ARM SME intrinsics " + "-----------------------------------===\n" + " *\n" + " *\n" + " * Part of the LLVM Project, under the Apache License v2.0 with LLVM " + "Exceptions.\n" + " * See https://llvm.org/LICENSE.txt for license information.\n" + " * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception\n" + " *\n" + " *===-----------------------------------------------------------------" + "------===\n" + " */\n\n"; + + OS << "#ifndef __ARM_SME_H\n"; + OS << "#define __ARM_SME_H\n\n"; + + OS << "#if !defined(__ARM_FEATURE_SME)\n"; + OS << "#error \"SME support not enabled\"\n"; + OS << "#else\n\n"; + + OS << "#include \n\n"; + + OS << "#ifdef __cplusplus\n"; + OS << "extern \"C\" {\n"; + OS << "#endif\n\n"; + + SmallVector, 128> Defs; + std::vector RV = Records.getAllDerivedDefinitions("Inst"); + for (auto *R : RV) + createIntrinsic(R, Defs, true); + + // Sort intrinsics in header file by following order/priority similar to SVE: + // - Architectural guard + // - Class (is intrinsic overloaded or not) + // - Intrinsic name + std::stable_sort(Defs.begin(), Defs.end(), + [](const std::unique_ptr &A, + const std::unique_ptr &B) { + auto ToTuple = [](const std::unique_ptr &I) { + return std::make_tuple(I->getGuard(), + (unsigned)I->getClassKind(), + I->getName()); + }; + return ToTuple(A) < ToTuple(B); + }); + + StringRef InGuard = ""; + for (auto &I : Defs) { + // Emit #endif/#if pair if needed. + if (I->getGuard() != InGuard) { + if (!InGuard.empty()) + OS << "#endif //" << InGuard << "\n"; + InGuard = I->getGuard(); + if (!InGuard.empty()) + OS << "\n#if " << InGuard << "\n"; + } + + // Actually emit the intrinsic declaration. 
+ I->emitIntrinsic(OS); + } + + if (!InGuard.empty()) + OS << "#endif //" << InGuard << "\n"; + + OS << "#ifdef __cplusplus\n"; + OS << "} // extern \"C\"\n"; + OS << "#endif\n\n"; + OS << "#endif /*__ARM_FEATURE_SME */\n\n"; + OS << "#endif /* __ARM_SME_H */\n"; +} + void SVEEmitter::createBuiltins(raw_ostream &OS) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; - for (auto *R : RV) + for (auto *R : RV) { createIntrinsic(R, Defs); + createIntrinsic(R, Defs, true); + } // The mappings must be sorted based on BuiltinID. llvm::sort(Defs, [](const std::unique_ptr &A, @@ -1320,8 +1407,10 @@ void SVEEmitter::createCodeGenMap(raw_ostream &OS) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; - for (auto *R : RV) + for (auto *R : RV) { createIntrinsic(R, Defs); + createIntrinsic(R, Defs, true); + } // The mappings must be sorted based on BuiltinID. llvm::sort(Defs, [](const std::unique_ptr &A, @@ -1353,8 +1442,10 @@ void SVEEmitter::createRangeChecks(raw_ostream &OS) { std::vector RV = Records.getAllDerivedDefinitions("Inst"); SmallVector, 128> Defs; - for (auto *R : RV) + for (auto *R : RV) { createIntrinsic(R, Defs); + createIntrinsic(R, Defs, true); + } // The mappings must be sorted based on BuiltinID. 
llvm::sort(Defs, [](const std::unique_ptr &A, @@ -1418,6 +1509,10 @@ SVEEmitter(Records).createHeader(OS); } +void EmitSmeHeader(RecordKeeper &Records, raw_ostream &OS) { + SVEEmitter(Records).createSMEHeader(OS); +} + void EmitSveBuiltins(RecordKeeper &Records, raw_ostream &OS) { SVEEmitter(Records).createBuiltins(OS); } diff --git a/clang/utils/TableGen/TableGen.cpp b/clang/utils/TableGen/TableGen.cpp --- a/clang/utils/TableGen/TableGen.cpp +++ b/clang/utils/TableGen/TableGen.cpp @@ -80,6 +80,7 @@ GenArmSveBuiltinCG, GenArmSveTypeFlags, GenArmSveRangeChecks, + GenArmSmeHeader, GenArmCdeHeader, GenArmCdeBuiltinDef, GenArmCdeBuiltinSema, @@ -217,6 +218,8 @@ "Generate arm_sve_typeflags.inc for clang"), clEnumValN(GenArmSveRangeChecks, "gen-arm-sve-sema-rangechecks", "Generate arm_sve_sema_rangechecks.inc for clang"), + clEnumValN(GenArmSmeHeader, "gen-arm-sme-header", + "Generate arm_sme.h for clang"), clEnumValN(GenArmMveHeader, "gen-arm-mve-header", "Generate arm_mve.h for clang"), clEnumValN(GenArmMveBuiltinDef, "gen-arm-mve-builtin-def", @@ -434,6 +437,9 @@ case GenArmSveRangeChecks: EmitSveRangeChecks(Records, OS); break; + case GenArmSmeHeader: + EmitSmeHeader(Records, OS); + break; case GenArmCdeHeader: EmitCdeHeader(Records, OS); break; diff --git a/clang/utils/TableGen/TableGenBackends.h b/clang/utils/TableGen/TableGenBackends.h --- a/clang/utils/TableGen/TableGenBackends.h +++ b/clang/utils/TableGen/TableGenBackends.h @@ -101,6 +101,8 @@ void EmitSveTypeFlags(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitSveRangeChecks(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); +void EmitSmeHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); + void EmitMveHeader(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinDef(llvm::RecordKeeper &Records, llvm::raw_ostream &OS); void EmitMveBuiltinSema(llvm::RecordKeeper &Records, llvm::raw_ostream &OS);