diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -72,6 +72,124 @@ } // loop over half = "b", "t" +multiclass contiguous_load same_size, list wider> { + // Intrinsics named with explicit memory and element sizes that match: + // vldrbq_?8, vldrhq_?16, vldrwq_?32. + let params = same_size, pnt = PNT_None in { + def: Intrinsic>:$addr), + (load (address (CPtr $addr), !srl(memtype.size,3)))>, + NameOverride; + def: Intrinsic>:$addr, + Predicate:$pred), + (IRIntBase<"masked_load", [Vector, CPtr]> + (CPtr $addr), !srl(memtype.size,3), + $pred, (zeroinit Vector))>, + NameOverride; + } + + // Synonyms for the above, with the generic name vld1q that just means + // 'memory and element sizes match', and allows convenient polymorphism with + // the memory and element types covariant. + let params = same_size in { + def: Intrinsic>:$addr), + (load (address (CPtr $addr), !srl(memtype.size,3)))>, + NameOverride<"vld1q">; + def: Intrinsic>:$addr, + Predicate:$pred), + (IRIntBase<"masked_load", [Vector, CPtr]> + (CPtr $addr), !srl(memtype.size,3), + $pred, (zeroinit Vector))>, + NameOverride<"vld1q_z">; + } + + // Intrinsics with the memory size narrower than the vector element, so that + // they load less than 128 bits of memory and sign/zero extend each loaded + // value into a wider vector lane. + let params = wider, pnt = PNT_None in { + def: Intrinsic>:$addr), + (extend (load (address (CPtr> + $addr), !srl(memtype.size,3))), + Vector, (unsignedflag Scalar))>, + NameOverride; + def: Intrinsic>:$addr, + Predicate:$pred), + (extend (IRIntBase<"masked_load", + [NarrowedVecOf, + CPtr>]> + (CPtr> $addr), + !srl(memtype.size,3), $pred, + (zeroinit NarrowedVecOf)), + Vector, (unsignedflag Scalar))>, + NameOverride; + } +} + +defm: contiguous_load<"vldrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>; +defm: contiguous_load<"vldrhq", u16, T.All16, T.Int32>; +defm: contiguous_load<"vldrwq", u32, T.All32, []>; + +multiclass contiguous_store same_size, list wider> { + // Intrinsics named with explicit memory and element sizes that match: + // vstrbq_?8, vstrhq_?16, vstrwq_?32. + let params = same_size in { + def: Intrinsic>:$addr, + Vector:$value), + (store $value, + (address (Ptr $addr), !srl(memtype.size,3)))>, + NameOverride; + def: Intrinsic>:$addr, + Vector:$value, Predicate:$pred), + (IRIntBase<"masked_store", [Vector, Ptr]> + $value, (Ptr $addr), + !srl(memtype.size,3), $pred)>, + NameOverride; + } + + // Synonyms for the above, with the generic name vst1q that just means + // 'memory and element sizes match', and allows convenient polymorphism with + // the memory and element types covariant. + let params = same_size in { + def: Intrinsic>:$addr, + Vector:$value), + (store $value, + (address (Ptr $addr), !srl(memtype.size,3)))>, + NameOverride<"vst1q">; + def: Intrinsic>:$addr, + Vector:$value, Predicate:$pred), + (IRIntBase<"masked_store", [Vector, Ptr]> + $value, (Ptr $addr), + !srl(memtype.size,3), $pred)>, + NameOverride<"vst1q_p">; + } + + // Intrinsics with the memory size narrower than the vector element, so that + // they store less than 128 bits of memory, truncating each vector lane into + // a narrower value to store. 
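  // For example, vstrbq_s16 truncates each lane of a <8 x i16> vector to i8
  // and stores only 64 bits of memory. A rough sketch of the IR this is
  // expected to lower to (matching the clang codegen tests later in this
  // patch, with %base and %value standing for the intrinsic's pointer and
  // vector arguments):
  //   %0 = trunc <8 x i16> %value to <8 x i8>
  //   %1 = bitcast i8* %base to <8 x i8>*
  //   store <8 x i8> %0, <8 x i8>* %1, align 1
  // The predicated form vstrbq_p_s16 instead emits
  // @llvm.masked.store.v8i8.p0v8i8 with the same truncated value and a
  // predicate converted from the i16 argument via @llvm.arm.mve.pred.i2v.v8i1.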
+ let params = wider in { + def: Intrinsic>:$addr, + Vector:$value), + (store (trunc $value, NarrowedVecOf), + (address (Ptr> $addr), + !srl(memtype.size,3)))>, + NameOverride; + def: Intrinsic>:$addr, + Vector:$value, Predicate:$pred), + (IRIntBase<"masked_store", + [NarrowedVecOf, + Ptr>]> + (trunc $value, NarrowedVecOf), + (Ptr> $addr), + !srl(memtype.size,3), $pred)>, + NameOverride; + } +} + +defm: contiguous_store<"vstrbq", u8, T.All8, !listconcat(T.Int16, T.Int32)>; +defm: contiguous_store<"vstrhq", u16, T.All16, T.Int32>; +defm: contiguous_store<"vstrwq", u32, T.All32, []>; + multiclass gather_base types, int size> { let params = types, pnt = PNT_None in { def _gather_base: Intrinsic< diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -34,6 +34,16 @@ string func = func_; // the method name list address_params = []; // indices of parameters with type Address list int_constant_params = []; // indices of plain integer parameters + + // is_helper_function = 1 indicates that the operation is implemented by a + // static C++ helper function in CGBuiltin.cpp instead of by a method of + // IRBuilder itself, so the call syntax is func(Builder, ...) instead of + // Builder.func(...). + bit is_helper_function = 0; + + // is_static = 1 indicates that the Builder is not needed at all, so that + // the function call syntax is simply func(...). + bit is_static = 0; } def add: IRBuilder<"CreateAdd">; def or: IRBuilder<"CreateOr">; @@ -46,12 +56,24 @@ def load: IRBuilder<"CreateLoad"> { let address_params = [0]; } def store: IRBuilder<"CreateStore"> { let address_params = [1]; } def xval: IRBuilder<"CreateExtractValue"> { let int_constant_params = [1]; } +def trunc: IRBuilder<"CreateTrunc">; +def extend: IRBuilder<"SignOrZeroExtend"> { + let is_helper_function = 1; + let int_constant_params = [2]; +} +def zeroinit: IRBuilder<"llvm::Constant::getNullValue"> { + let is_static = 1; +} + +// A node that makes an Address out of a pointer-typed Value, by +// providing an alignment as the second argument. +def address; // Another node class you can use in the codegen dag. This one corresponds to // an IR intrinsic function, which has to be specialized to a particular list // of types. -class IRInt params_ = [], bit appendKind_ = 0> { - string intname = name_; // base name of the intrinsic, minus "arm_mve_" +class IRIntBase params_ = [], bit appendKind_ = 0> { + string intname = name_; // base name of the intrinsic list params = params_; // list of parameter types // If this flag is set, then the IR intrinsic name will get a suffix _s, _u @@ -67,6 +89,11 @@ bit appendKind = appendKind_; } +// Mostly we'll be using @llvm.arm.mve.* intrinsics, so here's a trivial +// subclass that puts on that prefix. +class IRInt params = [], bit appendKind = 0> + : IRIntBase<"arm_mve_" # name, params, appendKind>; + // The 'seq' node in a codegen dag specifies a set of IR operations to be // performed in order. It has the special ability to define extra variable // names, on top of the ones that refer to the intrinsic's parameters. For @@ -151,6 +178,14 @@ // is. class VecOf: ComplexType<(CTO_Vec t)>; +// NarrowedVecOf expects t to be a scalar type, and v to be a vector +// type. It returns a vector type whose element type is t, and whose lane +// count is the same as the lane count of v. 
(Used as an intermediate value +// type in the IR representation of a widening load: you load a vector of +// small things out of memory, and then zext/sext them into a full 128-bit +// output vector.) +class NarrowedVecOf: ComplexType<(CTO_Vec t, v)>; + // PredOf expects t to be a scalar, and expands to a predicate vector which // (logically speaking) has the same number of lanes as VecOf would. class PredOf: ComplexType<(CTO_Pred t)>; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -6786,6 +6786,13 @@ } } +static llvm::Value *SignOrZeroExtend(CGBuilderTy &Builder, llvm::Value *V, + llvm::Type *T, bool Unsigned) { + // Helper function called by Tablegen-constructed ARM MVE builtin codegen, + // which finds it convenient to specify signed/unsigned as a boolean flag. + return Unsigned ? Builder.CreateZExt(V, T) : Builder.CreateSExt(V, T); +} + Value *CodeGenFunction::EmitARMMVEBuiltinExpr(unsigned BuiltinID, const CallExpr *E, ReturnValueSlot ReturnValue, diff --git a/clang/test/CodeGen/arm-mve-intrinsics/load-store.c b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/load-store.c @@ -0,0 +1,1325 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vld1q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2 +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_vld1q_f16(const float16_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_f16(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_vld1q_f32(const float32_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_f32(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vld1q_s8(const int8_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_s8(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vld1q_s16(const int16_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_s16(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: 
[[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_vld1q_s32(const int32_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_s32(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vld1q_u8(const uint8_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_u8(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vld1q_u16(const uint16_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_u16(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vld1q_u32(const uint32_t *base) +{ +#ifdef POLYMORPHIC + return vld1q(base); +#else /* POLYMORPHIC */ + return vld1q_u32(base); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer) +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// +float16x8_t test_vld1q_z_f16(const float16_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_f16(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer) +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vld1q_z_f32(const float32_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_f32(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vld1q_z_s8(const int8_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return 
vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_s8(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vld1q_z_s16(const int16_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_s16(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +int32x4_t test_vld1q_z_s32(const int32_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_s32(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vld1q_z_u8(const uint8_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_u8(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vld1q_z_u16(const uint16_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_u16(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vld1q_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vld1q_z_u32(const uint32_t *base, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vld1q_z(base, p); +#else /* POLYMORPHIC */ + return vld1q_z_u32(base, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = 
bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +int8x16_t test_vldrbq_s8(const int8_t *base) +{ + return vldrbq_s8(base); +} + +// CHECK-LABEL: @test_vldrbq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = sext <8 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrbq_s16(const int8_t *base) +{ + return vldrbq_s16(base); +} + +// CHECK-LABEL: @test_vldrbq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrbq_s32(const int8_t *base) +{ + return vldrbq_s32(base); +} + +// CHECK-LABEL: @test_vldrbq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <16 x i8>, <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vldrbq_u8(const uint8_t *base) +{ + return vldrbq_u8(base); +} + +// CHECK-LABEL: @test_vldrbq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i8>, <8 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext <8 x i8> [[TMP1]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrbq_u16(const uint8_t *base) +{ + return vldrbq_u16(base); +} + +// CHECK-LABEL: @test_vldrbq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i8>, <4 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrbq_u32(const uint8_t *base) +{ + return vldrbq_u32(base); +} + +// CHECK-LABEL: @test_vldrbq_z_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +int8x16_t test_vldrbq_z_s8(const int8_t *base, mve_pred16_t p) +{ + return vldrbq_z_s8(base, p); +} + +// CHECK-LABEL: @test_vldrbq_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = sext <8 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// +int16x8_t test_vldrbq_z_s16(const int8_t *base, mve_pred16_t p) +{ + return vldrbq_z_s16(base, p); +} + +// CHECK-LABEL: @test_vldrbq_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: 
[[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// +int32x4_t test_vldrbq_z_s32(const int8_t *base, mve_pred16_t p) +{ + return vldrbq_z_s32(base, p); +} + +// CHECK-LABEL: @test_vldrbq_z_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]], <16 x i8> zeroinitializer) +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vldrbq_z_u8(const uint8_t *base, mve_pred16_t p) +{ + return vldrbq_z_u8(base, p); +} + +// CHECK-LABEL: @test_vldrbq_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* [[TMP0]], i32 1, <8 x i1> [[TMP2]], <8 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = zext <8 x i8> [[TMP3]] to <8 x i16> +// CHECK-NEXT: ret <8 x i16> [[TMP4]] +// +uint16x8_t test_vldrbq_z_u16(const uint8_t *base, mve_pred16_t p) +{ + return vldrbq_z_u16(base, p); +} + +// CHECK-LABEL: @test_vldrbq_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* [[TMP0]], i32 1, <4 x i1> [[TMP2]], <4 x i8> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i8> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// +uint32x4_t test_vldrbq_z_u32(const uint8_t *base, mve_pred16_t p) +{ + return vldrbq_z_u32(base, p); +} + +// CHECK-LABEL: @test_vldrhq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x half>, <8 x half>* [[TMP0]], align 2 +// CHECK-NEXT: ret <8 x half> [[TMP1]] +// +float16x8_t test_vldrhq_f16(const float16_t *base) +{ + return vldrhq_f16(base); +} + +// CHECK-LABEL: @test_vldrhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +int16x8_t test_vldrhq_s16(const int16_t *base) +{ + return vldrhq_s16(base); +} + +// CHECK-LABEL: @test_vldrhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = sext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrhq_s32(const int16_t *base) +{ + return vldrhq_s32(base); +} + +// CHECK-LABEL: @test_vldrhq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], 
align 2 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vldrhq_u16(const uint16_t *base) +{ + return vldrhq_u16(base); +} + +// CHECK-LABEL: @test_vldrhq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i16>, <4 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: [[TMP2:%.*]] = zext <4 x i16> [[TMP1]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrhq_u32(const uint16_t *base) +{ + return vldrhq_u32(base); +} + +// CHECK-LABEL: @test_vldrhq_z_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x half> zeroinitializer) +// CHECK-NEXT: ret <8 x half> [[TMP3]] +// +float16x8_t test_vldrhq_z_f16(const float16_t *base, mve_pred16_t p) +{ + return vldrhq_z_f16(base, p); +} + +// CHECK-LABEL: @test_vldrhq_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +int16x8_t test_vldrhq_z_s16(const int16_t *base, mve_pred16_t p) +{ + return vldrhq_z_s16(base, p); +} + +// CHECK-LABEL: @test_vldrhq_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = sext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// +int32x4_t test_vldrhq_z_s32(const int16_t *base, mve_pred16_t p) +{ + return vldrhq_z_s32(base, p); +} + +// CHECK-LABEL: @test_vldrhq_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]], <8 x i16> zeroinitializer) +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vldrhq_z_u16(const uint16_t *base, mve_pred16_t p) +{ + return vldrhq_z_u16(base, p); +} + +// CHECK-LABEL: @test_vldrhq_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* [[TMP0]], i32 2, <4 x i1> [[TMP2]], <4 x i16> zeroinitializer) +// CHECK-NEXT: [[TMP4:%.*]] = zext <4 x i16> [[TMP3]] to <4 x i32> +// CHECK-NEXT: ret <4 x i32> [[TMP4]] +// +uint32x4_t test_vldrhq_z_u32(const uint16_t *base, mve_pred16_t p) +{ + return 
vldrhq_z_u32(base, p); +} + +// CHECK-LABEL: @test_vldrwq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, <4 x float>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x float> [[TMP1]] +// +float32x4_t test_vldrwq_f32(const float32_t *base) +{ + return vldrwq_f32(base); +} + +// CHECK-LABEL: @test_vldrwq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +int32x4_t test_vldrwq_s32(const int32_t *base) +{ + return vldrwq_s32(base); +} + +// CHECK-LABEL: @test_vldrwq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vldrwq_u32(const uint32_t *base) +{ + return vldrwq_u32(base); +} + +// CHECK-LABEL: @test_vldrwq_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x float> zeroinitializer) +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vldrwq_z_f32(const float32_t *base, mve_pred16_t p) +{ + return vldrwq_z_f32(base, p); +} + +// CHECK-LABEL: @test_vldrwq_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +int32x4_t test_vldrwq_z_s32(const int32_t *base, mve_pred16_t p) +{ + return vldrwq_z_s32(base, p); +} + +// CHECK-LABEL: @test_vldrwq_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]], <4 x i32> zeroinitializer) +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vldrwq_z_u32(const uint32_t *base, mve_pred16_t p) +{ + return vldrwq_z_u32(base, p); +} + +// CHECK-LABEL: @test_vst1q_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vst1q_f16(float16_t *base, float16x8_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_f16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +void test_vst1q_f32(float32_t *base, float32x4_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else 
/* POLYMORPHIC */ + vst1q_f32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret void +// +void test_vst1q_s8(int8_t *base, int8x16_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_s8(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vst1q_s16(int16_t *base, int16x8_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_s16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +void test_vst1q_s32(int32_t *base, int32x4_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_s32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret void +// +void test_vst1q_u8(uint8_t *base, uint8x16_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_u8(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vst1q_u16(uint16_t *base, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_u16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +void test_vst1q_u32(uint32_t *base, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vst1q(base, value); +#else /* POLYMORPHIC */ + vst1q_u32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_f16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> 
[[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_f32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_s8(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_s16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_s32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_u8(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_u16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vst1q_p_u32( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vst1q_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vst1q_p(base, value, p); +#else /* POLYMORPHIC */ + vst1q_p_u32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_s8(int8_t *base, int8x16_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_s8(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_s16(int8_t *base, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_s16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_s32(int8_t *base, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_s32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: store <16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_u8(uint8_t *base, uint8x16_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_u8(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: store <8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_u16(uint8_t *base, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_u16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: store <4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], align 1 +// CHECK-NEXT: ret void +// +void test_vstrbq_u32(uint8_t *base, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq(base, value); +#else /* POLYMORPHIC */ + vstrbq_u32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// 
CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_s8(int8_t *base, int8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_s8(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_s16(int8_t *base, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_s16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_s32(int8_t *base, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_s32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i8* [[BASE:%.*]] to <16 x i8>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> [[VALUE:%.*]], <16 x i8>* [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_u8(uint8_t *base, uint8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_u8(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <8 x i16> [[VALUE:%.*]] to <8 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <8 x i8>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> [[TMP0]], <8 x i8>* [[TMP1]], i32 1, <8 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_u16(uint8_t *base, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_u16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i8* [[BASE:%.*]] to <4 x i8>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> 
@llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> [[TMP0]], <4 x i8>* [[TMP1]], i32 1, <4 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_p_u32(uint8_t *base, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrbq_p_u32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: store <8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vstrhq_f16(float16_t *base, float16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq(base, value); +#else /* POLYMORPHIC */ + vstrhq_f16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vstrhq_s16(int16_t *base, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq(base, value); +#else /* POLYMORPHIC */ + vstrhq_s16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2 +// CHECK-NEXT: ret void +// +void test_vstrhq_s32(int16_t *base, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq(base, value); +#else /* POLYMORPHIC */ + vstrhq_s32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: store <8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], align 2 +// CHECK-NEXT: ret void +// +void test_vstrhq_u16(uint16_t *base, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq(base, value); +#else /* POLYMORPHIC */ + vstrhq_u16(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: store <4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], align 2 +// CHECK-NEXT: ret void +// +void test_vstrhq_u32(uint16_t *base, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq(base, value); +#else /* POLYMORPHIC */ + vstrhq_u32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast half* [[BASE:%.*]] to <8 x half>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> [[VALUE:%.*]], <8 x half>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_p_f16(float16_t *base, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrhq_p_f16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> 
@llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_p_s16(int16_t *base, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrhq_p_s16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_p_s32(int16_t *base, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrhq_p_s32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i16* [[BASE:%.*]] to <8 x i16>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> [[VALUE:%.*]], <8 x i16>* [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_p_u16(uint16_t *base, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrhq_p_u16(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = trunc <4 x i32> [[VALUE:%.*]] to <4 x i16> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast i16* [[BASE:%.*]] to <4 x i16>* +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> [[TMP0]], <4 x i16>* [[TMP1]], i32 2, <4 x i1> [[TMP3]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_p_u32(uint16_t *base, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrhq_p_u32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: store <4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +void test_vstrwq_f32(float32_t *base, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq(base, value); +#else /* POLYMORPHIC */ + vstrwq_f32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4 +// CHECK-NEXT: ret void +// +void test_vstrwq_s32(int32_t *base, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq(base, value); +#else /* POLYMORPHIC */ + vstrwq_s32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: store <4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], align 4 +// 
CHECK-NEXT: ret void +// +void test_vstrwq_u32(uint32_t *base, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq(base, value); +#else /* POLYMORPHIC */ + vstrwq_u32(base, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast float* [[BASE:%.*]] to <4 x float>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> [[VALUE:%.*]], <4 x float>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_p_f32(float32_t *base, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrwq_p_f32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_p_s32(int32_t *base, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrwq_p_s32(base, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast i32* [[BASE:%.*]] to <4 x i32>* +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> [[VALUE:%.*]], <4 x i32>* [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_p_u32(uint32_t *base, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_p(base, value, p); +#else /* POLYMORPHIC */ + vstrwq_p_u32(base, value, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -283,12 +283,9 @@ unsigned Lanes; public: - VectorType(const ScalarType *Element) - : CRegularNamedType(TypeKind::Vector), Element(Element) { - // MVE has a fixed 128-bit vector size - Lanes = 128 / Element->sizeInBits(); - } - unsigned sizeInBits() const override { return 128; } + VectorType(const ScalarType *Element, unsigned Lanes) + : CRegularNamedType(TypeKind::Vector), Element(Element), Lanes(Lanes) {} + unsigned sizeInBits() const override { return Lanes * Element->sizeInBits(); } unsigned lanes() const { return Lanes; } bool requiresFloat() const override { return Element->requiresFloat(); } std::string cNameBase() const override { @@ -609,6 +606,23 @@ } }; +// Result subclass representing a cast between different pointer types. 
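// For instance, when such a cast converts a loaded pointer argument to a
// vector pointer, the emitted CGBuiltin.cpp fragment looks roughly like
//   Builder.CreatePointerCast(Val0, Param0)
// where Val0 stands for the generated variable holding the input value and
// Param0 for an llvm::Type * parameter allocated by ParamAlloc from the
// target pointer type's llvmName() (identifiers here are illustrative).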
+class PointerCastResult : public Result { +public: + const PointerType *PtrType; + Ptr V; + PointerCastResult(const PointerType *PtrType, Ptr V) + : PtrType(PtrType), V(V) {} + void genCode(raw_ostream &OS, + CodeGenParamAllocator &ParamAlloc) const override { + OS << "Builder.CreatePointerCast(" << V->asValue() << ", " + << ParamAlloc.allocParam("llvm::Type *", PtrType->llvmName()) << ")"; + } + void morePrerequisites(std::vector &output) const override { + output.push_back(V); + } +}; + // Result subclass representing a call to an IRBuilder method. Each IRBuilder // method we want to use will have a Tablegen record giving the method name and // describing any important details of how to call it, such as whether a @@ -619,14 +633,22 @@ std::vector Args; std::set AddressArgs; std::set IntConstantArgs; + bool IsHelper, IsStatic; IRBuilderResult(StringRef BuilderMethod, std::vector Args, std::set AddressArgs, - std::set IntConstantArgs) + std::set IntConstantArgs, bool IsHelper, + bool IsStatic) : BuilderMethod(BuilderMethod), Args(Args), AddressArgs(AddressArgs), - IntConstantArgs(IntConstantArgs) {} + IntConstantArgs(IntConstantArgs), IsHelper(IsHelper), + IsStatic(IsStatic) {} void genCode(raw_ostream &OS, CodeGenParamAllocator &ParamAlloc) const override { - OS << "Builder." << BuilderMethod << "("; + if (IsStatic) + OS << BuilderMethod << "("; + else if (IsHelper) + OS << BuilderMethod << "(Builder, "; + else + OS << "Builder." << BuilderMethod << "("; const char *Sep = ""; for (unsigned i = 0, e = Args.size(); i < e; ++i) { Ptr Arg = Args[i]; @@ -652,6 +674,25 @@ } }; +// Result subclass representing making an Address out of a Value. +class AddressResult : public Result { +public: + Ptr Arg; + unsigned Align; + AddressResult(Ptr Arg, unsigned Align) : Arg(Arg), Align(Align) {} + void genCode(raw_ostream &OS, + CodeGenParamAllocator &ParamAlloc) const override { + OS << "Address(" << Arg->varname() << ", CharUnits::fromQuantity(" + << Align << "))"; + } + std::string typeName() const override { + return "Address"; + } + void morePrerequisites(std::vector &output) const override { + output.push_back(Arg); + } +}; + // Result subclass representing a call to an IR intrinsic, which we first have // to look up using an Intrinsic::ID constant and an array of types. class IRIntrinsicResult : public Result { @@ -665,7 +706,7 @@ void genCode(raw_ostream &OS, CodeGenParamAllocator &ParamAlloc) const override { std::string IntNo = ParamAlloc.allocParam( - "Intrinsic::ID", "Intrinsic::arm_mve_" + IntrinsicID); + "Intrinsic::ID", "Intrinsic::" + IntrinsicID); OS << "Builder.CreateCall(CGM.getIntrinsic(" << IntNo; if (!ParamTypes.empty()) { OS << ", llvm::SmallVector {"; @@ -689,6 +730,20 @@ } }; +// Result subclass that specifies a type, for use in IRBuilder operations such +// as CreateBitCast that take a type argument. +class TypeResult : public Result { +public: + const Type *T; + TypeResult(const Type *T) : T(T) {} + void genCode(raw_ostream &OS, CodeGenParamAllocator &) const override { + OS << T->llvmName(); + } + std::string typeName() const override { + return "llvm::Type *"; + } +}; + // ----------------------------------------------------------------------------- // Class that describes a single ACLE intrinsic. // @@ -852,7 +907,8 @@ // MveEmitter holds a collection of all the types we've instantiated. 
VoidType Void; std::map> ScalarTypes; - std::map, std::unique_ptr> + std::map, + std::unique_ptr> VectorTypes; std::map, std::unique_ptr> MultiVectorTypes; @@ -872,12 +928,16 @@ const ScalarType *getScalarType(Record *R) { return getScalarType(R->getName()); } - const VectorType *getVectorType(const ScalarType *ST) { - std::pair key(ST->kind(), ST->sizeInBits()); + const VectorType *getVectorType(const ScalarType *ST, unsigned Lanes) { + std::tuple key(ST->kind(), + ST->sizeInBits(), Lanes); if (VectorTypes.find(key) == VectorTypes.end()) - VectorTypes[key] = std::make_unique(ST); + VectorTypes[key] = std::make_unique(ST, Lanes); return VectorTypes[key].get(); } + const VectorType *getVectorType(const ScalarType *ST) { + return getVectorType(ST, 128 / ST->sizeInBits()); + } const MultiVectorType *getMultiVectorType(unsigned Registers, const VectorType *VT) { std::pair key(VT->cNameBase(), Registers); @@ -969,7 +1029,13 @@ if (Op->getName() == "CTO_Vec") { const Type *Element = getType(D->getArg(0), Param); - return getVectorType(cast(Element)); + if (D->getNumArgs() == 1) { + return getVectorType(cast(Element)); + } else { + const Type *ExistingVector = getType(D->getArg(1), Param); + return getVectorType(cast(Element), + cast(ExistingVector)->lanes()); + } } if (Op->getName() == "CTO_Pred") { @@ -1035,8 +1101,21 @@ else return std::make_shared(ST, Arg); } + } else if (const auto *PT = dyn_cast(CastType)) { + return std::make_shared(PT, Arg); } PrintFatalError("Unsupported type cast"); + } else if (Op->getName() == "address") { + if (D->getNumArgs() != 2) + PrintFatalError("'address' should have two arguments"); + Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param); + unsigned Alignment; + if (auto *II = dyn_cast(D->getArg(1))) { + Alignment = II->getValue(); + } else { + PrintFatalError("'address' alignment argument should be an integer"); + } + return std::make_shared(Arg, Alignment); } else if (Op->getName() == "unsignedflag") { if (D->getNumArgs() != 1) PrintFatalError("unsignedflag should have exactly one argument"); @@ -1060,9 +1139,12 @@ std::set IntConstantArgs; for (unsigned i : Op->getValueAsListOfInts("int_constant_params")) IntConstantArgs.insert(i); + bool IsHelper = Op->getValueAsBit("is_helper_function"); + bool IsStatic = Op->getValueAsBit("is_static"); return std::make_shared( - Op->getValueAsString("func"), Args, AddressArgs, IntConstantArgs); - } else if (Op->isSubClassOf("IRInt")) { + Op->getValueAsString("func"), Args, AddressArgs, IntConstantArgs, + IsHelper, IsStatic); + } else if (Op->isSubClassOf("IRIntBase")) { std::vector ParamTypes; for (Record *RParam : Op->getValueAsListOfDefs("params")) ParamTypes.push_back(getType(RParam, Param)); @@ -1099,6 +1181,14 @@ if (auto *DI = dyn_cast(Arg)) return getCodeForDag(DI, Scope, Param); + if (auto *DI = dyn_cast(Arg)) { + Record *Rec = DI->getDef(); + if (Rec->isSubClassOf("Type")) { + const Type *T = getType(Rec, Param); + return std::make_shared(T); + } + } + PrintFatalError("bad dag argument type for code generation"); } @@ -1111,8 +1201,9 @@ V = std::make_shared(getScalarType("u32"), V); } else if (const auto *PT = dyn_cast(ArgType)) { V = std::make_shared(getScalarType("u32"), V); - V = std::make_shared( - "pred_i2v", std::vector{PT}, std::vector{V}); + V = std::make_shared("arm_mve_pred_i2v", + std::vector{PT}, + std::vector{V}); } return V; diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll new file mode 100644 --- /dev/null +++ 
b/llvm/test/CodeGen/Thumb2/mve-intrinsics/load-store.ll @@ -0,0 +1,1208 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -enable-arm-maskedldst -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <8 x half> @test_vld1q_f16(half* %base) { +; CHECK-LABEL: test_vld1q_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 2 + ret <8 x half> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vld1q_f32(float* %base) { +; CHECK-LABEL: test_vld1q_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 4 + ret <4 x float> %1 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vld1q_s8(i8* %base) { +; CHECK-LABEL: test_vld1q_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vld1q_s16(i16* %base) { +; CHECK-LABEL: test_vld1q_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vld1q_s32(i32* %base) { +; CHECK-LABEL: test_vld1q_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vld1q_u8(i8* %base) { +; CHECK-LABEL: test_vld1q_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vld1q_u16(i16* %base) { +; CHECK-LABEL: test_vld1q_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vld1q_u32(i32* %base) { +; CHECK-LABEL: test_vld1q_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <8 x half> @test_vld1q_z_f16(half* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer) + ret <8 x half> %3 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>*, i32 immarg, <8 x i1>, <8 x half>) + +define arm_aapcs_vfpcc <4 x float> @test_vld1q_z_f32(float* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: 
vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer) + ret <4 x float> %3 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>*, i32 immarg, <4 x i1>, <4 x float>) + +define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_s8(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer) + ret <16 x i8> %3 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) + +define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_s16(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer) + ret <8 x i16> %3 +} + +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) + +define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_s32(i32* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer) + ret <4 x i32> %3 +} + +declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + +define arm_aapcs_vfpcc <16 x i8> @test_vld1q_z_u8(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vld1q_z_u16(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vld1q_z_u32(i32* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vld1q_z_u32: +; CHECK: @ %bb.0: @ 
%entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer) + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_s8(i8* %base) { +; CHECK-LABEL: test_vldrbq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_s16(i8* %base) { +; CHECK-LABEL: test_vldrbq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = sext <8 x i8> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_s32(i8* %base) { +; CHECK-LABEL: test_vldrbq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.s32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = sext <4 x i8> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_u8(i8* %base) { +; CHECK-LABEL: test_vldrbq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = load <16 x i8>, <16 x i8>* %0, align 1 + ret <16 x i8> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_u16(i8* %base) { +; CHECK-LABEL: test_vldrbq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <8 x i8>* + %1 = load <8 x i8>, <8 x i8>* %0, align 1 + %2 = zext <8 x i8> %1 to <8 x i16> + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_u32(i8* %base) { +; CHECK-LABEL: test_vldrbq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrb.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <4 x i8>* + %1 = load <4 x i8>, <4 x i8>* %0, align 1 + %2 = zext <4 x i8> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_s8(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_z_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_s16(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <8 x i8>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer) + %4 = sext <8 x i8> %3 to <8 x i16> + ret <8 x i16> %4 +} + +declare <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>*, i32 immarg, <8 x i1>, <8 x i8>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_s32(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: 
test_vldrbq_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <4 x i8>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer) + %4 = sext <4 x i8> %3 to <4 x i32> + ret <4 x i32> %4 +} + +declare <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>*, i32 immarg, <4 x i1>, <4 x i8>) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_z_u8(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_z_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + %3 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %0, i32 1, <16 x i1> %2, <16 x i8> zeroinitializer) + ret <16 x i8> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_z_u16(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <8 x i8>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i8> @llvm.masked.load.v8i8.p0v8i8(<8 x i8>* %0, i32 1, <8 x i1> %2, <8 x i8> zeroinitializer) + %4 = zext <8 x i8> %3 to <8 x i16> + ret <8 x i16> %4 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_z_u32(i8* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <4 x i8>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i8> @llvm.masked.load.v4i8.p0v4i8(<4 x i8>* %0, i32 1, <4 x i1> %2, <4 x i8> zeroinitializer) + %4 = zext <4 x i8> %3 to <4 x i32> + ret <4 x i32> %4 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_f16(half* %base) { +; CHECK-LABEL: test_vldrhq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = load <8 x half>, <8 x half>* %0, align 2 + ret <8 x half> %1 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_s16(i16* %base) { +; CHECK-LABEL: test_vldrhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_s32(i16* %base) { +; CHECK-LABEL: test_vldrhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = sext <4 x i16> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_u16(i16* %base) { +; CHECK-LABEL: test_vldrhq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = load <8 x i16>, <8 x i16>* %0, align 2 + ret <8 x i16> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_u32(i16* %base) { +; CHECK-LABEL: test_vldrhq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 
q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <4 x i16>* + %1 = load <4 x i16>, <4 x i16>* %0, align 2 + %2 = zext <4 x i16> %1 to <4 x i32> + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_z_f16(half* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x half> @llvm.masked.load.v8f16.p0v8f16(<8 x half>* %0, i32 2, <8 x i1> %2, <8 x half> zeroinitializer) + ret <8 x half> %3 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_s16(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_s32(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <4 x i16>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer) + %4 = sext <4 x i16> %3 to <4 x i32> + ret <4 x i32> %4 +} + +declare <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>*, i32 immarg, <4 x i1>, <4 x i16>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_z_u16(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + %3 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %0, i32 2, <8 x i1> %2, <8 x i16> zeroinitializer) + ret <8 x i16> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_z_u32(i16* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <4 x i16>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i16> @llvm.masked.load.v4i16.p0v4i16(<4 x i16>* %0, i32 2, <4 x i1> %2, <4 x i16> zeroinitializer) + %4 = zext <4 x i16> %3 to <4 x i32> + ret <4 x i32> %4 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_f32(float* %base) { +; CHECK-LABEL: test_vldrwq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = load <4 x float>, <4 x float>* %0, align 4 + ret <4 x float> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_s32(i32* %base) { +; CHECK-LABEL: test_vldrwq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 
4 + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_u32(i32* %base) { +; CHECK-LABEL: test_vldrwq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = load <4 x i32>, <4 x i32>* %0, align 4 + ret <4 x i32> %1 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_z_f32(float* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x float> @llvm.masked.load.v4f32.p0v4f32(<4 x float>* %0, i32 4, <4 x i1> %2, <4 x float> zeroinitializer) + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_s32(i32* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer) + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_z_u32(i32* %base, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4, <4 x i1> %2, <4 x i32> zeroinitializer) + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc void @test_vst1q_f16(half* %base, <8 x half> %value) { +; CHECK-LABEL: test_vst1q_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + store <8 x half> %value, <8 x half>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_f32(float* %base, <4 x float> %value) { +; CHECK-LABEL: test_vst1q_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + store <4 x float> %value, <4 x float>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_s8(i8* %base, <16 x i8> %value) { +; CHECK-LABEL: test_vst1q_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + store <16 x i8> %value, <16 x i8>* %0, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_s16(i16* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vst1q_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + store <8 x i16> %value, <8 x i16>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_s32(i32* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vst1q_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + store <4 x i32> %value, <4 x i32>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_u8(i8* %base, <16 x i8> %value) { +; CHECK-LABEL: test_vst1q_u8: +; CHECK: @ 
%bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + store <16 x i8> %value, <16 x i8>* %0, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_u16(i16* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vst1q_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + store <8 x i16> %value, <8 x i16>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_u32(i32* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vst1q_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + store <4 x i32> %value, <4 x i32>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2) + ret void +} + +declare void @llvm.masked.store.v8f16.p0v8f16(<8 x half>, <8 x half>*, i32 immarg, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vst1q_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2) + ret void +} + +declare void @llvm.masked.store.v4f32.p0v4f32(<4 x float>, <4 x float>*, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vst1q_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2) + ret void +} + +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) + +define arm_aapcs_vfpcc void @test_vst1q_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2) + ret void +} + +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vst1q_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x 
i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2) + ret void +} + +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vst1q_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vst1q_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vst1q_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_s8(i8* %base, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + store <16 x i8> %value, <16 x i8>* %0, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_s16(i8* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <8 x i16> %value to <8 x i8> + %1 = bitcast i8* %base to <8 x i8>* + store <8 x i8> %0, <8 x i8>* %1, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_s32(i8* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i8> + %1 = bitcast i8* %base to <4 x i8>* + store <4 x i8> %0, <4 x i8>* %1, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_u8(i8* %base, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + store <16 x i8> %value, <16 x i8>* %0, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_u16(i8* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <8 x i16> %value to <8 x i8> + %1 = bitcast i8* %base to <8 x i8>* + store <8 x i8> %0, <8 x i8>* %1, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_u32(i8* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q0, [r0] +; 
CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i8> + %1 = bitcast i8* %base to <4 x i8>* + store <4 x i8> %0, <4 x i8>* %1, align 1 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_p_s8(i8* %base, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_p_s16(i8* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <8 x i16> %value to <8 x i8> + %1 = bitcast i8* %base to <8 x i8>* + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3) + ret void +} + +declare void @llvm.masked.store.v8i8.p0v8i8(<8 x i8>, <8 x i8>*, i32 immarg, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_p_s32(i8* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i8> + %1 = bitcast i8* %base to <4 x i8>* + %2 = zext i16 %p to i32 + %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3) + ret void +} + +declare void @llvm.masked.store.v4i8.p0v4i8(<4 x i8>, <4 x i8>*, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_p_u8(i8* %base, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i8* %base to <16 x i8>* + %1 = zext i16 %p to i32 + %2 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1) + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %value, <16 x i8>* %0, i32 1, <16 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_p_u16(i8* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <8 x i16> %value to <8 x i8> + %1 = bitcast i8* %base to <8 x i8>* + %2 = zext i16 %p to i32 + %3 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %2) + call void @llvm.masked.store.v8i8.p0v8i8(<8 x i8> %0, <8 x i8>* %1, i32 1, <8 x i1> %3) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_p_u32(i8* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i8> + %1 = bitcast i8* %base to <4 x i8>* + %2 = zext i16 %p to i32 + %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + call void @llvm.masked.store.v4i8.p0v4i8(<4 x i8> %0, <4 x i8>* %1, i32 1, <4 x i1> %3) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_f16(half* 
%base, <8 x half> %value) { +; CHECK-LABEL: test_vstrhq_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + store <8 x half> %value, <8 x half>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_s16(i16* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vstrhq_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + store <8 x i16> %value, <8 x i16>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_s32(i16* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrhq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i16> + %1 = bitcast i16* %base to <4 x i16>* + store <4 x i16> %0, <4 x i16>* %1, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_u16(i16* %base, <8 x i16> %value) { +; CHECK-LABEL: test_vstrhq_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + store <8 x i16> %value, <8 x i16>* %0, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_u32(i16* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrhq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrh.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i16> + %1 = bitcast i16* %base to <4 x i16>* + store <4 x i16> %0, <4 x i16>* %1, align 2 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_p_f16(half* %base, <8 x half> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrhq_p_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast half* %base to <8 x half>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8f16.p0v8f16(<8 x half> %value, <8 x half>* %0, i32 2, <8 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_p_s16(i16* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrhq_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 + %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_p_s32(i16* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrhq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i16> + %1 = bitcast i16* %base to <4 x i16>* + %2 = zext i16 %p to i32 + %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3) + ret void +} + +declare void @llvm.masked.store.v4i16.p0v4i16(<4 x i16>, <4 x i16>*, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrhq_p_u16(i16* %base, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrhq_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i16* %base to <8 x i16>* + %1 = zext i16 %p to i32 
+ %2 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1) + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %value, <8 x i16>* %0, i32 2, <8 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrhq_p_u32(i16* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrhq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = trunc <4 x i32> %value to <4 x i16> + %1 = bitcast i16* %base to <4 x i16>* + %2 = zext i16 %p to i32 + %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + call void @llvm.masked.store.v4i16.p0v4i16(<4 x i16> %0, <4 x i16>* %1, i32 2, <4 x i1> %3) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_f32(float* %base, <4 x float> %value) { +; CHECK-LABEL: test_vstrwq_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + store <4 x float> %value, <4 x float>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_s32(i32* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrwq_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + store <4 x i32> %value, <4 x i32>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_u32(i32* %base, <4 x i32> %value) { +; CHECK-LABEL: test_vstrwq_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrw.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + store <4 x i32> %value, <4 x i32>* %0, align 4 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_p_f32(float* %base, <4 x float> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrwq_p_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast float* %base to <4 x float>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4f32.p0v4f32(<4 x float> %value, <4 x float>* %0, i32 4, <4 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_p_s32(i32* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrwq_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrwq_p_u32(i32* %base, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrwq_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = bitcast i32* %base to <4 x i32>* + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> %value, <4 x i32>* %0, i32 4, <4 x i1> %2) + ret void +}
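
As a hedged illustration of what these tests exercise: the sequence checked in
test_vld1q_z_u32 above (pointer bitcast, i16-to-i32 zext, @llvm.arm.mve.pred.i2v,
then @llvm.masked.load with a zeroinitializer passthrough) can be reproduced by
hand with a plain llvm::IRBuilder. The sketch below is not the emitter's verbatim
output; the function name buildVld1qZU32 and the Base/P parameters are
placeholders assumed for illustration only.

#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/Module.h"

// Build the equivalent of test_vld1q_z_u32: a predicated 128-bit load of
// four i32 lanes, with inactive lanes filled with zero.
static llvm::Value *buildVld1qZU32(llvm::IRBuilder<> &B, llvm::Module &M,
                                   llvm::Value *Base, llvm::Value *P) {
  auto *I32 = B.getInt32Ty();
  auto *VecTy = llvm::FixedVectorType::get(I32, 4);            // <4 x i32>
  auto *PredTy = llvm::FixedVectorType::get(B.getInt1Ty(), 4); // <4 x i1>
  auto *VecPtrTy = VecTy->getPointerTo();

  // %0 = bitcast i32* %base to <4 x i32>*
  llvm::Value *Ptr = B.CreatePointerCast(Base, VecPtrTy);

  // %1 = zext i16 %p to i32
  // %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
  llvm::Value *Pred = B.CreateCall(
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::arm_mve_pred_i2v,
                                      {PredTy}),
      {B.CreateZExt(P, I32)});

  // %3 = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %0, i32 4,
  //                                                     <4 x i1> %2, zeroinitializer)
  return B.CreateCall(
      llvm::Intrinsic::getDeclaration(&M, llvm::Intrinsic::masked_load,
                                      {VecTy, VecPtrTy}),
      {Ptr, B.getInt32(4), Pred, llvm::Constant::getNullValue(VecTy)});
}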