diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -72,22 +72,158 @@
 } // loop over half = "b", "t"
 
-let params = T.All32, pnt = PNT_None in
-def vldrwq_gather_base_wb: Intrinsic<
-  Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<4>:$offset),
-  (seq (IRInt<"vldr_gather_base_wb", [Vector, VecOf<Unsigned<Scalar>>]>
+multiclass gather_base<list<Type> types, int size> {
+  let params = types, pnt = PNT_None in {
+    def _gather_base: Intrinsic<
+      Vector, (args UVector:$addr, imm_mem7bit<size>:$offset),
+      (IRInt<"vldr_gather_base", [Vector, UVector]> $addr, $offset)>;
+
+    def _gather_base_z: Intrinsic<
+      Vector, (args UVector:$addr, imm_mem7bit<size>:$offset, Predicate:$pred),
+      (IRInt<"vldr_gather_base_predicated", [Vector, UVector, Predicate]>
+          $addr, $offset, $pred)>;
+
+    def _gather_base_wb: Intrinsic<
+      Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset),
+      (seq (IRInt<"vldr_gather_base_wb", [Vector, UVector]>
               (load $addr), $offset):$pair,
-       (store (xval $pair, 1), $addr),
-       (xval $pair, 0))>;
+           (store (xval $pair, 1), $addr),
+           (xval $pair, 0))>;
 
-let params = T.All64, pnt = PNT_None in
-def vldrdq_gather_base_wb_z: Intrinsic<
-  Vector, (args Ptr<VecOf<Unsigned<Scalar>>>:$addr, imm_mem7bit<8>:$offset,
-          Predicate:$pred),
-  (seq (IRInt<"vldr_gather_base_wb_predicated", [Vector, VecOf<Unsigned<Scalar>>, Predicate]>
+    def _gather_base_wb_z: Intrinsic<
+      Vector, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset,
+               Predicate:$pred),
+      (seq (IRInt<"vldr_gather_base_wb_predicated",
+                  [Vector, UVector, Predicate]>
               (load $addr), $offset, $pred):$pair,
-       (store (xval $pair, 1), $addr),
-       (xval $pair, 0))>;
+           (store (xval $pair, 1), $addr),
+           (xval $pair, 0))>;
+  }
+}
+
+defm vldrwq: gather_base<T.All32, 4>;
+defm vldrdq: gather_base<T.All64, 8>;
+
+multiclass scatter_base<list<Type> types, int size> {
+  let params = types in {
+    def _scatter_base: Intrinsic<
+      Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data),
+      (IRInt<"vstr_scatter_base", [UVector, Vector]> $addr, $offset, $data)>;
+
+    def _scatter_base_p: Intrinsic<
+      Void, (args UVector:$addr, imm_mem7bit<size>:$offset, Vector:$data,
+             Predicate:$pred),
+      (IRInt<"vstr_scatter_base_predicated", [UVector, Vector, Predicate]>
+          $addr, $offset, $data, $pred)>;
+
+    def _scatter_base_wb: Intrinsic<
+      Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset, Vector:$data),
+      (seq (IRInt<"vstr_scatter_base_wb", [UVector, Vector]>
+               (load $addr), $offset, $data):$wbaddr,
+           (store $wbaddr, $addr))>;
+
+    def _scatter_base_wb_p: Intrinsic<
+      Void, (args Ptr<UVector>:$addr, imm_mem7bit<size>:$offset,
+             Vector:$data, Predicate:$pred),
+      (seq (IRInt<"vstr_scatter_base_wb_predicated",
+                  [UVector, Vector, Predicate]>
+               (load $addr), $offset, $data, $pred):$wbaddr,
+           (store $wbaddr, $addr))>;
+  }
+}
+
+defm vstrwq: scatter_base<T.All32, 4>;
+defm vstrdq: scatter_base<T.All64, 8>;
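+
+// The `seq` pattern in the _wb (writeback) forms above expands to roughly
+// the following C-like pseudocode (a sketch for the gather case; names are
+// illustrative only):
+//
+//   pair = @llvm.arm.mve.vldr.gather.base.wb(*addr, offset);
+//   *addr = extractvalue(pair, 1);   // write the updated base vector back
+//   return extractvalue(pair, 0);    // return the gathered elements
+//
+// i.e. the IR intrinsic returns a {result, updated-bases} aggregate, `xval`
+// selects one field of it, and the (load ...)/(store ...) nodes read and
+// update the pointed-to vector of base addresses.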
(IRInt<"vldr_gather_offset", + [Vector, CPtr>, UVector]> + $base, $offsets, memtype.size, shift, (unsignedflag Scalar))>; + def _gather_shifted_offset_z: Intrinsic< + Vector, (args CPtr>:$base, UVector:$offsets, + Predicate:$pred), + (IRInt<"vldr_gather_offset_predicated", + [Vector, CPtr>, UVector, Predicate]> + $base, $offsets, memtype.size, shift, (unsignedflag Scalar), $pred)>; + } +} + +multiclass gather_offset_both types, PrimitiveType memtype, + int shift> { + defm "": gather_offset_unshifted; + defm "": gather_offset_shifted; +} + +defm vldrbq: gather_offset_unshifted; +defm vldrhq: gather_offset_both; +defm vldrwq: gather_offset_both; +defm vldrdq: gather_offset_both; + +multiclass scatter_offset_unshifted types, PrimitiveType memtype> { + let params = types in { + def _scatter_offset: Intrinsic< + Void, (args Ptr>:$base, UVector:$offsets, + Vector:$data), + (IRInt<"vstr_scatter_offset", + [Ptr>, UVector, Vector]> + $base, $offsets, $data, memtype.size, 0)>; + def _scatter_offset_p: Intrinsic< + Void, (args Ptr>:$base, UVector:$offsets, + Vector:$data, Predicate:$pred), + (IRInt<"vstr_scatter_offset_predicated", + [Ptr>, UVector, Vector, Predicate]> + $base, $offsets, $data, memtype.size, 0, $pred)>; + } +} + +multiclass scatter_offset_shifted types, PrimitiveType memtype, + int shift> { + let params = types in { + def _scatter_shifted_offset: Intrinsic< + Void, (args Ptr>:$base, UVector:$offsets, + Vector:$data), + (IRInt<"vstr_scatter_offset", + [Ptr>, UVector, Vector]> + $base, $offsets, $data, memtype.size, shift)>; + def _scatter_shifted_offset_p: Intrinsic< + Void, (args Ptr>:$base, UVector:$offsets, + Vector:$data, Predicate:$pred), + (IRInt<"vstr_scatter_offset_predicated", + [Ptr>, UVector, Vector, Predicate]> + $base, $offsets, $data, memtype.size, shift, $pred)>; + } +} + +multiclass scatter_offset_both types, PrimitiveType memtype, + int shift> { + defm "": scatter_offset_unshifted; + defm "": scatter_offset_shifted; +} + +defm vstrbq: scatter_offset_unshifted; +defm vstrhq: scatter_offset_both; +defm vstrwq: scatter_offset_both; +defm vstrdq: scatter_offset_both; let params = [Void], pnt = PNT_None in def urshrl: Intrinsic in an intrinsic's codegen field, it // indicates that the IR generation for that intrinsic is done by handwritten // C++ and not autogenerated at all. The effect in the MVE builtin codegen @@ -109,7 +114,7 @@ def CTO_Pred: ComplexTypeOp; class CTO_Tuple: ComplexTypeOp { int n = n_; } class CTO_Pointer: ComplexTypeOp { bit const = const_; } -class CTO_Sign: ComplexTypeOp { bit signed = signed_; } +def CTO_CopyKind: ComplexTypeOp; // ----------------------------------------------------------------------------- // Instances of Type intended to be used directly in the specification of an @@ -167,10 +172,20 @@ class Ptr: ComplexType<(CTO_Pointer<0> t)>; class CPtr: ComplexType<(CTO_Pointer<1> t)>; -// Unsigned expects t to be a scalar, and expands to the unsigned integer -// scalar of the same size. So it returns u16 if you give it s16 or f16 (or -// u16 itself). -class Unsigned: ComplexType<(CTO_Sign<0> t)>; +// CopyKind expects s and k to be scalar types. It returns a scalar type +// whose kind (signed, unsigned or float) matches that of k, and whose size +// matches that of s. +class CopyKind: ComplexType<(CTO_CopyKind s, k)>; + +// Unsigned expects t to be a scalar type, and expands to the unsigned +// integer scalar of the same size. So it returns u16 if you give it s16 or +// f16 (or u16 itself). 
 
 let params = [Void], pnt = PNT_None in
 def urshrl: Intrinsic<
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ ... @@
 // If you put CustomCodegen<"foo"> in an intrinsic's codegen field, it
 // indicates that the IR generation for that intrinsic is done by handwritten
 // C++ and not autogenerated at all. The effect in the MVE builtin codegen
@@ -109,7 +114,7 @@
 def CTO_Pred: ComplexTypeOp;
 class CTO_Tuple<int n_>: ComplexTypeOp { int n = n_; }
 class CTO_Pointer<bit const_>: ComplexTypeOp { bit const = const_; }
-class CTO_Sign<bit signed_>: ComplexTypeOp { bit signed = signed_; }
+def CTO_CopyKind: ComplexTypeOp;
 
 // -----------------------------------------------------------------------------
 // Instances of Type intended to be used directly in the specification of an
@@ -167,10 +172,20 @@
 class Ptr<Type t>: ComplexType<(CTO_Pointer<0> t)>;
 class CPtr<Type t>: ComplexType<(CTO_Pointer<1> t)>;
 
-// Unsigned expects t to be a scalar, and expands to the unsigned integer
-// scalar of the same size. So it returns u16 if you give it s16 or f16 (or
-// u16 itself).
-class Unsigned<Type t>: ComplexType<(CTO_Sign<0> t)>;
+// CopyKind expects s and k to be scalar types. It returns a scalar type
+// whose kind (signed, unsigned or float) matches that of k, and whose size
+// matches that of s.
+class CopyKind<Type s, Type k>: ComplexType<(CTO_CopyKind s, k)>;
+
+// Unsigned expects t to be a scalar type, and expands to the unsigned
+// integer scalar of the same size. So it returns u16 if you give it s16 or
+// f16 (or u16 itself).
+class Unsigned<Type t>: ComplexType<(CTO_CopyKind t, u32)>;
+
+// UScalar and UVector expand to the unsigned-integer versions of
+// Scalar and Vector.
+def UScalar: Unsigned<Scalar>;
+def UVector: VecOf<UScalar>;
 
 // -----------------------------------------------------------------------------
 // Internal definitions for specifying immediate arguments for an intrinsic.
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/scatter-gather.c
@@ -0,0 +1,2146 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
+
+// CHECK-LABEL: @test_vldrbq_gather_offset_s16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 0)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+int16x8_t test_vldrbq_gather_offset_s16(const int8_t *base, uint16x8_t offset)
+{
+#ifdef POLYMORPHIC
+    return vldrbq_gather_offset(base, offset);
+#else /* POLYMORPHIC */
+    return vldrbq_gather_offset_s16(base, offset);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_gather_offset_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 0)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+int32x4_t test_vldrbq_gather_offset_s32(const int8_t *base, uint32x4_t offset)
+{
+#ifdef POLYMORPHIC
+    return vldrbq_gather_offset(base, offset);
+#else /* POLYMORPHIC */
+    return vldrbq_gather_offset_s32(base, offset);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_gather_offset_s8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 0)
+// CHECK-NEXT:    ret <16 x i8> [[TMP0]]
+//
+int8x16_t test_vldrbq_gather_offset_s8(const int8_t *base, uint8x16_t offset)
+{
+#ifdef POLYMORPHIC
+    return vldrbq_gather_offset(base, offset);
+#else /* POLYMORPHIC */
+    return vldrbq_gather_offset_s8(base, offset);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_gather_offset_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 1)
+// CHECK-NEXT:    ret <8 x i16> [[TMP0]]
+//
+uint16x8_t test_vldrbq_gather_offset_u16(const uint8_t *base, uint16x8_t offset)
+{
+#ifdef POLYMORPHIC
+    return vldrbq_gather_offset(base, offset);
+#else /* POLYMORPHIC */
+    return vldrbq_gather_offset_u16(base, offset);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vldrbq_gather_offset_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 1)
+// CHECK-NEXT:    ret <4 x i32> [[TMP0]]
+//
+uint32x4_t test_vldrbq_gather_offset_u32(const uint8_t *base, uint32x4_t offset)
+{
+#ifdef POLYMORPHIC
+
return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 1) +// CHECK-NEXT: ret <16 x i8> [[TMP0]] +// +uint8x16_t test_vldrbq_gather_offset_u8(const uint8_t *base, uint8x16_t offset) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_u8(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrbq_gather_offset_z_s16(const int8_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrbq_gather_offset_z_s32(const int8_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +int8x16_t test_vldrbq_gather_offset_z_s8(const int8_t *base, uint8x16_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_s8(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrbq_gather_offset_z_u16(const uint8_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return 
vldrbq_gather_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrbq_gather_offset_z_u32(const uint8_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrbq_gather_offset_z_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], i32 8, i32 0, i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vldrbq_gather_offset_z_u8(const uint8_t *base, uint8x16_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrbq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrbq_gather_offset_z_u8(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 616) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_base_s64(uint64x2_t addr) +{ + return vldrdq_gather_base_s64(addr, 0x268); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 336) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_base_u64(uint64x2_t addr) +{ + return vldrdq_gather_base_u64(addr, 0x150); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 576) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +int64x2_t test_vldrdq_gather_base_wb_s64(uint64x2_t *addr) +{ + return vldrdq_gather_base_wb_s64(addr, 0x240); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 328) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP2]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP1]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +uint64x2_t test_vldrdq_gather_base_wb_u64(uint64x2_t 
*addr) +{ + return vldrdq_gather_base_wb_u64(addr, 0x148); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 664, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +int64x2_t test_vldrdq_gather_base_wb_z_s64(uint64x2_t *addr, mve_pred16_t p) +{ + return vldrdq_gather_base_wb_z_s64(addr, 0x298, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_wb_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 656, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 1 +// CHECK-NEXT: store <2 x i64> [[TMP4]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[TMP3]], 0 +// CHECK-NEXT: ret <2 x i64> [[TMP5]] +// +uint64x2_t test_vldrdq_gather_base_wb_z_u64(uint64x2_t *addr, mve_pred16_t p) +{ + return vldrdq_gather_base_wb_z_u64(addr, 0x290, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +int64x2_t test_vldrdq_gather_base_z_s64(uint64x2_t addr, mve_pred16_t p) +{ + return vldrdq_gather_base_z_s64(addr, 0x378, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_base_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 1000, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_base_z_u64(uint64x2_t addr, mve_pred16_t p) +{ + return vldrdq_gather_base_z_u64(addr, 0x3e8, p); +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_offset_s64(const int64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_s64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_u64( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_offset_u64(const uint64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_u64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +int64x2_t test_vldrdq_gather_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_z_s64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_offset_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_offset_z_u64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +int64x2_t test_vldrdq_gather_shifted_offset_s64(const int64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_s64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1) +// CHECK-NEXT: ret <2 x i64> [[TMP0]] +// +uint64x2_t test_vldrdq_gather_shifted_offset_u64(const uint64_t *base, uint64x2_t offset) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_u64(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_z_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// 
+int64x2_t test_vldrdq_gather_shifted_offset_z_s64(const int64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_z_s64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrdq_gather_shifted_offset_z_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], i32 64, i32 3, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <2 x i64> [[TMP2]] +// +uint64x2_t test_vldrdq_gather_shifted_offset_z_u64(const uint64_t *base, uint64x2_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrdq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrdq_gather_shifted_offset_z_u64(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vldrhq_gather_offset_f16(const float16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_f16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vldrhq_gather_offset_s16(const int16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_s16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrhq_gather_offset_s32(const int16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vldrhq_gather_offset_u16(const uint16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_u16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrhq_gather_offset_u32(const 
uint16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vldrhq_gather_offset_z_f16(const float16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_f16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrhq_gather_offset_z_s16(const int16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrhq_gather_offset_z_s32(const int16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 0, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrhq_gather_offset_z_u16(const uint16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> 
[[OFFSET:%.*]], i32 16, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrhq_gather_offset_z_u32(const uint16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <8 x half> [[TMP0]] +// +float16x8_t test_vldrhq_gather_shifted_offset_f16(const float16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_f16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vldrhq_gather_shifted_offset_s16(const int16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_s16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrhq_gather_shifted_offset_s32(const int16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 1) +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vldrhq_gather_shifted_offset_u16(const uint16_t *base, uint16x8_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_u16(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrhq_gather_shifted_offset_u32(const uint16_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x half> 
@llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x half> [[TMP2]] +// +float16x8_t test_vldrhq_gather_shifted_offset_z_f16(const float16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_f16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vldrhq_gather_shifted_offset_z_s16(const int16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_s16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrhq_gather_shifted_offset_z_s32(const int16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], i32 16, i32 1, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vldrhq_gather_shifted_offset_z_u16(const uint16_t *base, uint16x8_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrhq_gather_shifted_offset_z_u16(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrhq_gather_shifted_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 16, i32 1, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrhq_gather_shifted_offset_z_u32(const uint16_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrhq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return 
vldrhq_gather_shifted_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> [[ADDR:%.*]], i32 12) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vldrwq_gather_base_f32(uint32x4_t addr) +{ + return vldrwq_gather_base_f32(addr, 0xc); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 400) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_base_s32(uint32x4_t addr) +{ + return vldrwq_gather_base_s32(addr, 0x190); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 284) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_base_u32(uint32x4_t addr) +{ + return vldrwq_gather_base_u32(addr, 0x11c); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> [[TMP0]], i32 64) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vldrwq_gather_base_wb_f32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_f32(addr, 0x40); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 80) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +int32x4_t test_vldrwq_gather_base_wb_s32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_s32(addr, 0x50); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 480) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vldrwq_gather_base_wb_u32(uint32x4_t *addr) +{ + return vldrwq_gather_base_wb_u32(addr, 0x1e0); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x float>, <4 x i32> } 
@llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 352, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x float>, <4 x i32> } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x float> [[TMP5]] +// +float32x4_t test_vldrwq_gather_base_wb_z_f32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_f32(addr, 0x160, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 276, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +int32x4_t test_vldrwq_gather_base_wb_z_s32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_s32(addr, 0x114, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_wb_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 88, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 1 +// CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vldrwq_gather_base_wb_z_u32(uint32x4_t *addr, mve_pred16_t p) +{ + return vldrwq_gather_base_wb_z_u32(addr, 0x58, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 300, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_base_z_f32(uint32x4_t addr, mve_pred16_t p) +{ + return vldrwq_gather_base_z_f32(addr, 0x12c, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 440, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_base_z_s32(uint32x4_t addr, mve_pred16_t p) +{ + return vldrwq_gather_base_z_s32(addr, 0x1b8, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_base_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: 
[[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 300, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_base_z_u32(uint32x4_t addr, mve_pred16_t p) +{ + return vldrwq_gather_base_z_u32(addr, 0x12c, p); +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vldrwq_gather_offset_f32(const float32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_f32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_offset_s32(const int32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_offset_u32(const uint32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_offset_z_f32(const float32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_f32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_offset_z_s32(const int32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: 
@test_vldrwq_gather_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 0, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_offset_z_u32(const uint32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vldrwq_gather_shifted_offset_f32(const float32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_f32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vldrwq_gather_shifted_offset_s32(const int32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_s32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 1) +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vldrwq_gather_shifted_offset_u32(const uint32_t *base, uint32x4_t offset) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset(base, offset); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_u32(base, offset); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vldrwq_gather_shifted_offset_z_f32(const float32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_z_f32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> 
@llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vldrwq_gather_shifted_offset_z_s32(const int32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_z_s32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vldrwq_gather_shifted_offset_z_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], i32 32, i32 2, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vldrwq_gather_shifted_offset_z_u32(const uint32_t *base, uint32x4_t offset, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vldrwq_gather_shifted_offset_z(base, offset, p); +#else /* POLYMORPHIC */ + return vldrwq_gather_shifted_offset_z_u32(base, offset, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s16(int8_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s32(int8_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_s8(int8_t *base, uint8x16_t offset, int8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_s8(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u16( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u16(uint8_t *base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u32(uint8_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_p_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_p_u8(uint8_t *base, uint8x16_t offset, uint8x16_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_p_u8(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s16(int8_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s32(int8_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_s8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_s8(int8_t *base, uint8x16_t offset, int8x16_t value) +{ +#ifdef POLYMORPHIC + 
vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_s8(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_u16(uint8_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_u32(uint8_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrbq_scatter_offset_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* [[BASE:%.*]], <16 x i8> [[OFFSET:%.*]], <16 x i8> [[VALUE:%.*]], i32 8, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrbq_scatter_offset_u8(uint8_t *base, uint8x16_t offset, uint8x16_t value) +{ +#ifdef POLYMORPHIC + vstrbq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrbq_scatter_offset_u8(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 888, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_p_s64(uint64x2_t addr, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_p(addr, 0x378, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_p_s64(addr, 0x378, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> [[ADDR:%.*]], i32 264, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_p_u64(uint64x2_t addr, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_p(addr, 0x108, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_p_u64(addr, 0x108, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 408, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_s64(uint64x2_t addr, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base(addr, 0x198, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_s64(addr, 0x198, value); +#endif /* POLYMORPHIC */ +} + +// 
CHECK-LABEL: @test_vstrdq_scatter_base_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> [[ADDR:%.*]], i32 472, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_u64(uint64x2_t addr, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base(addr, 0x1d8, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_u64(addr, 0x1d8, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 248, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_p_s64(uint64x2_t *addr, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb_p(addr, 0xf8, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_p_s64(addr, 0xf8, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> [[TMP0]], i32 136, <2 x i64> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <2 x i64> [[TMP3]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_p_u64(uint64x2_t *addr, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb_p(addr, 0x88, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_p_u64(addr, 0x88, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 208, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_s64(uint64x2_t *addr, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb(addr, 0xd0, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_s64(addr, 0xd0, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_base_wb_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i64>, <2 x i64>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> [[TMP0]], i32 168, <2 x i64> [[VALUE:%.*]]) +// CHECK-NEXT: store <2 x i64> [[TMP1]], <2 x i64>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_base_wb_u64(uint64x2_t *addr, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_base_wb(addr, 0xa8, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_base_wb_u64(addr, 0xa8, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_p_s64( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_p_s64(int64_t *base, uint64x2_t offset, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_p_s64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_p_u64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_s64(int64_t *base, uint64x2_t offset, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_s64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_offset_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_offset_u64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_p_s64(int64_t *base, uint64x2_t offset, int64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_p_s64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_p_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* 
[[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_p_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_p_u64(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_s64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_s64(int64_t *base, uint64x2_t offset, int64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_s64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrdq_scatter_shifted_offset_u64( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* [[BASE:%.*]], <2 x i64> [[OFFSET:%.*]], <2 x i64> [[VALUE:%.*]], i32 64, i32 3) +// CHECK-NEXT: ret void +// +void test_vstrdq_scatter_shifted_offset_u64(uint64_t *base, uint64x2_t offset, uint64x2_t value) +{ +#ifdef POLYMORPHIC + vstrdq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrdq_scatter_shifted_offset_u64(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_f16(float16_t *base, uint16x8_t offset, float16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_f16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_f16(float16_t *base, uint16x8_t offset, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_f16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_s16(int16_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_s16(base, offset, value, p); 
+#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_s32(int16_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_p_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_s16(int16_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_s32(int16_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: 
ret void +// +void test_vstrhq_scatter_offset_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_offset_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_f16(float16_t *base, uint16x8_t offset, float16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_f16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x half> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_f16(float16_t *base, uint16x8_t offset, float16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_f16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_s16(int16_t *base, uint16x8_t offset, int16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_s16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_s32(int16_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, 
offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_u16(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_p_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_s16(int16_t *base, uint16x8_t offset, int16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_s16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_s32(int16_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* [[BASE:%.*]], <8 x i16> [[OFFSET:%.*]], <8 x i16> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_u16(uint16_t *base, uint16x8_t offset, uint16x8_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_u16(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrhq_scatter_shifted_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* [[BASE:%.*]], <4 x i32> 
[[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 16, i32 1) +// CHECK-NEXT: ret void +// +void test_vstrhq_scatter_shifted_offset_u32(uint16_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrhq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrhq_scatter_shifted_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> [[ADDR:%.*]], i32 380, <4 x float> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_f32(uint32x4_t addr, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0x17c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_f32(addr, 0x17c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> [[ADDR:%.*]], i32 400, <4 x float> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_f32(uint32x4_t addr, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x190, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_f32(addr, 0x190, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 48, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_s32(uint32x4_t addr, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x30, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_s32(addr, 0x30, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> [[ADDR:%.*]], i32 376, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_p_u32(uint32x4_t addr, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_p(addr, 0x178, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_p_u32(addr, 0x178, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 156, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_s32(uint32x4_t addr, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0x9c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_s32(addr, 0x9c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> [[ADDR:%.*]], i32 212, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: ret 
void +// +void test_vstrwq_scatter_base_u32(uint32x4_t addr, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base(addr, 0xd4, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_u32(addr, 0xd4, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> [[TMP0]], i32 412, <4 x float> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_f32(uint32x4_t *addr, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x19c, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_f32(addr, 0x19c, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> [[TMP0]], i32 236, <4 x float> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_f32(uint32x4_t *addr, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0xec, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_f32(addr, 0xec, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 328, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_s32(uint32x4_t *addr, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0x148, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_s32(addr, 0x148, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> [[TMP0]], i32 412, <4 x i32> [[VALUE:%.*]], <4 x i1> [[TMP2]]) +// CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_p_u32(uint32x4_t *addr, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb_p(addr, 0x19c, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_p_u32(addr, 0x19c, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_s32( +// CHECK-NEXT: entry: +// 
CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 152, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_s32(uint32x4_t *addr, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x98, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_s32(addr, 0x98, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_base_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[ADDR:%.*]], align 8 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> [[TMP0]], i32 64, <4 x i32> [[VALUE:%.*]]) +// CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[ADDR]], align 8 +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_base_wb_u32(uint32x4_t *addr, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_base_wb(addr, 0x40, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_base_wb_u32(addr, 0x40, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_f32(float32_t *base, uint32x4_t offset, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_f32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_f32(float32_t *base, uint32x4_t offset, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_f32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_s32(int32_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_s32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_p_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* 
[[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_p_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_p_u32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_s32(int32_t *base, uint32x4_t offset, int32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_s32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_offset_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 0) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_offset_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_offset_u32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 2) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_f32(float32_t *base, uint32x4_t offset, float32x4_t value) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset(base, offset, value); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_f32(base, offset, value); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_f32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x float> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_p_f32(float32_t *base, uint32x4_t offset, float32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_p_f32(base, offset, value, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret void +// +void test_vstrwq_scatter_shifted_offset_p_s32(int32_t *base, uint32x4_t offset, int32x4_t value, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + vstrwq_scatter_shifted_offset_p(base, offset, value, p); +#else /* POLYMORPHIC */ + vstrwq_scatter_shifted_offset_p_s32(base, offset, 
value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_p_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32
+// CHECK-NEXT:    [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]])
+// CHECK-NEXT:    call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2, <4 x i1> [[TMP1]])
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_scatter_shifted_offset_p_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value, mve_pred16_t p)
+{
+#ifdef POLYMORPHIC
+  vstrwq_scatter_shifted_offset_p(base, offset, value, p);
+#else /* POLYMORPHIC */
+  vstrwq_scatter_shifted_offset_p_u32(base, offset, value, p);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_s32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2)
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_scatter_shifted_offset_s32(int32_t *base, uint32x4_t offset, int32x4_t value)
+{
+#ifdef POLYMORPHIC
+  vstrwq_scatter_shifted_offset(base, offset, value);
+#else /* POLYMORPHIC */
+  vstrwq_scatter_shifted_offset_s32(base, offset, value);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vstrwq_scatter_shifted_offset_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* [[BASE:%.*]], <4 x i32> [[OFFSET:%.*]], <4 x i32> [[VALUE:%.*]], i32 32, i32 2)
+// CHECK-NEXT:    ret void
+//
+void test_vstrwq_scatter_shifted_offset_u32(uint32_t *base, uint32x4_t offset, uint32x4_t value)
+{
+#ifdef POLYMORPHIC
+  vstrwq_scatter_shifted_offset(base, offset, value);
+#else /* POLYMORPHIC */
+  vstrwq_scatter_shifted_offset_u32(base, offset, value);
+#endif /* POLYMORPHIC */
+}
+
diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/arm-mve-immediates.c
@@ -0,0 +1,56 @@
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -verify -fsyntax-only %s
+
+#include <arm_mve.h>
+
+void test_load_offsets(uint32x4_t addr32, uint64x2_t addr64)
+{
+  // Offsets that should be a multiple of 8 times 0,1,...,127
+  vldrdq_gather_base_s64(addr64, 0);
+  vldrdq_gather_base_s64(addr64, 8);
+  vldrdq_gather_base_s64(addr64, 2*8);
+  vldrdq_gather_base_s64(addr64, 125*8);
+  vldrdq_gather_base_s64(addr64, 126*8);
+  vldrdq_gather_base_s64(addr64, 127*8);
+  vldrdq_gather_base_s64(addr64, -8); // expected-error {{argument value -8 is outside the valid range [0, 1016]}}
+  vldrdq_gather_base_s64(addr64, 128*8); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}}
+  vldrdq_gather_base_s64(addr64, 4); // expected-error {{argument should be a multiple of 8}}
+  vldrdq_gather_base_s64(addr64, 1); // expected-error {{argument should be a multiple of 8}}
+
+  // Offsets that should be a multiple of 4 times 0,1,...,127
+  vldrwq_gather_base_s32(addr32, 0);
+  vldrwq_gather_base_s32(addr32, 4);
+  vldrwq_gather_base_s32(addr32, 2*4);
+  vldrwq_gather_base_s32(addr32, 125*4);
+  vldrwq_gather_base_s32(addr32, 126*4);
+  vldrwq_gather_base_s32(addr32, 127*4);
+  vldrwq_gather_base_s32(addr32, -4); // expected-error {{argument value -4 is outside the valid range [0, 508]}}
+  vldrwq_gather_base_s32(addr32, 128*4); // expected-error {{argument value 512 is outside the valid range [0, 508]}}
+  vldrwq_gather_base_s32(addr32, 2); // expected-error {{argument should be a multiple of 4}}
+  vldrwq_gather_base_s32(addr32, 1); // expected-error {{argument should be a multiple of 4}}
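+
+  // (Sanity check on the bounds above: the offset must be the access size
+  // times an integer in the range [0,127], so 64-bit accesses allow
+  // 0 .. 127*8 = 1016 and 32-bit accesses allow 0 .. 127*4 = 508.)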
+
+  // Show that the polymorphic store intrinsics get the right set of
+  // error checks after overload resolution. These ones expand to the
+  // 8-byte granular versions...
+  vstrdq_scatter_base(addr64, 0, addr64);
+  vstrdq_scatter_base(addr64, 8, addr64);
+  vstrdq_scatter_base(addr64, 2*8, addr64);
+  vstrdq_scatter_base(addr64, 125*8, addr64);
+  vstrdq_scatter_base(addr64, 126*8, addr64);
+  vstrdq_scatter_base(addr64, 127*8, addr64);
+  vstrdq_scatter_base(addr64, -8, addr64); // expected-error {{argument value -8 is outside the valid range [0, 1016]}}
+  vstrdq_scatter_base(addr64, 128*8, addr64); // expected-error {{argument value 1024 is outside the valid range [0, 1016]}}
+  vstrdq_scatter_base(addr64, 4, addr64); // expected-error {{argument should be a multiple of 8}}
+  vstrdq_scatter_base(addr64, 1, addr64); // expected-error {{argument should be a multiple of 8}}
+
+  // ... and these ones to the 4-byte.
+  vstrwq_scatter_base(addr32, 0, addr32);
+  vstrwq_scatter_base(addr32, 4, addr32);
+  vstrwq_scatter_base(addr32, 2*4, addr32);
+  vstrwq_scatter_base(addr32, 125*4, addr32);
+  vstrwq_scatter_base(addr32, 126*4, addr32);
+  vstrwq_scatter_base(addr32, 127*4, addr32);
+  vstrwq_scatter_base(addr32, -4, addr32); // expected-error {{argument value -4 is outside the valid range [0, 508]}}
+  vstrwq_scatter_base(addr32, 128*4, addr32); // expected-error {{argument value 512 is outside the valid range [0, 508]}}
+  vstrwq_scatter_base(addr32, 2, addr32); // expected-error {{argument should be a multiple of 4}}
+  vstrwq_scatter_base(addr32, 1, addr32); // expected-error {{argument should be a multiple of 4}}
+}
diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp
--- a/clang/utils/TableGen/MveEmitter.cpp
+++ b/clang/utils/TableGen/MveEmitter.cpp
@@ -204,6 +204,9 @@
       Name = "const " + Name;
     return Name + " *";
   }
+  std::string llvmName() const override {
+    return "llvm::PointerType::getUnqual(" + Pointee->llvmName() + ")";
+  }
 
   static bool classof(const Type *T) {
     return T->typeKind() == TypeKind::Pointer;
@@ -512,6 +515,11 @@
   void setVarname(const StringRef s) { VarName = s; }
   bool varnameUsed() const { return VarNameUsed; }
 
+  // Emit code to generate this result as a Value *.
+  virtual std::string asValue() {
+    return varname();
+  }
+
   // Code generation happens in multiple passes. This method tracks whether a
   // Result has yet been visited in a given pass, without the need for a
   // tedious loop in between passes that goes through and resets a 'visited'
@@ -547,6 +555,12 @@
   std::string typeName() const override {
     return AddressType ? "Address" : Result::typeName();
   }
+
+  // Emit code to generate this result as a Value *.
+  std::string asValue() override {
+    if (AddressType)
+      return "(" + varname() + ".getPointer())";
+    return Result::asValue();
+  }
 };
 
 // Result subclass for an integer literal appearing in Tablegen. This may need
@@ -665,7 +679,7 @@
     OS << "), llvm::SmallVector {";
     const char *Sep = "";
     for (auto Arg : Args) {
-      OS << Sep << Arg->varname();
+      OS << Sep << Arg->asValue();
       Sep = ", ";
     }
     OS << "})";
@@ -974,17 +988,15 @@
     return getPointerType(Pointee, Op->getValueAsBit("const"));
   }
 
-  if (Op->isSubClassOf("CTO_Sign")) {
-    const ScalarType *ST = cast<ScalarType>(getType(D->getArg(0), Param));
-    ScalarTypeKind NewKind = Op->getValueAsBit("signed")
-                                 ? ScalarTypeKind::SignedInt
-                                 : ScalarTypeKind::UnsignedInt;
+  if (Op->getName() == "CTO_CopyKind") {
+    const ScalarType *STSize = cast<ScalarType>(getType(D->getArg(0), Param));
+    const ScalarType *STKind = cast<ScalarType>(getType(D->getArg(1), Param));
     for (const auto &kv : ScalarTypes) {
       const ScalarType *RT = kv.second.get();
-      if (RT->kind() == NewKind && RT->sizeInBits() == ST->sizeInBits())
+      if (RT->kind() == STKind->kind() && RT->sizeInBits() == STSize->sizeInBits())
         return RT;
     }
-    PrintFatalError("Cannot change sign of this type");
+    PrintFatalError("Cannot find a type to satisfy CopyKind");
   }
 
   PrintFatalError("Bad operator in type dag expression");
@@ -1025,6 +1037,18 @@
       }
     }
     PrintFatalError("Unsupported type cast");
+  } else if (Op->getName() == "unsignedflag") {
+    if (D->getNumArgs() != 1)
+      PrintFatalError("unsignedflag should have exactly one argument");
+    Record *TypeRec = cast<DefInit>(D->getArg(0))->getDef();
+    if (!TypeRec->isSubClassOf("Type"))
+      PrintFatalError("unsignedflag's argument should be a type");
+    if (const auto *ST = dyn_cast<ScalarType>(getType(TypeRec, Param))) {
+      return std::make_shared<IntLiteralResult>(
+          getScalarType("u32"), ST->kind() == ScalarTypeKind::UnsignedInt);
+    } else {
+      PrintFatalError("unsignedflag's argument should be a scalar type");
+    }
   } else {
     std::vector<Result::Ptr> Args;
     for (unsigned i = 0, e = D->getNumArgs(); i < e; ++i)
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -812,17 +812,43 @@
 defm int_arm_mve_maxv: IntrinsicSignSuffix<[llvm_i32_ty],
   [llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
 
-def int_arm_mve_vcvt_narrow: Intrinsic<[llvm_v8f16_ty],
-  [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], [IntrNoMem]>;
-def int_arm_mve_vcvt_narrow_predicated: Intrinsic<[llvm_v8f16_ty],
-  [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty, llvm_v4i1_ty], [IntrNoMem]>;
+multiclass MVEPredicated<list<LLVMType> rets, list<LLVMType> params,
+                         LLVMType pred, list<IntrinsicProperty> props = []> {
+  def "": Intrinsic<rets, params, props>;
+  def _predicated: Intrinsic<rets, !listconcat(params, [pred]), props>;
+}
 
-def int_arm_mve_vldr_gather_base_wb: Intrinsic<
-  [llvm_anyvector_ty, llvm_anyvector_ty],
-  [LLVMMatchType<1>, llvm_i32_ty], [IntrReadMem]>;
-def int_arm_mve_vldr_gather_base_wb_predicated: Intrinsic<
+defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty],
+  [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty, [IntrNoMem]>;
+
+defm int_arm_mve_vldr_gather_base: MVEPredicated<
+  [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty],
+  llvm_anyvector_ty, [IntrReadMem]>;
+defm int_arm_mve_vldr_gather_base_wb: MVEPredicated<
   [llvm_anyvector_ty, llvm_anyvector_ty],
-  [LLVMMatchType<1>, llvm_i32_ty, llvm_anyvector_ty], [IntrReadMem]>;
+  [LLVMMatchType<1>, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>;
+defm int_arm_mve_vstr_scatter_base: MVEPredicated<
+  [], [llvm_anyvector_ty, llvm_i32_ty, llvm_anyvector_ty],
+  llvm_anyvector_ty, [IntrWriteMem]>;
+defm int_arm_mve_vstr_scatter_base_wb: MVEPredicated<
+  [llvm_anyvector_ty], [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty],
+  llvm_anyvector_ty, [IntrWriteMem]>;
+
+// gather_offset takes three i32 parameters. The first is the size of the
+// memory element loaded, in bits. The second is a left bit shift to
+// apply to each offset in the vector parameter (it must be either 0, or
+// correspond to the element size of the destination vector type). The
+// last is 1 to indicate zero extension (if the load is widening), or
+// 0 for sign extension.
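+//
+// For example (an illustrative call, in the style of the IR tests later in
+// this patch), a gather that widens four 16-bit memory elements into a
+// <4 x i32> result, shifting each offset left by 1 and zero-extending the
+// loaded values, would look like:
+//
+//   %r = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(
+//            i16* %base, <4 x i32> %offsets, i32 16, i32 1, i32 1)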
+//
+// scatter_offset has the first two of those parameters, but since it
+// narrows rather than widens, it doesn't have the last one.
+defm int_arm_mve_vldr_gather_offset: MVEPredicated<
+  [llvm_anyvector_ty], [llvm_anyptr_ty, llvm_anyvector_ty,
+  llvm_i32_ty, llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrReadMem]>;
+defm int_arm_mve_vstr_scatter_offset: MVEPredicated<
+  [], [llvm_anyptr_ty, llvm_anyvector_ty, llvm_anyvector_ty,
+  llvm_i32_ty, llvm_i32_ty], llvm_anyvector_ty, [IntrWriteMem]>;
 
 def int_arm_mve_urshrl: Intrinsic<
   [llvm_i32_ty, llvm_i32_ty], [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -278,7 +278,7 @@
 // A family of classes wrapping up information about the vector types
 // used by MVE.
 class MVEVectorVTInfo<ValueType vec, ValueType pred, bits<2> size,
-                      string suffix, bit unsigned> {
+                      string suffixletter, bit unsigned> {
   // The LLVM ValueType representing the vector, so we can use it in
   // ISel patterns.
   ValueType Vec = vec;
@@ -304,32 +304,39 @@
   // signed and 1 for unsigned. For anything else, undefined.
   bit Unsigned = unsigned;
 
-  // The suffix used on the instruction in assembly language.
-  string Suffix = suffix;
+  // The number of bits in a vector element, in integer form.
+  int LaneBits = !shl(8, Size);
+
+  // The suffix used in assembly language on an instruction operating
+  // on this lane, if it only cares about the number of bits.
+  string BitsSuffix = !cast<string>(LaneBits);
+
+  // The suffix used on an instruction that mentions the whole type.
+  string Suffix = suffixletter ## BitsSuffix;
 }
 
 // Integer vector types that don't treat signed and unsigned differently.
-def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i8", ?>;
-def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "i16", ?>;
-def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "i32", ?>;
-def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "i64", ?>;
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "i", ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "i", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "i", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "i", ?>;
 
 // Explicitly signed and unsigned integer vectors. They map to the
 // same set of LLVM ValueTypes as above, but are represented
 // differently in assembly and instruction encodings.
-def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s8", 0b0>;
-def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "s16", 0b0>;
-def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "s32", 0b0>;
-def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "s64", 0b0>;
-def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u8", 0b1>;
-def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "u16", 0b1>;
-def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "u32", 0b1>;
-def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "u64", 0b1>;
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "s", 0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "s", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "s", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "s", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v16i1, 0b00, "u", 0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v8i1,  0b01, "u", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v4i1,  0b10, "u", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, v4i1,  0b11, "u", 0b1>;
 
 // FP vector types.
-def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1, 0b01, "f16", ?>;
-def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1, 0b10, "f32", ?>;
-def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1, 0b11, "f64", ?>;
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v8i1, 0b01, "f", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v4i1, 0b10, "f", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, v4i1, 0b11, "f", ?>;
 
 // --------- Start of base classes for the instructions themselves
@@ -4614,28 +4621,80 @@
     string asm, string suffix, bit U, bits<2> size>
   : MVE_VLDRSTR_rq;
 
+// Multiclasses wrapping that to add ISel patterns for intrinsics.
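+// (A sketch of what each one does, for orientation: for every vector type
+// it covers, it emits Pat<> records that match the generic intrinsic,
+// including its explicit memory-size, shift and extension operands, and
+// select the specific instruction whose encoding implies those values.)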
+multiclass MVE_VLDR_rq_w VTIs> { + defm "": MVE_VLDRSTR_rq_w; + foreach VTI = VTIs in + foreach UnsignedFlag = !if(!eq(VTI.Size, memsz.encoding), + [0,1], [VTI.Unsigned]) in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag)), + (VTI.Vec (!cast(NAME#"_u") GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag)), + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, 0, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast(NAME#"_u") GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), memsz.TypeBits, memsz.shift, UnsignedFlag, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + } +} +multiclass MVE_VLDR_rq_b VTIs> { + def "": MVE_VLDRSTR_rq_b; + foreach VTI = VTIs in { + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned)), + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets))>; + def : Pat<(VTI.Vec (int_arm_mve_vldr_gather_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), 8, 0, VTI.Unsigned, (VTI.Pred VCCR:$pred))), + (VTI.Vec (!cast(NAME) GPR:$base, MQPR:$offsets, 1, VCCR:$pred))>; + } +} +multiclass MVE_VSTR_rq_w VTIs> { + defm "": MVE_VLDRSTR_rq_w; + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0), + (!cast(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift), + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, 0, (VTI.Pred VCCR:$pred)), + (!cast(NAME#"_u") MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), memsz.TypeBits, memsz.shift, (VTI.Pred VCCR:$pred)), + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + } +} +multiclass MVE_VSTR_rq_b VTIs> { + def "": MVE_VLDRSTR_rq_b; + foreach VTI = VTIs in { + def : Pat<(int_arm_mve_vstr_scatter_offset GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0), + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets)>; + def : Pat<(int_arm_mve_vstr_scatter_offset_predicated GPR:$base, (VTIs[0].Vec MQPR:$offsets), (VTI.Vec MQPR:$data), 8, 0, (VTI.Pred VCCR:$pred)), + (!cast(NAME) MQPR:$data, GPR:$base, MQPR:$offsets, 1, VCCR:$pred)>; + } +} + // Actually define all the loads and stores in this family. 
-def MVE_VLDRBU8_rq : MVE_VLDRSTR_rq_b; -def MVE_VLDRBU16_rq: MVE_VLDRSTR_rq_b; -def MVE_VLDRBS16_rq: MVE_VLDRSTR_rq_b; -def MVE_VLDRBU32_rq: MVE_VLDRSTR_rq_b; -def MVE_VLDRBS32_rq: MVE_VLDRSTR_rq_b; +defm MVE_VLDRBU8_rq : MVE_VLDR_rq_b<[MVE_v16u8,MVE_v16s8]>; +defm MVE_VLDRBU16_rq: MVE_VLDR_rq_b<[MVE_v8u16]>; +defm MVE_VLDRBS16_rq: MVE_VLDR_rq_b<[MVE_v8s16]>; +defm MVE_VLDRBU32_rq: MVE_VLDR_rq_b<[MVE_v4u32]>; +defm MVE_VLDRBS32_rq: MVE_VLDR_rq_b<[MVE_v4s32]>; -defm MVE_VLDRHU16_rq: MVE_VLDRSTR_rq_w; -defm MVE_VLDRHU32_rq: MVE_VLDRSTR_rq_w; -defm MVE_VLDRHS32_rq: MVE_VLDRSTR_rq_w; -defm MVE_VLDRWU32_rq: MVE_VLDRSTR_rq_w; -defm MVE_VLDRDU64_rq: MVE_VLDRSTR_rq_w; +defm MVE_VLDRHU16_rq: MVE_VLDR_rq_w; +defm MVE_VLDRHU32_rq: MVE_VLDR_rq_w; +defm MVE_VLDRHS32_rq: MVE_VLDR_rq_w; +defm MVE_VLDRWU32_rq: MVE_VLDR_rq_w; +defm MVE_VLDRDU64_rq: MVE_VLDR_rq_w; -def MVE_VSTRB8_rq : MVE_VLDRSTR_rq_b; -def MVE_VSTRB16_rq : MVE_VLDRSTR_rq_b; -def MVE_VSTRB32_rq : MVE_VLDRSTR_rq_b; +defm MVE_VSTRB8_rq : MVE_VSTR_rq_b<[MVE_v16i8]>; +defm MVE_VSTRB16_rq : MVE_VSTR_rq_b<[MVE_v8i16]>; +defm MVE_VSTRB32_rq : MVE_VSTR_rq_b<[MVE_v4i32]>; -defm MVE_VSTRH16_rq : MVE_VLDRSTR_rq_w; -defm MVE_VSTRH32_rq : MVE_VLDRSTR_rq_w; -defm MVE_VSTRW32_rq : MVE_VLDRSTR_rq_w; -defm MVE_VSTRD64_rq : MVE_VLDRSTR_rq_w; +defm MVE_VSTRH16_rq : MVE_VSTR_rq_w; +defm MVE_VSTRH32_rq : MVE_VSTR_rq_w; +defm MVE_VSTRW32_rq : MVE_VSTR_rq_w; +defm MVE_VSTRD64_rq : MVE_VSTR_rq_w; // Gather loads / scatter stores whose address operand is of the form // [Qm,#imm], i.e. a vector containing a full base address for each @@ -4674,11 +4733,56 @@ } } +// Multiclasses wrapping that one, adding selection patterns for the +// non-writeback loads and all the stores. (The writeback loads must +// deliver multiple output values, so they have to be selected by C++ +// code.) +multiclass MVE_VLDR_qi DVTIs> { + defm "" : MVE_VLDRSTR_qi_m; + + foreach DVTI = DVTIs in { + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset))), + (DVTI.Vec (!cast(NAME) + (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>; + def : Pat<(DVTI.Vec (int_arm_mve_vldr_gather_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (AVTI.Pred VCCR:$pred))), + (DVTI.Vec (!cast(NAME) + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + } +} +multiclass MVE_VSTR_qi DVTIs> { + defm "" : MVE_VLDRSTR_qi_m(memsz.TypeBits)>; + + foreach DVTI = DVTIs in { + def : Pat<(int_arm_mve_vstr_scatter_base + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data)), + (!cast(NAME) + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset))>; + def : Pat<(int_arm_mve_vstr_scatter_base_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred)), + (!cast(NAME) + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred)>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data))), + (AVTI.Vec (!cast(NAME # "_pre") + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset)))>; + def : Pat<(AVTI.Vec (int_arm_mve_vstr_scatter_base_wb_predicated + (AVTI.Vec MQPR:$addr), (i32 imm:$offset), (DVTI.Vec MQPR:$data), (AVTI.Pred VCCR:$pred))), + (AVTI.Vec (!cast(NAME # "_pre") + (DVTI.Vec MQPR:$data), (AVTI.Vec MQPR:$addr), (i32 imm:$offset), 1, VCCR:$pred))>; + } +} + // Actual instruction definitions. 
-defm MVE_VLDRWU32_qi: MVE_VLDRSTR_qi_m;
-defm MVE_VLDRDU64_qi: MVE_VLDRSTR_qi_m;
-defm MVE_VSTRW32_qi: MVE_VLDRSTR_qi_m;
-defm MVE_VSTRD64_qi: MVE_VLDRSTR_qi_m;
+defm MVE_VLDRWU32_qi: MVE_VLDR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
+defm MVE_VLDRDU64_qi: MVE_VLDR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64]>;
+defm MVE_VSTRW32_qi: MVE_VSTR_qi<MVE_memW, MVE_v4i32, [MVE_v4i32,MVE_v4f32]>;
+defm MVE_VSTRD64_qi: MVE_VSTR_qi<MVE_memD, MVE_v2i64, [MVE_v2i64]>;
 
 // Define aliases for all the instructions where memory size and
 // vector lane size are the same. These are mnemonic aliases, so they
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/scatter-gather.ll
@@ -0,0 +1,2018 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_s16(i8* %base, <8 x i16> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_s16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s16 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0)
+  ret <8 x i16> %0
+}
+
+declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8*, <8 x i16>, i32, i32, i32)
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_s32(i8* %base, <4 x i32> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_s32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.s32 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0)
+  ret <4 x i32> %0
+}
+
+declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8*, <4 x i32>, i32, i32, i32)
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_s8(i8* %base, <16 x i8> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_s8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0)
+  ret <16 x i8> %0
+}
+
+declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8*, <16 x i8>, i32, i32, i32)
+
+define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_u16(i8* %base, <8 x i16> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u16 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i8.v8i16(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1)
+  ret <8 x i16> %0
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_u32(i8* %base, <4 x i32> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u32 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 1)
+  ret <4 x i32> %0
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_u8(i8* %base, <16 x i8> %offset) {
+; CHECK-LABEL: test_vldrbq_gather_offset_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vldrb.u8 q1, [r0, q0]
+; CHECK-NEXT:    vmov q0, q1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = call <16 x i8>
@llvm.arm.mve.vldr.gather.offset.v16i8.p0i8.v16i8(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1) + ret <16 x i8> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_s16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_s32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* %base, <4 x i32> %offset, i32 8, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_s8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 0, <16 x i1> %1) + ret <16 x i8> %2 +} + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) + +declare <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8*, <16 x i8>, i32, i32, i32, <16 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrbq_gather_offset_z_u16(i8* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i8.v8i16.v8i1(i8* %base, <8 x i16> %offset, i32 8, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrbq_gather_offset_z_u32(i8* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i8.v4i32.v4i1(i8* 
%base, <4 x i32> %offset, i32 8, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vldrbq_gather_offset_z_u8(i8* %base, <16 x i8> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrbq_gather_offset_z_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrbt.u8 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.mve.vldr.gather.offset.predicated.v16i8.p0i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, i32 8, i32 0, i32 1, <16 x i1> %1) + ret <16 x i8> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_s64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #616] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 616) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_u64(<2 x i64> %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [q0, #336] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.v2i64.v2i64(<2 x i64> %addr, i32 336) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_s64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #576]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 576) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64>, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_u64(<2 x i64>* %addr) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrd.u64 q1, [q0, #328]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.v2i64.v2i64(<2 x i64> %0, i32 328) + %2 = extractvalue { <2 x i64>, <2 x i64> } %1, 1 + store <2 x i64> %2, <2 x i64>* %addr, align 8 + %3 = extractvalue { <2 x i64>, <2 x i64> } %1, 0 + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_s64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #664]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 664, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +declare { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_wb_z_u64(<2 x i64>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_wb_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #656]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <2 x i64>, <2 x i64> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 656, <4 x i1> %2) + %4 = extractvalue { <2 x i64>, <2 x i64> } %3, 1 + store <2 x i64> %4, <2 x i64>* %addr, align 8 + %5 = extractvalue { <2 x i64>, <2 x i64> } %3, 0 + ret <2 x i64> %5 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_s64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #888] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_base_z_u64(<2 x i64> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_base_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [q0, #1000] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 1000, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0) + ret <2 x i64> %0 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64*, <2 x i64>, i32, i32, i32) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1) + ret 
<2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +declare <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64*, <2 x i64>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 0, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_s64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_u64(i64* %base, <2 x i64> %offset) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrd.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.v2i64.p0i64.v2i64(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1) + ret <2 x i64> %0 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_s64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 0, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vldrdq_gather_shifted_offset_z_u64(i64* %base, <2 x i64> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrdq_gather_shifted_offset_z_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrdt.u64 q1, [r0, q0, uxtw #3] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <2 x i64> @llvm.arm.mve.vldr.gather.offset.predicated.v2i64.p0i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, i32 64, i32 3, i32 1, <4 x i1> %1) + ret <2 x i64> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_f16(half* %base, <8 x 
i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x half> %0 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0) + ret <8 x i16> %0 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16*, <8 x i16>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +declare <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> 
@llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +declare <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16*, <8 x i16>, i32, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 0, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_f16(half* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.v8f16.p0f16.v8i16(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x half> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_s16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_s32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0) + ret <4 x i32> %0 +} + +define 
arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_u16(i16* %base, <8 x i16> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.v8i16.p0i16.v8i16(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1) + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_u32(i16* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i16.v4i32(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x half> @test_vldrhq_gather_shifted_offset_z_f16(half* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x half> @llvm.arm.mve.vldr.gather.offset.predicated.v8f16.p0f16.v8i16.v8i1(half* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x half> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_s16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 0, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_s32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.s32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vldrhq_gather_shifted_offset_z_u16(i16* %base, <8 x i16> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.mve.vldr.gather.offset.predicated.v8i16.p0i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, i32 16, i32 1, i32 1, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrhq_gather_shifted_offset_z_u32(i16* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrhq_gather_shifted_offset_z_u32: +; 
CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u32 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i16.v4i32.v4i1(i16* %base, <4 x i32> %offset, i32 16, i32 1, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_f32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #12] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32> %addr, i32 12) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_s32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #400] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 400) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_u32(<4 x i32> %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [q0, #284] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4i32(<4 x i32> %addr, i32 284) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_f32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #64]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32> %0, i32 64) + %2 = extractvalue { <4 x float>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x float>, <4 x i32> } %1, 0 + ret <4 x float> %3 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4f32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_s32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #80]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 80) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32>, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_u32(<4 x i32>* %addr) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [q0, #480]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %0, i32 480) + %2 = extractvalue { <4 x i32>, <4 x i32> } %1, 1 + store <4 x i32> %2, <4 x i32>* %addr, align 8 + %3 = extractvalue { <4 x i32>, <4 x i32> } %1, 0 + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_wb_z_f32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #352]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32> %0, i32 352, <4 x i1> %2) + %4 = extractvalue { <4 x float>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x float>, <4 x i32> } %3, 0 + ret <4 x float> %5 +} + +declare { <4 x float>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_s32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #276]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 276, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +declare { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_wb_z_u32(<4 x i32>* %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_wb_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #88]! 
+; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <4 x i32>, <4 x i32>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call { <4 x i32>, <4 x i32> } @llvm.arm.mve.vldr.gather.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 88, <4 x i1> %2) + %4 = extractvalue { <4 x i32>, <4 x i32> } %3, 1 + store <4 x i32> %4, <4 x i32>* %addr, align 8 + %5 = extractvalue { <4 x i32>, <4 x i32> } %3, 0 + ret <4 x i32> %5 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_base_z_f32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.base.predicated.v4f32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_s32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #440] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 440, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_base_z_u32(<4 x i32> %addr, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_base_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [q0, #300] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 300, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x float> %0 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0) + ret <4 x i32> %0 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32*, <4 x i32>, i32, i32, i32) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: 
test_vldrwq_gather_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +declare <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32*, <4 x i32>, i32, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 0, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_f32(float* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.v4f32.p0f32.v4i32(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_s32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_u32(i32* %base, <4 x i32> %offset) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: 
+ %0 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1) + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vldrwq_gather_shifted_offset_z_f32(float* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_f32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x float> @llvm.arm.mve.vldr.gather.offset.predicated.v4f32.p0f32.v4i32.v4i1(float* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_s32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 0, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vldrwq_gather_shifted_offset_z_u32(i32* %base, <4 x i32> %offset, i16 zeroext %p) { +; CHECK-LABEL: test_vldrwq_gather_shifted_offset_z_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: vmov q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.predicated.v4i32.p0i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, i32 32, i32 2, i32 1, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 
zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8*, <16 x i8>, <16 x i8>, i32, i32, <16 x i1>) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v8i16.v8i16.v8i1(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0, <8 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v4i32.v4i32.v4i1(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0, <4 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_p_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrbq_scatter_offset_p_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrbt.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i8.v16i8.v16i8.v16i1(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0, <16 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s16(i8* %base, <8 x i16> %offset, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s32(i8* %base, <4 x i32> %offset, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_s8(i8* %base, <16 x i8> %offset, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_s8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: bx lr 
+entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u16(i8* %base, <8 x i16> %offset, <8 x i16> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.16 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v8i16.v8i16(i8* %base, <8 x i16> %offset, <8 x i16> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u32(i8* %base, <4 x i32> %offset, <4 x i32> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.32 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v4i32.v4i32(i8* %base, <4 x i32> %offset, <4 x i32> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrbq_scatter_offset_u8(i8* %base, <16 x i8> %offset, <16 x i8> %value) { +; CHECK-LABEL: test_vstrbq_scatter_offset_u8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrb.8 q1, [r0, q0] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.offset.p0i8.v16i8.v16i8(i8* %base, <16 x i8> %offset, <16 x i8> %value, i32 8, i32 0) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_s64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrdq_scatter_base_p_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q1, [q0, #888] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 888, <2 x i64> %value, <4 x i1> %1) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_p_u64(<2 x i64> %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrdq_scatter_base_p_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q1, [q0, #264] +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + call void @llvm.arm.mve.vstr.scatter.base.predicated.v2i64.v2i64.v4i1(<2 x i64> %addr, i32 264, <2 x i64> %value, <4 x i1> %1) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_s64(<2 x i64> %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrd.64 q1, [q0, #408] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 408, <2 x i64> %value) + ret void +} + +declare void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64>, i32, <2 x i64>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_u64(<2 x i64> %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vstrd.64 q1, [q0, #472] +; CHECK-NEXT: bx lr +entry: + call void @llvm.arm.mve.vstr.scatter.base.v2i64.v2i64(<2 x i64> %addr, i32 472, <2 x i64> %value) + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_s64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: 
test_vstrdq_scatter_base_wb_p_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q0, [q1, #248]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 248, <2 x i64> %value, <4 x i1> %2) + store <2 x i64> %3, <2 x i64>* %addr, align 8 + ret void +} + +declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64>, i32, <2 x i64>, <4 x i1>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_p_u64(<2 x i64>* %addr, <2 x i64> %value, i16 zeroext %p) { +; CHECK-LABEL: test_vstrdq_scatter_base_wb_p_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrdt.64 q0, [q1, #136]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v2i64.v2i64.v4i1(<2 x i64> %0, i32 136, <2 x i64> %value, <4 x i1> %2) + store <2 x i64> %3, <2 x i64>* %addr, align 8 + ret void +} + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_s64(<2 x i64>* %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_wb_s64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vstrd.64 q0, [q1, #208]! +; CHECK-NEXT: vstrw.32 q1, [r0] +; CHECK-NEXT: bx lr +entry: + %0 = load <2 x i64>, <2 x i64>* %addr, align 8 + %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 208, <2 x i64> %value) + store <2 x i64> %1, <2 x i64>* %addr, align 8 + ret void +} + +declare <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64>, i32, <2 x i64>) + +define arm_aapcs_vfpcc void @test_vstrdq_scatter_base_wb_u64(<2 x i64>* %addr, <2 x i64> %value) { +; CHECK-LABEL: test_vstrdq_scatter_base_wb_u64: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vstrd.64 q0, [q1, #168]! 
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <2 x i64>, <2 x i64>* %addr, align 8
+ %1 = call <2 x i64> @llvm.arm.mve.vstr.scatter.base.wb.v2i64.v2i64(<2 x i64> %0, i32 168, <2 x i64> %value)
+ store <2 x i64> %1, <2 x i64>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64*, <2 x i64>, <2 x i64>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_p_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64*, <2 x i64>, <2 x i64>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_offset_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_p_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_p_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrdt.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i64.v2i64.v2i64.v4i1(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_s64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_s64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrdq_scatter_shifted_offset_u64(i64* %base, <2 x i64> %offset, <2 x i64> %value) {
+; CHECK-LABEL: test_vstrdq_scatter_shifted_offset_u64:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrd.64 q1, [r0, q0, uxtw #3]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i64.v2i64.v2i64(i64* %base, <2 x i64> %offset, <2 x i64> %value, i32 64, i32 3)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half*, <8 x i16>, <8 x half>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half*, <8 x i16>, <8 x half>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16*, <8 x i16>, <8 x i16>, i32, i32, <8 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16*, <8 x i16>, <8 x i16>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_f16(half* %base, <8 x i16> %offset, <8 x half> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f16.v8i16.v8f16(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_f16(half* %base, <8 x i16> %offset, <8 x half> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_f16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f16.v8i16.v8f16.v8i1(half* %base, <8 x i16> %offset, <8 x half> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v8i16.v8i16.v8i1(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1, <8 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_p_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i16.v4i32.v4i32.v4i1(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_s32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u16(i16* %base, <8 x i16> %offset, <8 x i16> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u16:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.16 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v8i16.v8i16(i16* %base, <8 x i16> %offset, <8 x i16> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrhq_scatter_shifted_offset_u32(i16* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrhq_scatter_shifted_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrh.32 q1, [r0, q0, uxtw #1]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i16.v4i32.v4i32(i16* %base, <4 x i32> %offset, <4 x i32> %value, i32 16, i32 1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_f32(<4 x i32> %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #380]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32> %addr, i32 380, <4 x float> %value)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_f32(<4 x i32> %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #400]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32> %addr, i32 400, <4 x float> %value, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_s32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #48]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 48, <4 x i32> %value, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_p_u32(<4 x i32> %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r0
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [q0, #376]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.base.predicated.v4i32.v4i32.v4i1(<4 x i32> %addr, i32 376, <4 x i32> %value, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_s32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #156]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 156, <4 x i32> %value)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_u32(<4 x i32> %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [q0, #212]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.base.v4i32.v4i32(<4 x i32> %addr, i32 212, <4 x i32> %value)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_f32(<4 x i32>* %addr, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #412]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32> %0, i32 412, <4 x float> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4f32(<4 x i32>, i32, <4 x float>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_f32(<4 x i32>* %addr, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #236]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32> %0, i32 236, <4 x float> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4f32.v4i1(<4 x i32>, i32, <4 x float>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_s32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #328]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 328, <4 x i32> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32>, i32, <4 x i32>, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_p_u32(<4 x i32>* %addr, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q0, [q1, #412]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = zext i16 %p to i32
+ %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+ %3 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.predicated.v4i32.v4i32.v4i1(<4 x i32> %0, i32 412, <4 x i32> %value, <4 x i1> %2)
+ store <4 x i32> %3, <4 x i32>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_s32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #152]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 152, <4 x i32> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+declare <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32>, i32, <4 x i32>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_base_wb_u32(<4 x i32>* %addr, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_base_wb_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1, #64]!
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = load <4 x i32>, <4 x i32>* %addr, align 8
+ %1 = call <4 x i32> @llvm.arm.mve.vstr.scatter.base.wb.v4i32.v4i32(<4 x i32> %0, i32 64, <4 x i32> %value)
+ store <4 x i32> %1, <4 x i32>* %addr, align 8
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float*, <4 x i32>, <4 x float>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float*, <4 x i32>, <4 x float>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32*, <4 x i32>, <4 x i32>, i32, i32, <4 x i1>)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+ ret void
+}
+
+declare void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32*, <4 x i32>, <4 x i32>, i32, i32)
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 0)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_f32(float* %base, <4 x i32> %offset, <4 x float> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0f32.v4i32.v4f32(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_f32(float* %base, <4 x i32> %offset, <4 x float> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_f32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0f32.v4i32.v4f32.v4i1(float* %base, <4 x i32> %offset, <4 x float> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_p_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i16 zeroext %p) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_p_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vmsr p0, r1
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrwt.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ %0 = zext i16 %p to i32
+ %1 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+ call void @llvm.arm.mve.vstr.scatter.offset.predicated.p0i32.v4i32.v4i32.v4i1(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2, <4 x i1> %1)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_s32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_s32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+ ret void
+}
+
+define arm_aapcs_vfpcc void @test_vstrwq_scatter_shifted_offset_u32(i32* %base, <4 x i32> %offset, <4 x i32> %value) {
+; CHECK-LABEL: test_vstrwq_scatter_shifted_offset_u32:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: vstrw.32 q1, [r0, q0, uxtw #2]
+; CHECK-NEXT: bx lr
+entry:
+ call void @llvm.arm.mve.vstr.scatter.offset.p0i32.v4i32.v4i32(i32* %base, <4 x i32> %offset, <4 x i32> %value, i32 32, i32 2)
+ ret void
+}