diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td
--- a/clang/include/clang/Basic/arm_mve.td
+++ b/clang/include/clang/Basic/arm_mve.td
@@ -147,6 +147,47 @@
      (select $pred, (splat $s), $inactive), 1, "_n", PNT_NType, PNT_None>;
 }
 
+multiclass vxdup_mc<dag paramsIn, dag paramsOut> {
+  defvar UnpredInt = IRInt<NAME, [Vector]>;
+  defvar PredInt = IRInt<NAME # "_predicated", [Vector, Predicate]>;
+  defvar UnpredIntCall = !con((UnpredInt $base), paramsOut);
+  defvar PredIntCall = !con((PredInt $inactive, $base), paramsOut, (? $pred));
+
+  // Straightforward case with neither writeback nor predication
+  let pnt = PNT_N in
+  def q_n: Intrinsic<Vector, !con((args u32:$base), paramsIn),
+                     (xval UnpredIntCall, 0)>;
+
+  // Predicated form without writeback
+  defm q: IntrinsicMX<
+      Vector, !con((args u32:$base), paramsIn, (? Predicate:$pred)),
+      (xval PredIntCall, 0), 1, "_n", PNT_NType, PNT_N>;
+
+  // Writeback without predication
+  let pnt = PNT_WB in
+  def q_wb: Intrinsic<
+      Vector, !con((args Ptr<u32>:$baseaddr), paramsIn),
+      (seq (load $baseaddr):$base,
+           UnpredIntCall:$pair,
+           (store (xval $pair, 1), $baseaddr),
+           (xval $pair, 0))>;
+
+  // Both writeback and predicated
+  defm q: IntrinsicMX<
+      Vector, !con((args Ptr<u32>:$baseaddr), paramsIn, (? Predicate:$pred)),
+      (seq (load $baseaddr):$base,
+           PredIntCall:$pair,
+           (store (xval $pair, 1), $baseaddr),
+           (xval $pair, 0)), 1, "_wb", PNT_WBType, PNT_WB>;
+}
+
+let params = T.Unsigned in {
+  defm vidup: vxdup_mc<(? imm_1248:$step), (? $step)>;
+  defm vddup: vxdup_mc<(? imm_1248:$step), (? $step)>;
+  defm viwdup: vxdup_mc<(? u32:$limit, imm_1248:$step), (? $limit, $step)>;
+  defm vdwdup: vxdup_mc<(? u32:$limit, imm_1248:$step), (? $limit, $step)>;
+}
+
 // The bitcasting below is not overcomplicating the IR because while
 // Vector and UVector may be different vector types at the C level i.e.
 // vectors of same size signed/unsigned ints. Once they're lowered
diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td
--- a/clang/include/clang/Basic/arm_mve_defs.td
+++ b/clang/include/clang/Basic/arm_mve_defs.td
@@ -359,7 +359,9 @@
 def imm_1to32 : Immediate<u32, IntRange<1,32>>;
 
 // imm_1248 can be 1, 2, 4 or 8. (e.g. vidupq)
-def imm_1248 : Immediate<u32, IntRange<1,8>>;
+def imm_1248 : Immediate<u32, IntRange<1,8>> {
+  let extra = "Power2";
+}
diff --git a/clang/test/CodeGen/arm-mve-intrinsics/idup.c b/clang/test/CodeGen/arm-mve-intrinsics/idup.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/arm-mve-intrinsics/idup.c
@@ -0,0 +1,1289 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve -mfloat-abi hard -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s
+
+#include <arm_mve.h>
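+
+// Each vxdup intrinsic is exercised below in all four of the shapes
+// generated by the vxdup_mc multiclass: the _n form taking an immediate
+// base, the _wb form loading the base through a pointer and writing it
+// back, and the _m/_x predicated variants of each, which map to the
+// .predicated IR intrinsics with an explicit or undef inactive vector.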
+
+// CHECK-LABEL: @test_vidupq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 [[A:%.*]], i32 4)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vidupq_n_u8(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vidupq_u8(a, 4);
+#else /* POLYMORPHIC */
+  return vidupq_n_u8(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vidupq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 [[A:%.*]], i32 1)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vidupq_n_u16(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vidupq_u16(a, 1);
+#else /* POLYMORPHIC */
+  return vidupq_n_u16(a, 1);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vidupq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 [[A:%.*]], i32 4)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vidupq_n_u32(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vidupq_u32(a, 4);
+#else /* POLYMORPHIC */
+  return vidupq_n_u32(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vddupq_n_u8(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 [[A:%.*]], i32 2)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <16 x i8> [[TMP1]]
+//
+uint8x16_t test_vddupq_n_u8(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vddupq_u8(a, 2);
+#else /* POLYMORPHIC */
+  return vddupq_n_u8(a, 2);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vddupq_n_u16(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 [[A:%.*]], i32 4)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <8 x i16> [[TMP1]]
+//
+uint16x8_t test_vddupq_n_u16(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vddupq_u16(a, 4);
+#else /* POLYMORPHIC */
+  return vddupq_n_u16(a, 4);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_vddupq_n_u32(
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 [[A:%.*]], i32 2)
+// CHECK-NEXT:    [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 0
+// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+//
+uint32x4_t test_vddupq_n_u32(uint32_t a)
+{
+#ifdef POLYMORPHIC
+  return vddupq_u32(a, 2);
+#else /* POLYMORPHIC */
+  return vddupq_n_u32(a, 2);
+#endif /* POLYMORPHIC */
+}
+
+// CHECK-LABEL: @test_viwdupq_n_u8(
+// CHECK-NEXT:
entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 [[A:%.*]], i32 [[B:%.*]], i32 4) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_viwdupq_n_u8(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u8(a, b, 4); +#else /* POLYMORPHIC */ + return viwdupq_n_u8(a, b, 4); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 [[A:%.*]], i32 [[B:%.*]], i32 2) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_viwdupq_n_u16(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u16(a, b, 2); +#else /* POLYMORPHIC */ + return viwdupq_n_u16(a, b, 2); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 8) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_viwdupq_n_u32(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u32(a, b, 8); +#else /* POLYMORPHIC */ + return viwdupq_n_u32(a, b, 8); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 [[A:%.*]], i32 [[B:%.*]], i32 4) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP1]] +// +uint8x16_t test_vdwdupq_n_u8(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u8(a, b, 4); +#else /* POLYMORPHIC */ + return vdwdupq_n_u8(a, b, 4); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 [[A:%.*]], i32 [[B:%.*]], i32 8) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP1]] +// +uint16x8_t test_vdwdupq_n_u16(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u16(a, b, 8); +#else /* POLYMORPHIC */ + return vdwdupq_n_u16(a, b, 8); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 [[A:%.*]], i32 [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP1:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP0]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP1]] +// +uint32x4_t test_vdwdupq_n_u32(uint32_t a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u32(a, b, 1); +#else /* POLYMORPHIC */ + return vdwdupq_n_u32(a, b, 1); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 [[TMP0]], i32 8) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vidupq_wb_u8(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vidupq_u8(a, 8); +#else /* POLYMORPHIC */ + return vidupq_wb_u8(a, 8); +#endif /* 
POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 [[TMP0]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vidupq_wb_u16(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vidupq_u16(a, 1); +#else /* POLYMORPHIC */ + return vidupq_wb_u16(a, 1); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 [[TMP0]], i32 4) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vidupq_wb_u32(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vidupq_u32(a, 4); +#else /* POLYMORPHIC */ + return vidupq_wb_u32(a, 4); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 [[TMP0]], i32 2) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vddupq_wb_u8(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vddupq_u8(a, 2); +#else /* POLYMORPHIC */ + return vddupq_wb_u8(a, 2); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 [[TMP0]], i32 8) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vddupq_wb_u16(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vddupq_u16(a, 8); +#else /* POLYMORPHIC */ + return vddupq_wb_u16(a, 8); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 [[TMP0]], i32 2) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vddupq_wb_u32(uint32_t *a) +{ +#ifdef POLYMORPHIC + return vddupq_u32(a, 2); +#else /* POLYMORPHIC */ + return vddupq_wb_u32(a, 2); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 [[TMP0]], i32 [[B:%.*]], i32 4) +// 
CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vdwdupq_wb_u8(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u8(a, b, 4); +#else /* POLYMORPHIC */ + return vdwdupq_wb_u8(a, b, 4); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 [[TMP0]], i32 [[B:%.*]], i32 4) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vdwdupq_wb_u16(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u16(a, b, 4); +#else /* POLYMORPHIC */ + return vdwdupq_wb_u16(a, b, 4); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 [[TMP0]], i32 [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_viwdupq_wb_u8(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u8(a, b, 1); +#else /* POLYMORPHIC */ + return viwdupq_wb_u8(a, b, 1); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 [[TMP0]], i32 [[B:%.*]], i32 1) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_viwdupq_wb_u16(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u16(a, b, 1); +#else /* POLYMORPHIC */ + return viwdupq_wb_u16(a, b, 1); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 [[TMP0]], i32 [[B:%.*]], i32 8) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 1 +// CHECK-NEXT: store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_viwdupq_wb_u32(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return viwdupq_u32(a, b, 8); +#else /* POLYMORPHIC */ + return viwdupq_wb_u32(a, b, 8); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 [[TMP0]], i32 [[B:%.*]], i32 2) +// CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 1 +// CHECK-NEXT: 
store i32 [[TMP2]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP1]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vdwdupq_wb_u32(uint32_t *a, uint32_t b) +{ +#ifdef POLYMORPHIC + return vdwdupq_u32(a, b, 2); +#else /* POLYMORPHIC */ + return vdwdupq_wb_u32(a, b, 2); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 8, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vidupq_m_n_u8(uint8x16_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vidupq_m_n_u8(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 8, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vidupq_m_n_u16(uint16x8_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vidupq_m_n_u16(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vidupq_m_n_u32(uint32x4_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_m_n_u32(inactive, a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 8, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vddupq_m_n_u8(uint8x16_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vddupq_m_n_u8(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } 
@llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 2, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vddupq_m_n_u16(uint16x8_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 2, p); +#else /* POLYMORPHIC */ + return vddupq_m_n_u16(inactive, a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 8, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vddupq_m_n_u32(uint32x4_t inactive, uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vddupq_m_n_u32(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 8, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_viwdupq_m_n_u8(uint8x16_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_m(inactive, a, b, 8, p); +#else /* POLYMORPHIC */ + return viwdupq_m_n_u8(inactive, a, b, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 8, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_viwdupq_m_n_u16(uint16x8_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_m(inactive, a, b, 8, p); +#else /* POLYMORPHIC */ + return viwdupq_m_n_u16(inactive, a, b, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 4, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_viwdupq_m_n_u32(uint32x4_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_m(inactive, a, b, 4, p); +#else /* POLYMORPHIC */ + return viwdupq_m_n_u32(inactive, a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_n_u8( 
+// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 1, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vdwdupq_m_n_u8(uint8x16_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 1, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_n_u8(inactive, a, b, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 2, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vdwdupq_m_n_u16(uint16x8_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 2, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_n_u16(inactive, a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[A:%.*]], i32 [[B:%.*]], i32 4, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vdwdupq_m_n_u32(uint32x4_t inactive, uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 4, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_n_u32(inactive, a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[TMP0]], i32 8, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vidupq_m_wb_u8(uint8x16_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vidupq_m_wb_u8(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } 
@llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[TMP0]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vidupq_m_wb_u16(uint16x8_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_m_wb_u16(inactive, a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_m_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[TMP0]], i32 8, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vidupq_m_wb_u32(uint32x4_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_m(inactive, a, 8, p); +#else /* POLYMORPHIC */ + return vidupq_m_wb_u32(inactive, a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vddupq_m_wb_u8(uint8x16_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 1, p); +#else /* POLYMORPHIC */ + return vddupq_m_wb_u8(inactive, a, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[TMP0]], i32 1, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vddupq_m_wb_u16(uint16x8_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 1, p); +#else /* POLYMORPHIC */ + return vddupq_m_wb_u16(inactive, a, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_m_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] 
= zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vddupq_m_wb_u32(uint32x4_t inactive, uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_m(inactive, a, 4, p); +#else /* POLYMORPHIC */ + return vddupq_m_wb_u32(inactive, a, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 8, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_viwdupq_m_wb_u8(uint8x16_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_m(inactive, a, b, 8, p); +#else /* POLYMORPHIC */ + return viwdupq_m_wb_u8(inactive, a, b, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 8, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_viwdupq_m_wb_u16(uint16x8_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_m(inactive, a, b, 8, p); +#else /* POLYMORPHIC */ + return viwdupq_m_wb_u16(inactive, a, b, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_m_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_viwdupq_m_wb_u32(uint32x4_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return 
viwdupq_m(inactive, a, b, 4, p); +#else /* POLYMORPHIC */ + return viwdupq_m_wb_u32(inactive, a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vdwdupq_m_wb_u8(uint8x16_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 1, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_wb_u8(inactive, a, b, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 4, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vdwdupq_m_wb_u16(uint16x8_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 4, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_wb_u16(inactive, a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_m_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> [[INACTIVE:%.*]], i32 [[TMP0]], i32 [[B:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vdwdupq_m_wb_u32(uint32x4_t inactive, uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_m(inactive, a, b, 4, p); +#else /* POLYMORPHIC */ + return vdwdupq_m_wb_u32(inactive, a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[A:%.*]], i32 2, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t 
test_vidupq_x_n_u8(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u8(a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_x_n_u8(a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[A:%.*]], i32 2, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vidupq_x_n_u16(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u16(a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_x_n_u16(a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[A:%.*]], i32 8, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vidupq_x_n_u32(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u32(a, 8, p); +#else /* POLYMORPHIC */ + return vidupq_x_n_u32(a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[A:%.*]], i32 8, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vddupq_x_n_u8(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u8(a, 8, p); +#else /* POLYMORPHIC */ + return vddupq_x_n_u8(a, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[A:%.*]], i32 4, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vddupq_x_n_u16(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u16(a, 4, p); +#else /* POLYMORPHIC */ + return vddupq_x_n_u16(a, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[A:%.*]], i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vddupq_x_n_u32(uint32_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u32(a, 2, p); +#else /* POLYMORPHIC */ + return 
vddupq_x_n_u32(a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 2, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_viwdupq_x_n_u8(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u8(a, b, 2, p); +#else /* POLYMORPHIC */ + return viwdupq_x_n_u8(a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 4, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_viwdupq_x_n_u16(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u16(a, b, 4, p); +#else /* POLYMORPHIC */ + return viwdupq_x_n_u16(a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 2, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_viwdupq_x_n_u32(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u32(a, b, 2, p); +#else /* POLYMORPHIC */ + return viwdupq_x_n_u32(a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_n_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 2, <16 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP3]] +// +uint8x16_t test_vdwdupq_x_n_u8(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_x_u8(a, b, 2, p); +#else /* POLYMORPHIC */ + return vdwdupq_x_n_u8(a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 2, <8 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP3]] +// +uint16x8_t test_vdwdupq_x_n_u16(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return 
vdwdupq_x_u16(a, b, 2, p); +#else /* POLYMORPHIC */ + return vdwdupq_x_n_u16(a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[A:%.*]], i32 [[B:%.*]], i32 8, <4 x i1> [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP2]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP3]] +// +uint32x4_t test_vdwdupq_x_n_u32(uint32_t a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_x_u32(a, b, 8, p); +#else /* POLYMORPHIC */ + return vdwdupq_x_n_u32(a, b, 8, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[TMP0]], i32 2, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vidupq_x_wb_u8(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u8(a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_x_wb_u8(a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[TMP0]], i32 4, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vidupq_x_wb_u16(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u16(a, 4, p); +#else /* POLYMORPHIC */ + return vidupq_x_wb_u16(a, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vidupq_x_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[TMP0]], i32 2, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vidupq_x_wb_u32(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vidupq_x_u32(a, 2, p); +#else /* POLYMORPHIC */ + return vidupq_x_wb_u32(a, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_wb_u8( +// CHECK-NEXT: entry: 
+// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[TMP0]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vddupq_x_wb_u8(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u8(a, 1, p); +#else /* POLYMORPHIC */ + return vddupq_x_wb_u8(a, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[TMP0]], i32 4, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vddupq_x_wb_u16(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u16(a, 4, p); +#else /* POLYMORPHIC */ + return vddupq_x_wb_u16(a, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vddupq_x_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[TMP0]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vddupq_x_wb_u32(uint32_t *a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vddupq_x_u32(a, 4, p); +#else /* POLYMORPHIC */ + return vddupq_x_wb_u32(a, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 1, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_viwdupq_x_wb_u8(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u8(a, b, 1, p); +#else /* POLYMORPHIC */ + return viwdupq_x_wb_u8(a, b, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_wb_u16( +// 
CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 2, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_viwdupq_x_wb_u16(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u16(a, b, 2, p); +#else /* POLYMORPHIC */ + return viwdupq_x_wb_u16(a, b, 2, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_viwdupq_x_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 1, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_viwdupq_x_wb_u32(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return viwdupq_x_u32(a, b, 1, p); +#else /* POLYMORPHIC */ + return viwdupq_x_wb_u32(a, b, 1, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_wb_u8( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 4, <16 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <16 x i8>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <16 x i8> [[TMP5]] +// +uint8x16_t test_vdwdupq_x_wb_u8(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_x_u8(a, b, 4, p); +#else /* POLYMORPHIC */ + return vdwdupq_x_wb_u8(a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_wb_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 4, <8 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <8 x i16>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <8 x i16> [[TMP5]] +// +uint16x8_t test_vdwdupq_x_wb_u16(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_x_u16(a, b, 4, p); +#else /* 
POLYMORPHIC */ + return vdwdupq_x_wb_u16(a, b, 4, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vdwdupq_x_wb_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = load i32, i32* [[A:%.*]], align 4 +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> undef, i32 [[TMP0]], i32 [[B:%.*]], i32 4, <4 x i1> [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 1 +// CHECK-NEXT: store i32 [[TMP4]], i32* [[A]], align 4 +// CHECK-NEXT: [[TMP5:%.*]] = extractvalue { <4 x i32>, i32 } [[TMP3]], 0 +// CHECK-NEXT: ret <4 x i32> [[TMP5]] +// +uint32x4_t test_vdwdupq_x_wb_u32(uint32_t *a, uint32_t b, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vdwdupq_x_u32(a, b, 4, p); +#else /* POLYMORPHIC */ + return vdwdupq_x_wb_u32(a, b, 4, p); +#endif /* POLYMORPHIC */ +} diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c --- a/clang/test/Sema/arm-mve-immediates.c +++ b/clang/test/Sema/arm-mve-immediates.c @@ -273,3 +273,20 @@ w = vmvnq_n_u32(0x0001FFFF); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}} w = vmvnq_n_u32(0x000001FF); } + +void test_vidup(void) +{ + vidupq_n_u16(0x12345678, 1); + vidupq_n_u16(0x12345678, 2); + vidupq_n_u16(0x12345678, 4); + vidupq_n_u16(0x12345678, 8); + + vidupq_n_u16(0x12345678, 0); // expected-error {{argument value 0 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, 16); // expected-error {{argument value 16 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, -1); // expected-error {{argument value -1 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, -2); // expected-error {{argument value -2 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, -4); // expected-error {{argument value -4 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, -8); // expected-error {{argument value -8 is outside the valid range [1, 8]}} + vidupq_n_u16(0x12345678, 3); // expected-error {{argument should be a power of 2}} + vidupq_n_u16(0x12345678, 7); // expected-error {{argument should be a power of 2}} +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -501,8 +501,17 @@ } void setPredecessor(Ptr p) { - assert(!Predecessor); - Predecessor = p; + // If the user has nested one 'seq' node inside another, and this + // method is called on the return value of the inner 'seq' (i.e. + // the final item inside it), then we can't link _this_ node to p, + // because it already has a predecessor. Instead, walk the chain + // until we find the first item in the inner seq, and link that to + // p, so that nesting seqs has the obvious effect of linking + // everything together into one long sequential chain. 
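+  // For example, in (seq X, (seq A, B)), this method is called on B (the
+  // inner seq's return value) with p = X. B already has predecessor A, so
+  // we walk back and make X the predecessor of A instead, giving the
+  // single chain X -> A -> B.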
+    Result *r = this;
+    while (r->Predecessor)
+      r = r->Predecessor.get();
+    r->Predecessor = p;
   }
 
   // Each Result will be assigned a variable name in the output code, but not
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1132,4 +1132,22 @@
    [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
     llvm_anyvector_ty, LLVMMatchType<0>],
    llvm_anyvector_ty>;
+
+defm int_arm_mve_vidup: MVEMXPredicated<
+   [llvm_anyvector_ty /* output */, llvm_i32_ty /* written-back base */], [],
+   [llvm_i32_ty /* base */, llvm_i32_ty /* step */],
+   LLVMMatchType<0>, llvm_anyvector_ty>;
+defm int_arm_mve_vddup: MVEMXPredicated<
+   [llvm_anyvector_ty /* output */, llvm_i32_ty /* written-back base */], [],
+   [llvm_i32_ty /* base */, llvm_i32_ty /* step */],
+   LLVMMatchType<0>, llvm_anyvector_ty>;
+defm int_arm_mve_viwdup: MVEMXPredicated<
+   [llvm_anyvector_ty /* output */, llvm_i32_ty /* written-back base */], [],
+   [llvm_i32_ty /* base */, llvm_i32_ty /* limit */, llvm_i32_ty /* step */],
+   LLVMMatchType<0>, llvm_anyvector_ty>;
+defm int_arm_mve_vdwdup: MVEMXPredicated<
+   [llvm_anyvector_ty /* output */, llvm_i32_ty /* written-back base */], [],
+   [llvm_i32_ty /* base */, llvm_i32_ty /* limit */, llvm_i32_ty /* step */],
+   LLVMMatchType<0>, llvm_anyvector_ty>;
+
 } // end TargetPrefix
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -268,6 +268,11 @@
   void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
                      const uint16_t *const *Opcodes, bool HasWriteback);
 
+  /// SelectMVE_VxDUP - Select MVE incrementing- or decrementing-dup
+  /// instructions (VIDUP, VDDUP, VIWDUP, VDWDUP). Opcodes is an array of
+  /// 3 elements for the 8, 16 and 32-bit lane sizes.
+  void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+                       bool Wrapping, bool Predicated);
+
   /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
   /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
   /// for loading D registers.
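The four defm blocks above expose each operation as a pair of LLVM intrinsics (plain and .predicated), both returning the result vector together with the written-back base; that single definition is what serves the plain, writeback and predicated source-level forms alike. As a quick orientation, here is a minimal C sketch of the plain-versus-writeback distinction, using ACLE intrinsic names exercised by this patch's tests; the helper names and the lane values in the comments are illustrative only:

    #include <arm_mve.h>

    /* Plain form: lanes are base, base+4, ..., base+60; the updated base
     * (base + 16*4 for sixteen u8 lanes) is computed but discarded. */
    uint8x16_t ramp_once(uint32_t base)
    {
        return vidupq_n_u8(base, 4);
    }

    /* Writeback form: the updated base is stored back through the pointer,
     * so successive calls continue the sequence. This is the source-level
     * shape of the load/extractvalue/store pattern in the IR tests below. */
    uint8x16_t ramp_next(uint32_t *base)
    {
        return vidupq_wb_u8(base, 4);
    }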
@@ -2729,6 +2734,49 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+                                      bool Wrapping, bool Predicated) {
+  EVT VT = N->getValueType(0);
+  SDLoc Loc(N);
+
+  uint16_t Opcode;
+  switch (VT.getVectorElementType().getSizeInBits()) {
+  case 8:
+    Opcode = Opcodes[0];
+    break;
+  case 16:
+    Opcode = Opcodes[1];
+    break;
+  case 32:
+    Opcode = Opcodes[2];
+    break;
+  default:
+    llvm_unreachable("bad vector element size in SelectMVE_VxDUP");
+  }
+
+  SmallVector<SDValue, 8> Ops;
+  unsigned OpIdx = 1; // operand 0 is the intrinsic ID
+
+  SDValue Inactive;
+  if (Predicated)
+    Inactive = N->getOperand(OpIdx++);
+
+  Ops.push_back(N->getOperand(OpIdx++));   // base
+  if (Wrapping)
+    Ops.push_back(N->getOperand(OpIdx++)); // limit
+
+  SDValue ImmOp = N->getOperand(OpIdx++);  // step
+  int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue();
+  Ops.push_back(getI32Imm(ImmValue, Loc));
+
+  if (Predicated)
+    AddMVEPredicateToOps(Ops, Loc, N->getOperand(OpIdx), Inactive);
+  else
+    AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0));
+
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
 void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
                                    bool isUpdating, unsigned NumVecs,
                                    const uint16_t *DOpcodes,
@@ -4617,6 +4665,46 @@
                    OpcodesS, OpcodesU);
     return;
   }
+
+  case Intrinsic::arm_mve_vidup:
+  case Intrinsic::arm_mve_vidup_predicated: {
+    static const uint16_t Opcodes[] = {
+        ARM::MVE_VIDUPu8, ARM::MVE_VIDUPu16, ARM::MVE_VIDUPu32,
+    };
+    SelectMVE_VxDUP(N, Opcodes, false,
+                    IntNo == Intrinsic::arm_mve_vidup_predicated);
+    return;
+  }
+
+  case Intrinsic::arm_mve_vddup:
+  case Intrinsic::arm_mve_vddup_predicated: {
+    static const uint16_t Opcodes[] = {
+        ARM::MVE_VDDUPu8, ARM::MVE_VDDUPu16, ARM::MVE_VDDUPu32,
+    };
+    SelectMVE_VxDUP(N, Opcodes, false,
+                    IntNo == Intrinsic::arm_mve_vddup_predicated);
+    return;
+  }
+
+  case Intrinsic::arm_mve_viwdup:
+  case Intrinsic::arm_mve_viwdup_predicated: {
+    static const uint16_t Opcodes[] = {
+        ARM::MVE_VIWDUPu8, ARM::MVE_VIWDUPu16, ARM::MVE_VIWDUPu32,
+    };
+    SelectMVE_VxDUP(N, Opcodes, true,
+                    IntNo == Intrinsic::arm_mve_viwdup_predicated);
+    return;
+  }
+
+  case Intrinsic::arm_mve_vdwdup:
+  case Intrinsic::arm_mve_vdwdup_predicated: {
+    static const uint16_t Opcodes[] = {
+        ARM::MVE_VDWDUPu8, ARM::MVE_VDWDUPu16, ARM::MVE_VDWDUPu32,
+    };
+    SelectMVE_VxDUP(N, Opcodes, true,
+                    IntNo == Intrinsic::arm_mve_vdwdup_predicated);
+    return;
+  }
   }
   break;
 }
diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/idup.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/idup.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/idup.ll
@@ -0,0 +1,775 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s
+
+define arm_aapcs_vfpcc <16 x i8> @test_vidupq_n_u8(i32 %a) {
+; CHECK-LABEL: test_vidupq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vidup.u8 q0, r0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %a, i32 4)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vidupq_n_u16(i32 %a) {
+; CHECK-LABEL: test_vidupq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vidup.u16 q0, r0, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %a, i32 1)
+  %1 = extractvalue { <8 x i16>, i32 } %0, 0
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vidupq_n_u32(i32 %a) {
+; CHECK-LABEL: test_vidupq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vidup.u32 q0, r0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %a, i32 4)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vddupq_n_u8(i32 %a) {
+; CHECK-LABEL: test_vddupq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vddup.u8 q0, r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %a, i32 2)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vddupq_n_u16(i32 %a) {
+; CHECK-LABEL: test_vddupq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vddup.u16 q0, r0, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %a, i32 4)
+  %1 = extractvalue { <8 x i16>, i32 } %0, 0
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vddupq_n_u32(i32 %a) {
+; CHECK-LABEL: test_vddupq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vddup.u32 q0, r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %a, i32 2)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_n_u8(i32 %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    viwdup.u8 q0, r0, r1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %a, i32 %b, i32 4)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_n_u16(i32 %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    viwdup.u16 q0, r0, r1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %a, i32 %b, i32 2)
+  %1 = extractvalue { <8 x i16>, i32 } %0, 0
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_n_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    viwdup.u32 q0, r0, r1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %a, i32 %b, i32 8)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_n_u8(i32 %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdwdup.u8 q0, r0, r1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %a, i32 %b, i32 4)
+  %1 = extractvalue { <16 x i8>, i32 } %0, 0
+  ret <16 x i8> %1
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_n_u16(i32 %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdwdup.u16 q0, r0, r1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %a, i32 %b, i32 8)
+  %1 = extractvalue { <8 x i16>, i32 } %0, 0
+  ret <8 x i16> %1
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_n_u32(i32 %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vdwdup.u32 q0, r0, r1, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %a, i32 %b, i32 1)
+  %1 = extractvalue { <4 x i32>, i32 } %0, 0
+  ret <4 x i32> %1
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vidupq_wb_u8(i32* nocapture %a) {
+; CHECK-LABEL: test_vidupq_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vidup.u8 q0, r2, #8
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32 %0, i32 8)
+  %2 = extractvalue { <16 x i8>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <16 x i8>, i32 } %1, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vidupq_wb_u16(i32* nocapture %a) {
+; CHECK-LABEL: test_vidupq_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vidup.u16 q0, r2, #1
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32 %0, i32 1)
+  %2 = extractvalue { <8 x i16>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <8 x i16>, i32 } %1, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vidupq_wb_u32(i32* nocapture %a) {
+; CHECK-LABEL: test_vidupq_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vidup.u32 q0, r2, #4
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32 %0, i32 4)
+  %2 = extractvalue { <4 x i32>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <4 x i32>, i32 } %1, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vddupq_wb_u8(i32* nocapture %a) {
+; CHECK-LABEL: test_vddupq_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vddup.u8 q0, r2, #2
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32 %0, i32 2)
+  %2 = extractvalue { <16 x i8>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <16 x i8>, i32 } %1, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vddupq_wb_u16(i32* nocapture %a) {
+; CHECK-LABEL: test_vddupq_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vddup.u16 q0, r2, #8
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32 %0, i32 8)
+  %2 = extractvalue { <8 x i16>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <8 x i16>, i32 } %1, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vddupq_wb_u32(i32* nocapture %a) {
+; CHECK-LABEL: test_vddupq_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vddup.u32 q0, r2, #2
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32 %0, i32 2)
+  %2 = extractvalue { <4 x i32>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <4 x i32>, i32 } %1, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_wb_u8(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vdwdup.u8 q0, r2, r1, #4
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32 %0, i32 %b, i32 4)
+  %2 = extractvalue { <16 x i8>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <16 x i8>, i32 } %1, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_wb_u16(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vdwdup.u16 q0, r2, r1, #4
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32 %0, i32 %b, i32 4)
+  %2 = extractvalue { <8 x i16>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <8 x i16>, i32 } %1, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_wb_u8(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    viwdup.u8 q0, r2, r1, #1
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32 %0, i32 %b, i32 1)
+  %2 = extractvalue { <16 x i8>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <16 x i8>, i32 } %1, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_wb_u16(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    viwdup.u16 q0, r2, r1, #1
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32 %0, i32 %b, i32 1)
+  %2 = extractvalue { <8 x i16>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <8 x i16>, i32 } %1, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_wb_u32(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_viwdupq_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    viwdup.u32 q0, r2, r1, #8
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32 %0, i32 %b, i32 8)
+  %2 = extractvalue { <4 x i32>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <4 x i32>, i32 } %1, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_wb_u32(i32* nocapture %a, i32 %b) {
+; CHECK-LABEL: test_vdwdupq_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vdwdup.u32 q0, r2, r1, #2
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32 %0, i32 %b, i32 2)
+  %2 = extractvalue { <4 x i32>, i32 } %1, 1
+  store i32 %2, i32* %a, align 4
+  %3 = extractvalue { <4 x i32>, i32 } %1, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u8 q0, r0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
+  %3 = extractvalue { <16 x i8>, i32 } %2, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u16 q0, r0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 8, <8 x i1> %1)
+  %3 = extractvalue { <8 x i16>, i32 } %2, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u32 q0, r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 2, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_n_u8(<16 x i8> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u8 q0, r0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 8, <16 x i1> %1)
+  %3 = extractvalue { <16 x i8>, i32 } %2, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_n_u16(<8 x i16> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u16 q0, r0, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 2, <8 x i1> %1)
+  %3 = extractvalue { <8 x i16>, i32 } %2, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_n_u32(<4 x i32> %inactive, i32 %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u32 q0, r0, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 8, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u8 q0, r0, r1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 8, <16 x i1> %1)
+  %3 = extractvalue { <16 x i8>, i32 } %2, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u16 q0, r0, r1, #8
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 8, <8 x i1> %1)
+  %3 = extractvalue { <8 x i16>, i32 } %2, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u32 q0, r0, r1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_n_u8(<16 x i8> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_n_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u8 q0, r0, r1, #1
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0)
+  %2 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %a, i32 %b, i32 1, <16 x i1> %1)
+  %3 = extractvalue { <16 x i8>, i32 } %2, 0
+  ret <16 x i8> %3
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_n_u16(<8 x i16> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_n_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u16 q0, r0, r1, #2
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0)
+  %2 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %a, i32 %b, i32 2, <8 x i1> %1)
+  %3 = extractvalue { <8 x i16>, i32 } %2, 0
+  ret <8 x i16> %3
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_n_u32(<4 x i32> %inactive, i32 %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_n_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u32 q0, r0, r1, #4
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = zext i16 %p to i32
+  %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0)
+  %2 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %a, i32 %b, i32 4, <4 x i1> %1)
+  %3 = extractvalue { <4 x i32>, i32 } %2, 0
+  ret <4 x i32> %3
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vidupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u8 q0, r2, #8
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 8, <16 x i1> %2)
+  %4 = extractvalue { <16 x i8>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <16 x i8>, i32 } %3, 0
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vidupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u16 q0, r2, #2
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 2, <8 x i1> %2)
+  %4 = extractvalue { <8 x i16>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <8 x i16>, i32 } %3, 0
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vidupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vidupq_m_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vidupt.u32 q0, r2, #8
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 8, <4 x i1> %2)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <4 x i32>, i32 } %3, 0
+  ret <4 x i32> %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vddupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u8 q0, r2, #1
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 1, <16 x i1> %2)
+  %4 = extractvalue { <16 x i8>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <16 x i8>, i32 } %3, 0
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vddupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u16 q0, r2, #1
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 1, <8 x i1> %2)
+  %4 = extractvalue { <8 x i16>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <8 x i16>, i32 } %3, 0
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vddupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i16 zeroext %p) {
+; CHECK-LABEL: test_vddupq_m_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr r2, [r0]
+; CHECK-NEXT:    vmsr p0, r1
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vddupt.u32 q0, r2, #4
+; CHECK-NEXT:    str r2, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 4, <4 x i1> %2)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <4 x i32>, i32 } %3, 0
+  ret <4 x i32> %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_viwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u8 q0, r12, r1, #8
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 8, <16 x i1> %2)
+  %4 = extractvalue { <16 x i8>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <16 x i8>, i32 } %3, 0
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_viwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u16 q0, r12, r1, #8
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 8, <8 x i1> %2)
+  %4 = extractvalue { <8 x i16>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <8 x i16>, i32 } %3, 0
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_viwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_viwdupq_m_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    viwdupt.u32 q0, r12, r1, #4
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <4 x i32>, i32 } %3, 0
+  ret <4 x i32> %5
+}
+
+define arm_aapcs_vfpcc <16 x i8> @test_vdwdupq_m_wb_u8(<16 x i8> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_wb_u8:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u8 q0, r12, r1, #1
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %1)
+  %3 = tail call { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8> %inactive, i32 %0, i32 %b, i32 1, <16 x i1> %2)
+  %4 = extractvalue { <16 x i8>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <16 x i8>, i32 } %3, 0
+  ret <16 x i8> %5
+}
+
+define arm_aapcs_vfpcc <8 x i16> @test_vdwdupq_m_wb_u16(<8 x i16> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_wb_u16:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u16 q0, r12, r1, #4
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %1)
+  %3 = tail call { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16> %inactive, i32 %0, i32 %b, i32 4, <8 x i1> %2)
+  %4 = extractvalue { <8 x i16>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <8 x i16>, i32 } %3, 0
+  ret <8 x i16> %5
+}
+
+define arm_aapcs_vfpcc <4 x i32> @test_vdwdupq_m_wb_u32(<4 x i32> %inactive, i32* nocapture %a, i32 %b, i16 zeroext %p) {
+; CHECK-LABEL: test_vdwdupq_m_wb_u32:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    ldr.w r12, [r0]
+; CHECK-NEXT:    vmsr p0, r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vdwdupt.u32 q0, r12, r1, #4
+; CHECK-NEXT:    str.w r12, [r0]
+; CHECK-NEXT:    bx lr
+entry:
+  %0 = load i32, i32* %a, align 4
+  %1 = zext i16 %p to i32
+  %2 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1)
+  %3 = tail call { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32> %inactive, i32 %0, i32 %b, i32 4, <4 x i1> %2)
+  %4 = extractvalue { <4 x i32>, i32 } %3, 1
+  store i32 %4, i32* %a, align 4
+  %5 = extractvalue { <4 x i32>, i32 } %3, 0
+  ret <4 x i32> %5
+}
+
+declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32)
+declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32)
+declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32)
+
+declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.v16i8(i32, i32)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.v8i16(i32, i32)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.v4i32(i32, i32)
+declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.v16i8(i32, i32)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.v8i16(i32, i32)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.v4i32(i32, i32)
+declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.v16i8(i32, i32, i32)
+declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.v8i16(i32, i32, i32)
+declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.v4i32(i32, i32, i32)
+declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.v16i8(i32, i32, i32)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.v8i16(i32, i32, i32)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.v4i32(i32, i32, i32)
+declare { <16 x i8>, i32 } @llvm.arm.mve.vidup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vidup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vidup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
+declare { <16 x i8>, i32 } @llvm.arm.mve.vddup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, <16 x i1>)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vddup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, <8 x i1>)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vddup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, <4 x i1>)
+declare { <16 x i8>, i32 } @llvm.arm.mve.viwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
+declare { <8 x i16>, i32 } @llvm.arm.mve.viwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
+declare { <4 x i32>, i32 } @llvm.arm.mve.viwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
+declare { <16 x i8>, i32 } @llvm.arm.mve.vdwdup.predicated.v16i8.v16i1(<16 x i8>, i32, i32, i32, <16 x i1>)
+declare { <8 x i16>, i32 } @llvm.arm.mve.vdwdup.predicated.v8i16.v8i1(<8 x i16>, i32, i32, i32, <8 x i1>)
+declare { <4 x i32>, i32 } @llvm.arm.mve.vdwdup.predicated.v4i32.v4i1(<4 x i32>, i32, i32, i32, <4 x i1>)
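To close the loop on the predicated writeback forms checked above, here is a hedged C sketch of their intended use; the helper name and the choice of inactive value are this sketch's assumptions, not part of the patch:

    #include <arm_mve.h>

    /* Produce the next 16 byte offsets *offset, *offset+1, ..., wrapping
     * back to 0 on reaching 'limit', under predicate p. Because the _wb
     * form writes the updated base back, *offset advances automatically
     * for the following call. Inactive lanes here come from vdupq_n_u8(0);
     * a real kernel would more likely carry the previous vector through. */
    uint8x16_t next_wrapped_indices(uint32_t *offset, uint32_t limit,
                                    mve_pred16_t p)
    {
        return viwdupq_m_wb_u8(vdupq_n_u8(0), offset, limit, 1, p);
    }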