diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -116,6 +116,28 @@ NameOverride<"vmulq">; } +let params = !listconcat(T.Int16, T.Int32) in { + let pnt = PNT_None in { + def vmvnq_n: Intrinsic; + } + defm vmvnq: IntrinsicMX; + let pnt = PNT_NType in { + def vbicq_n: Intrinsic; + def vorrq_n: Intrinsic; + } + def vbicq_m_n: Intrinsic< + Vector, (args Vector:$v, imm_simd_restrictive:$imm, Predicate:$pred), + (IRInt<"bic_imm_predicated", [Vector, Predicate]> $v, (u32 $imm), $pred)>; + def vorrq_m_n: Intrinsic< + Vector, (args Vector:$v, imm_simd_restrictive:$imm, Predicate:$pred), + (IRInt<"orr_imm_predicated", [Vector, Predicate]> $v, (u32 $imm), $pred)>; +} + // The bitcasting below is not overcomplicating the IR because while // Vector and UVector may be different vector types at the C level i.e. // vectors of same size signed/unsigned ints. Once they're lowered diff --git a/clang/include/clang/Basic/arm_mve_defs.td b/clang/include/clang/Basic/arm_mve_defs.td --- a/clang/include/clang/Basic/arm_mve_defs.td +++ b/clang/include/clang/Basic/arm_mve_defs.td @@ -319,6 +319,7 @@ int base = base_; Type type = type_; } +def IB_ExtraArg_LaneSize; // ----------------------------------------------------------------------------- // End-user definitions for immediate arguments. @@ -327,11 +328,13 @@ // intrinsics like vmvnq or vorrq. imm_simd_restrictive has to be an 8-bit // value shifted left by a whole number of bytes; imm_simd_vmvn can also be of // the form 0xXXFF for some byte value XX. -def imm_simd_restrictive : Immediate { +def imm_simd_restrictive : Immediate { let extra = "ShiftedByte"; + let extraarg = "!lanesize"; } -def imm_simd_vmvn : Immediate { +def imm_simd_vmvn : Immediate { let extra = "ShiftedByteOrXXFF"; + let extraarg = "!lanesize"; } // imm_1toN can take any value from 1 to N inclusive, where N is the number of @@ -457,26 +460,31 @@ // A wrapper to define both _m and _x versions of a predicated // intrinsic. +// +// We provide optional parameters to override the polymorphic name +// types separately for the _m and _x variants, because sometimes they +// polymorph differently (typically because the type of the inactive +// parameter can be used as a disambiguator if it's present). multiclass IntrinsicMX { // The _m variant takes an initial parameter called $inactive, which // provides the input value of the output register, i.e. all the // inactive lanes in the predicated operation take their values from // this. def "_m" # nameSuffix: - Intrinsic; + Intrinsic { + let pnt = pnt_m; + } foreach unusedVar = !if(!eq(wantXVariant, 1), [1], []) in { // The _x variant leaves off that parameter, and simply uses an // undef value of the same type. + def "_x" # nameSuffix: - Intrinsic { - // Allow overriding of the polymorphic name type, because - // sometimes the _m and _x variants polymorph differently - // (typically because the type of the inactive parameter can be - // used as a disambiguator if it's present). 
+ Intrinsic { let pnt = pnt_x; } } diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -11601,8 +11601,10 @@ bool SemaBuiltinConstantArgMultiple(CallExpr *TheCall, int ArgNum, unsigned Multiple); bool SemaBuiltinConstantArgPower2(CallExpr *TheCall, int ArgNum); - bool SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum); - bool SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum); + bool SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum, + unsigned ArgBits); + bool SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, int ArgNum, + unsigned ArgBits); bool SemaBuiltinARMSpecialReg(unsigned BuiltinID, CallExpr *TheCall, int ArgNum, unsigned ExpectedFieldNum, bool AllowName); diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp --- a/clang/lib/Sema/SemaChecking.cpp +++ b/clang/lib/Sema/SemaChecking.cpp @@ -6412,7 +6412,8 @@ /// SemaBuiltinConstantArgShiftedByte - Check if argument ArgNum of TheCall is /// a constant expression representing an arbitrary byte value shifted left by /// a multiple of 8 bits. -bool Sema::SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum) { +bool Sema::SemaBuiltinConstantArgShiftedByte(CallExpr *TheCall, int ArgNum, + unsigned ArgBits) { llvm::APSInt Result; // We can't check the value of a dependent argument. @@ -6424,6 +6425,10 @@ if (SemaBuiltinConstantArg(TheCall, ArgNum, Result)) return true; + // Truncate to the given size. + Result = Result.getLoBits(ArgBits); + Result.setIsUnsigned(true); + if (IsShiftedByte(Result)) return false; @@ -6437,7 +6442,8 @@ /// 0x00FF, 0x01FF, ..., 0xFFFF). This strange range check is needed for some /// Arm MVE intrinsics. bool Sema::SemaBuiltinConstantArgShiftedByteOrXXFF(CallExpr *TheCall, - int ArgNum) { + int ArgNum, + unsigned ArgBits) { llvm::APSInt Result; // We can't check the value of a dependent argument. @@ -6449,6 +6455,10 @@ if (SemaBuiltinConstantArg(TheCall, ArgNum, Result)) return true; + // Truncate to the given size. + Result = Result.getLoBits(ArgBits); + Result.setIsUnsigned(true); + // Check to see if it's in either of the required forms. 
if (IsShiftedByte(Result) || (Result > 0 && Result < 0x10000 && (Result & 0xFF) == 0xFF)) diff --git a/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c b/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c new file mode 100644 --- /dev/null +++ b/clang/test/CodeGen/arm-mve-intrinsics/bitwise-imm.c @@ -0,0 +1,394 @@ +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s +// RUN: %clang_cc1 -triple thumbv8.1m.main-arm-none-eabi -target-feature +mve.fp -mfloat-abi hard -fallow-half-arguments-and-returns -O0 -disable-O0-optnone -DPOLYMORPHIC -S -emit-llvm -o - %s | opt -S -mem2reg | FileCheck %s + +#include + +// CHECK-LABEL: @test_vbicq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A:%.*]], +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vbicq_n_s16(int16x8_t a) +{ +#ifdef POLYMORPHIC + return vbicq(a, 0xd500); +#else /* POLYMORPHIC */ + return vbicq_n_s16(a, 0xd500); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vbicq_n_s32(int32x4_t a) +{ +#ifdef POLYMORPHIC + return vbicq(a, 0xfb); +#else /* POLYMORPHIC */ + return vbicq_n_s32(a, 0xfb); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and <8 x i16> [[A:%.*]], +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vbicq_n_u16(uint16x8_t a) +{ +#ifdef POLYMORPHIC + return vbicq(a, 0xf2); +#else /* POLYMORPHIC */ + return vbicq_n_u16(a, 0xf2); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = and <4 x i32> [[A:%.*]], +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vbicq_n_u32(uint32x4_t a) +{ +#ifdef POLYMORPHIC + return vbicq(a, 0x2000); +#else /* POLYMORPHIC */ + return vbicq_n_u32(a, 0x2000); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = or <8 x i16> [[A:%.*]], +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +int16x8_t test_vorrq_n_s16(int16x8_t a) +{ +#ifdef POLYMORPHIC + return vorrq(a, 0xc3); +#else /* POLYMORPHIC */ + return vorrq_n_s16(a, 0xc3); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +int32x4_t test_vorrq_n_s32(int32x4_t a) +{ +#ifdef POLYMORPHIC + return vorrq(a, 0x10000); +#else /* POLYMORPHIC */ + return vorrq_n_s32(a, 0x10000); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = or <8 x i16> [[A:%.*]], +// CHECK-NEXT: ret <8 x i16> [[TMP0]] +// +uint16x8_t test_vorrq_n_u16(uint16x8_t a) +{ +#ifdef POLYMORPHIC + return vorrq(a, 0xf000); +#else /* POLYMORPHIC */ + return vorrq_n_u16(a, 0xf000); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = or <4 x i32> [[A:%.*]], +// CHECK-NEXT: ret <4 x i32> [[TMP0]] +// +uint32x4_t test_vorrq_n_u32(uint32x4_t a) +{ +#ifdef POLYMORPHIC + return vorrq(a, 0x890000); +#else /* POLYMORPHIC */ + return vorrq_n_u32(a, 0x890000); +#endif /* POLYMORPHIC */ +} + 
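The constants used in these tests are exactly the ones the new Sema checks admit: after truncating the immediate to the lane width, it must be an 8-bit value shifted left by a whole number of bytes (vmvnq additionally accepts the 0x??FF forms). A minimal standalone sketch of that rule, using illustrative helper names rather than the actual Sema entry points:

  #include <cstdint>

  // Accepts 0 and any single byte shifted left by a whole number of
  // bytes within a lane of LaneBits bits (16 or 32 here).
  static bool isShiftedByteImm(uint64_t V, unsigned LaneBits) {
    V &= (1ULL << LaneBits) - 1; // mirrors the getLoBits() truncation in Sema
    for (unsigned Shift = 0; Shift < LaneBits; Shift += 8)
      if ((V & ~(0xFFULL << Shift)) == 0)
        return true;
    return false;
  }

  // vmvnq_n additionally accepts 0x00FF..0xFFFF with the low byte all ones.
  static bool isShiftedByteOrXXFFImm(uint64_t V, unsigned LaneBits) {
    V &= (1ULL << LaneBits) - 1;
    return isShiftedByteImm(V, LaneBits) ||
           (V > 0 && V < 0x10000 && (V & 0xFF) == 0xFF);
  }

For example, isShiftedByteImm(0x0101, 16) is false, which is why the 0x0101 cases in the Sema test further down are diagnosed, while isShiftedByteOrXXFFImm(0x01FF, 16) is true and vmvnq_n_u16(0x01FF) is accepted.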
+// CHECK-LABEL: @test_vmvnq_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret <8 x i16> +// +int16x8_t test_vmvnq_n_s16() +{ + return vmvnq_n_s16(0x9500); +} + +// CHECK-LABEL: @test_vmvnq_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret <4 x i32> +// +int32x4_t test_vmvnq_n_s32() +{ + return vmvnq_n_s32(0x550000); +} + +// CHECK-LABEL: @test_vmvnq_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret <8 x i16> +// +uint16x8_t test_vmvnq_n_u16() +{ + return vmvnq_n_u16(0x4900); +} + +// CHECK-LABEL: @test_vmvnq_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: ret <4 x i32> +// +uint32x4_t test_vmvnq_n_u32() +{ + return vmvnq_n_u32(0xc3000000); +} + +// CHECK-LABEL: @test_vbicq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 11264, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vbicq_m_n_s16(int16x8_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vbicq_m_n(a, 0x2c00, p); +#else /* POLYMORPHIC */ + return vbicq_m_n_s16(a, 0x2c00, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 13893632, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vbicq_m_n_s32(int32x4_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vbicq_m_n(a, 0xd40000, p); +#else /* POLYMORPHIC */ + return vbicq_m_n_s32(a, 0xd40000, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 36, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vbicq_m_n_u16(uint16x8_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vbicq_m_n(a, 0x24, p); +#else /* POLYMORPHIC */ + return vbicq_m_n_u16(a, 0x24, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vbicq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 1644167168, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vbicq_m_n_u32(uint32x4_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vbicq_m_n(a, 0x62000000, p); +#else /* POLYMORPHIC */ + return vbicq_m_n_u32(a, 0x62000000, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 13568, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vorrq_m_n_s16(int16x8_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return 
vorrq_m_n(a, 0x3500, p); +#else /* POLYMORPHIC */ + return vorrq_m_n_s16(a, 0x3500, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 654311424, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vorrq_m_n_s32(int32x4_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vorrq_m_n(a, 0x27000000, p); +#else /* POLYMORPHIC */ + return vorrq_m_n_s32(a, 0x27000000, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> [[A:%.*]], i32 175, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vorrq_m_n_u16(uint16x8_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vorrq_m_n(a, 0xaf, p); +#else /* POLYMORPHIC */ + return vorrq_m_n_u16(a, 0xaf, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vorrq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> [[A:%.*]], i32 89, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vorrq_m_n_u32(uint32x4_t a, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vorrq_m_n(a, 0x59, p); +#else /* POLYMORPHIC */ + return vorrq_m_n_u32(a, 0x59, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmvnq_m_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> , <8 x i16> [[INACTIVE:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vmvnq_m_n_s16(int16x8_t inactive, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmvnq_m(inactive, 0xf00, p); +#else /* POLYMORPHIC */ + return vmvnq_m_n_s16(inactive, 0xf00, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmvnq_m_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> [[INACTIVE:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmvnq_m_n_s32(int32x4_t inactive, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmvnq_m(inactive, 0x4a00, p); +#else /* POLYMORPHIC */ + return vmvnq_m_n_s32(inactive, 0x4a00, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmvnq_m_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> , <8 x i16> [[INACTIVE:%.*]] +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vmvnq_m_n_u16(uint16x8_t inactive, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmvnq_m(inactive, 0xa500, p); +#else /* 
POLYMORPHIC */ + return vmvnq_m_n_u16(inactive, 0xa500, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmvnq_m_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> [[INACTIVE:%.*]] +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmvnq_m_n_u32(uint32x4_t inactive, mve_pred16_t p) +{ +#ifdef POLYMORPHIC + return vmvnq_m(inactive, 0xf800, p); +#else /* POLYMORPHIC */ + return vmvnq_m_n_u32(inactive, 0xf800, p); +#endif /* POLYMORPHIC */ +} + +// CHECK-LABEL: @test_vmvnq_x_n_s16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> , <8 x i16> undef +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +int16x8_t test_vmvnq_x_n_s16(mve_pred16_t p) +{ + return vmvnq_x_n_s16(0xfd00, p); +} + +// CHECK-LABEL: @test_vmvnq_x_n_s32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> undef +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +int32x4_t test_vmvnq_x_n_s32(mve_pred16_t p) +{ + return vmvnq_x_n_s32(0xba0000, p); +} + +// CHECK-LABEL: @test_vmvnq_x_n_u16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> , <8 x i16> undef +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vmvnq_x_n_u16(mve_pred16_t p) +{ + return vmvnq_x_n_u16(0x5400, p); +} + +// CHECK-LABEL: @test_vmvnq_x_n_u32( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = select <4 x i1> [[TMP1]], <4 x i32> , <4 x i32> undef +// CHECK-NEXT: ret <4 x i32> [[TMP2]] +// +uint32x4_t test_vmvnq_x_n_u32(mve_pred16_t p) +{ + return vmvnq_x_n_u32(0x1300, p); +} + diff --git a/clang/test/Sema/arm-mve-immediates.c b/clang/test/Sema/arm-mve-immediates.c --- a/clang/test/Sema/arm-mve-immediates.c +++ b/clang/test/Sema/arm-mve-immediates.c @@ -203,3 +203,73 @@ vsriq(vw, vw, 0); // expected-error {{argument value 0 is outside the valid range [1, 32]}} vsriq(vw, vw, 33); // expected-error {{argument value 33 is outside the valid range [1, 32]}} } + +void test_simd_bic_orr(int16x8_t h, int32x4_t w) +{ + h = vbicq(h, 0x0000); + h = vbicq(h, 0x0001); + h = vbicq(h, 0x00FF); + h = vbicq(h, 0x0100); + h = vbicq(h, 0x0101); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + h = vbicq(h, 0x01FF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + h = vbicq(h, 0xFF00); + + w = vbicq(w, 0x00000000); + w = vbicq(w, 0x00000001); + w = vbicq(w, 0x000000FF); + w = vbicq(w, 0x00000100); + w = vbicq(w, 0x0000FF00); + w = vbicq(w, 0x00010000); + w = vbicq(w, 0x00FF0000); + w = vbicq(w, 0x01000000); + w = vbicq(w, 0xFF000000); + w = vbicq(w, 0x01000001); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + w = vbicq(w, 
0x01FFFFFF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + + h = vorrq(h, 0x0000); + h = vorrq(h, 0x0001); + h = vorrq(h, 0x00FF); + h = vorrq(h, 0x0100); + h = vorrq(h, 0x0101); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + h = vorrq(h, 0x01FF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + h = vorrq(h, 0xFF00); + + w = vorrq(w, 0x00000000); + w = vorrq(w, 0x00000001); + w = vorrq(w, 0x000000FF); + w = vorrq(w, 0x00000100); + w = vorrq(w, 0x0000FF00); + w = vorrq(w, 0x00010000); + w = vorrq(w, 0x00FF0000); + w = vorrq(w, 0x01000000); + w = vorrq(w, 0xFF000000); + w = vorrq(w, 0x01000001); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} + w = vorrq(w, 0x01FFFFFF); // expected-error-re {{argument should be an 8-bit value shifted by a multiple of 8 bits{{$}}}} +} + +void test_simd_vmvn(void) +{ + uint16x8_t h; + h = vmvnq_n_u16(0x0000); + h = vmvnq_n_u16(0x0001); + h = vmvnq_n_u16(0x00FF); + h = vmvnq_n_u16(0x0100); + h = vmvnq_n_u16(0x0101); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}} + h = vmvnq_n_u16(0x01FF); + h = vmvnq_n_u16(0xFF00); + + uint32x4_t w; + w = vmvnq_n_u32(0x00000000); + w = vmvnq_n_u32(0x00000001); + w = vmvnq_n_u32(0x000000FF); + w = vmvnq_n_u32(0x00000100); + w = vmvnq_n_u32(0x0000FF00); + w = vmvnq_n_u32(0x00010000); + w = vmvnq_n_u32(0x00FF0000); + w = vmvnq_n_u32(0x01000000); + w = vmvnq_n_u32(0xFF000000); + w = vmvnq_n_u32(0x01000001); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}} + w = vmvnq_n_u32(0x01FFFFFF); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}} + w = vmvnq_n_u32(0x0001FFFF); // expected-error {{argument should be an 8-bit value shifted by a multiple of 8 bits, or in the form 0x??FF}} + w = vmvnq_n_u32(0x000001FF); +} diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -882,38 +882,38 @@ break; case ImmediateArg::BoundsType::UInt: lo = 0; - hi = IA.i1; + hi = llvm::APInt::getMaxValue(IA.i1).zext(128); break; } - llvm::APInt typelo, typehi; - unsigned Bits = IA.ArgType->sizeInBits(); - if (cast(IA.ArgType)->kind() == ScalarTypeKind::SignedInt) { - typelo = llvm::APInt::getSignedMinValue(Bits).sext(128); - typehi = llvm::APInt::getSignedMaxValue(Bits).sext(128); - } else { - typelo = llvm::APInt::getMinValue(Bits).zext(128); - typehi = llvm::APInt::getMaxValue(Bits).zext(128); - } - std::string Index = utostr(kv.first); - if (lo.sle(typelo) && hi.sge(typehi)) - SemaChecks.push_back("SemaBuiltinConstantArg(TheCall, " + Index + ")"); - else + unsigned ArgTypeBits = IA.ArgType->sizeInBits(); + llvm::APInt ArgTypeRange = llvm::APInt::getMaxValue(ArgTypeBits).zext(128); + llvm::APInt ActualRange = (hi-lo).trunc(64).sext(128); + if (ActualRange.ult(ArgTypeRange)) SemaChecks.push_back("SemaBuiltinConstantArgRange(TheCall, " + Index + ", " + signedHexLiteral(lo) + ", " + signedHexLiteral(hi) + ")"); if (!IA.ExtraCheckType.empty()) { std::string Suffix; - if (!IA.ExtraCheckArgs.empty()) - Suffix = (Twine(", ") + IA.ExtraCheckArgs).str(); + if (!IA.ExtraCheckArgs.empty()) { + std::string tmp; + StringRef Arg = IA.ExtraCheckArgs; + if (Arg == "!lanesize") 
{
+          tmp = utostr(IA.ArgType->sizeInBits());
+          Arg = tmp;
+        }
+        Suffix = (Twine(", ") + Arg).str();
+      }
       SemaChecks.push_back((Twine("SemaBuiltinConstantArg") + IA.ExtraCheckType +
                             "(TheCall, " + Index + Suffix + ")")
                                .str());
     }
+
+    assert(!SemaChecks.empty());
   }
   if (SemaChecks.empty())
     return "";
diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td
--- a/llvm/include/llvm/IR/IntrinsicsARM.td
+++ b/llvm/include/llvm/IR/IntrinsicsARM.td
@@ -1133,4 +1133,10 @@
                             [llvm_i32_ty, llvm_i32_ty, llvm_i32_ty, llvm_i32_ty,
                              llvm_i32_ty, llvm_anyvector_ty, LLVMMatchType<0>],
                             llvm_anyvector_ty>;
+
+def int_arm_mve_bic_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+def int_arm_mve_orr_imm_predicated: Intrinsic<[llvm_anyvector_ty],
+   [LLVMMatchType<0>, llvm_i32_ty, llvm_anyvector_ty], [IntrNoMem]>;
+
 } // end TargetPrefix
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -268,6 +268,20 @@
   void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
                      const uint16_t *const *Opcodes);

+  /// SelectMVE_VBIC_VORR_imm - Select immediate forms of MVE VBIC and
+  /// VORR instructions, which have different MC ids depending on the
+  /// shape of the constant. N should be an ARMISD::VBICIMM / VORRIMM
+  /// node; the array Opcodes has 6 entries, used for 8-bit constants
+  /// shifted left by 0, 8, 16 or 24 bits in a 32-bit word, and 0 or 8
+  /// bits in a 16-bit word, respectively.
+  void SelectMVE_VBIC_VORR_imm(SDNode *N, const uint16_t *Opcodes);
+
+  /// SelectMVE_VBIC_VORR_imm_predicated - Select immediate MVE VBIC
+  /// and VORR, but this time the predicated forms, starting from a
+  /// node that is a call to the arm.mve.{bic,orr}.imm.predicated
+  /// intrinsic.
+  void SelectMVE_VBIC_VORR_imm_predicated(SDNode *N, const uint16_t *Opcodes);
+
   /// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
   /// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
   /// for loading D registers.
@@ -2581,6 +2595,91 @@
   CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
 }

+static const uint16_t MVE_VBIC_Opcodes[] = {
+    ARM::MVE_VBICIZ0v4i32,  ARM::MVE_VBICIZ8v4i32,
+    ARM::MVE_VBICIZ16v4i32, ARM::MVE_VBICIZ24v4i32,
+    ARM::MVE_VBICIZ0v8i16,  ARM::MVE_VBICIZ8v8i16,
+};
+static const uint16_t MVE_VORR_Opcodes[] = {
+    ARM::MVE_VORRIZ0v4i32,  ARM::MVE_VORRIZ8v4i32,
+    ARM::MVE_VORRIZ16v4i32, ARM::MVE_VORRIZ24v4i32,
+    ARM::MVE_VORRIZ0v8i16,  ARM::MVE_VORRIZ8v8i16,
+};
+
+void ARMDAGToDAGISel::SelectMVE_VBIC_VORR_imm(SDNode *N,
+                                              const uint16_t *Opcodes) {
+  SDLoc Loc(N);
+
+  // The VBICIMM or VORRIMM SDNode will have encoded the constant in
+  // NEON form, which we unpack to select between MVE opcodes.
+  //
+  // We expect its low 8 bits to be the literal 8-bit value that will
+  // be shifted to form the true constant, and the higher bits to
+  // specify the NEON control bits. In our case those bits will be
+  // 0,2,4,6,8,10 for the six entries in our Opcodes array.
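+  //
+  // (Worked example: a 32-bit lane constant of 0x6400 arrives here as
+  // NeonConst = 0x264, i.e. literal byte 0x64 with control bits 2, so
+  // OpcodeIndex is 1 and we pick the "shifted left by 8 bits" opcode
+  // for v4i32.)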
+  unsigned NeonConst = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+  unsigned ShiftedValue = NeonConst & 0xFF;
+  unsigned ControlBits = NeonConst >> 8;
+  assert(ControlBits % 2 == 0 &&
+         "unexpected NEON constant encoding for MVE VBIC/VORR");
+  unsigned OpcodeIndex = ControlBits >> 1;
+  assert(OpcodeIndex < 6 &&
+         "unexpected NEON constant encoding for MVE VBIC/VORR");
+  uint16_t Opcode = Opcodes[OpcodeIndex];
+
+  // The actual constant operand will have the same value that would
+  // be shown in assembly. So we shift it left by the right number of
+  // bytes, which is 0,1,2,3 for the first four opcode types (with a
+  // 32-bit word size) and then 0,1 again for the other two (16-bit).
+  unsigned ShiftBytes = OpcodeIndex & 3;
+  uint32_t MveConst = ShiftedValue << (8 * ShiftBytes);
+
+  SmallVector Ops;
+  Ops.push_back(N->getOperand(0));
+  Ops.push_back(CurDAG->getTargetConstant(MveConst, Loc, MVT::i32));
+  // Even in its unpredicated form, the MVE instruction carries the
+  // standard 'no predicate' operands, so add an empty predicate here.
+  AddEmptyMVEPredicateToOps(Ops, Loc);
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectMVE_VBIC_VORR_imm_predicated(
+    SDNode *N, const uint16_t *Opcodes) {
+  SDLoc Loc(N);
+
+  uint32_t MveConst = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+
+  unsigned ElementBits;
+  switch (N->getValueType(0).getSimpleVT().SimpleTy) {
+  case MVT::v8i16:
+    ElementBits = 16;
+    break;
+  case MVT::v4i32:
+    ElementBits = 32;
+    break;
+  default:
+    llvm_unreachable("bad vector type in SelectMVE_VBIC_VORR_imm_predicated");
+  }
+
+  // Find the byte position of the single non-zero byte: the constant
+  // must be an 8-bit value shifted left by a whole number of bytes.
+  unsigned Shift, ShiftedValue;
+  for (Shift = 0; Shift < ElementBits; Shift += 8) {
+    ShiftedValue = MveConst >> Shift;
+    if (ShiftedValue < 256 && MveConst == ShiftedValue << Shift)
+      break;
+  }
+  assert(Shift < ElementBits &&
+         "bad constant in SelectMVE_VBIC_VORR_imm_predicated");
+
+  unsigned ShiftBytes = Shift / 8;
+  unsigned OpcodeIndex = (ElementBits == 16 ? 4 : 0) + ShiftBytes;
+  uint16_t Opcode = Opcodes[OpcodeIndex];
+
+  SmallVector Ops;
+  Ops.push_back(N->getOperand(1));
+  Ops.push_back(CurDAG->getTargetConstant(MveConst, Loc, MVT::i32));
+  AddMVEPredicateToOps(Ops, Loc, N->getOperand(3));
+  CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
 static bool SDValueToConstBool(SDValue SDVal) {
   assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
   ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -4573,6 +4672,14 @@
                       OpcodesS, OpcodesU);
       return;
     }
+
+    case Intrinsic::arm_mve_bic_imm_predicated:
+      SelectMVE_VBIC_VORR_imm_predicated(N, MVE_VBIC_Opcodes);
+      return;
+    case Intrinsic::arm_mve_orr_imm_predicated:
+      SelectMVE_VBIC_VORR_imm_predicated(N, MVE_VORR_Opcodes);
+      return;
+
     }
     break;
   }
@@ -4580,6 +4687,26 @@
   case ISD::ATOMIC_CMP_SWAP:
     SelectCMP_SWAP(N);
     return;
+
+  case ARMISD::VBICIMM:
+    if (Subtarget->hasMVEIntegerOps()) {
+      // The MVE version of this instruction is divided into
+      // sub-opcodes in a way that makes it tricky to select using
+      // Tablegen patterns, so we use custom C++.
+      SelectMVE_VBIC_VORR_imm(N, MVE_VBIC_Opcodes);
+      return;
+    }
+    // On NEON, Tablegen can handle the job.
+    break;
+
+  case ARMISD::VORRIMM:
+    // As with VBICIMM, we have to do this by hand on MVE, and leave
+    // it to the automation otherwise.
+ if (Subtarget->hasMVEIntegerOps()) { + SelectMVE_VBIC_VORR_imm(N, MVE_VORR_Opcodes); + return; + } + break; } SelectCode(N); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -12176,7 +12176,7 @@ APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VbicVT; @@ -12483,7 +12483,7 @@ APInt SplatBits, SplatUndef; unsigned SplatBitSize; bool HasAnyUndefs; - if (BVN && Subtarget->hasNEON() && + if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) && BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) { if (SplatBitSize <= 64) { EVT VorrVT; diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td --- a/llvm/lib/Target/ARM/ARMInstrInfo.td +++ b/llvm/lib/Target/ARM/ARMInstrInfo.td @@ -274,6 +274,10 @@ def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; +def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, + SDTCisVT<2, i32>]>; +def ARMvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; +def ARMvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -2265,6 +2265,15 @@ def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)), (v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>; + + def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; + def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm), + MQPR:$inactive)), + (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm, + ARMVCCThen, VCCR:$pred, MQPR:$inactive))>; } class MVE_VMINMAXA size, diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td --- a/llvm/lib/Target/ARM/ARMInstrNEON.td +++ b/llvm/lib/Target/ARM/ARMInstrNEON.td @@ -509,11 +509,6 @@ def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>; def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>; -def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, - SDTCisVT<2, i32>]>; -def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>; -def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>; - def NEONvbsl : SDNode<"ARMISD::VBSL", SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>, @@ -5296,7 +5291,7 @@ IIC_VMOVImm, "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + (v4i16 (ARMvorrImm DPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5305,7 +5300,7 @@ IIC_VMOVImm, "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> { + (v2i32 (ARMvorrImm DPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5314,7 +5309,7 @@ IIC_VMOVImm, "vorr", "i16", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + (v8i16 (ARMvorrImm QPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5323,7 +5318,7 
@@ IIC_VMOVImm, "vorr", "i32", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> { + (v4i32 (ARMvorrImm QPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5347,7 +5342,7 @@ IIC_VMOVImm, "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + (v4i16 (ARMvbicImm DPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5356,7 +5351,7 @@ IIC_VMOVImm, "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", [(set DPR:$Vd, - (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> { + (v2i32 (ARMvbicImm DPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } @@ -5365,7 +5360,7 @@ IIC_VMOVImm, "vbic", "i16", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + (v8i16 (ARMvbicImm QPR:$src, timm:$SIMM)))]> { let Inst{9} = SIMM{9}; } @@ -5374,7 +5369,7 @@ IIC_VMOVImm, "vbic", "i32", "$Vd, $SIMM", "$src = $Vd", [(set QPR:$Vd, - (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> { + (v4i32 (ARMvbicImm QPR:$src, timm:$SIMM)))]> { let Inst{10-9} = SIMM{10-9}; } diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -588,7 +588,7 @@ ; CHECK-NEXT: vmov.16 q0[5], r2 ; CHECK-NEXT: vmov.16 q0[6], r1 ; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vbic.i16 q0, #0xff00 ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: %offs = load <8 x i8*>, <8 x i8*>* %offptr, align 4 diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/bitwise-imm.ll @@ -0,0 +1,343 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh0(<8 x i16> %a) { +; CHECK-LABEL: test_vbicq_n_u16_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i16 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = and <8 x i16> %a, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_n_u16_sh8(<8 x i16> %a) { +; CHECK-LABEL: test_vbicq_n_u16_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i16 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = and <8 x i16> %a, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh0(<4 x i32> %a) { +; CHECK-LABEL: test_vbicq_n_u32_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i32 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = and <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh8(<4 x i32> %a) { +; CHECK-LABEL: test_vbicq_n_u32_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i32 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = and <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh16(<4 x i32> %a) { +; CHECK-LABEL: test_vbicq_n_u32_sh16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i32 q0, #0x640000 +; CHECK-NEXT: bx lr +entry: + %0 = and <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_n_u32_sh24(<4 x i32> %a) { +; CHECK-LABEL: test_vbicq_n_u32_sh24: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vbic.i32 q0, #0x64000000 +; CHECK-NEXT: bx lr +entry: + %0 = and <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> 
@test_vorrq_n_u16_sh0(<8 x i16> %a) { +; CHECK-LABEL: test_vorrq_n_u16_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i16 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = or <8 x i16> %a, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vorrq_n_u16_sh8(<8 x i16> %a) { +; CHECK-LABEL: test_vorrq_n_u16_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i16 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = or <8 x i16> %a, + ret <8 x i16> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh0(<4 x i32> %a) { +; CHECK-LABEL: test_vorrq_n_u32_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i32 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = or <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh8(<4 x i32> %a) { +; CHECK-LABEL: test_vorrq_n_u32_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i32 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = or <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh16(<4 x i32> %a) { +; CHECK-LABEL: test_vorrq_n_u32_sh16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i32 q0, #0x640000 +; CHECK-NEXT: bx lr +entry: + %0 = or <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_n_u32_sh24(<4 x i32> %a) { +; CHECK-LABEL: test_vorrq_n_u32_sh24: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vorr.i32 q0, #0x64000000 +; CHECK-NEXT: bx lr +entry: + %0 = or <4 x i32> %a, + ret <4 x i32> %0 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_n_u16_sh0(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vbicq_m_n_u16_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i16 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 100, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vbicq_m_n_u16_sh8(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vbicq_m_n_u16_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i16 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 25600, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh0(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vbicq_m_n_u32_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i32 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 100, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh8(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vbicq_m_n_u32_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i32 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 25600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh16(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: 
test_vbicq_m_n_u32_sh16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i32 q0, #0x640000 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6553600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vbicq_m_n_u32_sh24(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vbicq_m_n_u32_sh24: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vbict.i32 q0, #0x64000000 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 1677721600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vorrq_m_n_u16_sh0(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u16_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt.i16 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 100, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vorrq_m_n_u16_sh8(<8 x i16> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u16_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt.i16 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = tail call <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16> %a, i32 25600, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh0(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u32_sh0: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt.i32 q0, #0x64 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 100, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh8(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u32_sh8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt.i32 q0, #0x6400 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 25600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh16(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u32_sh16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vorrt.i32 q0, #0x640000 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 6553600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vorrq_m_n_u32_sh24(<4 x i32> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vorrq_m_n_u32_sh24: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: 
vpst +; CHECK-NEXT: vorrt.i32 q0, #0x64000000 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32> %a, i32 1677721600, <4 x i1> %1) + ret <4 x i32> %2 +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmvnq_n_u16() { +; CHECK-LABEL: test_vmvnq_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmvn.i16 q0, #0xaa00 +; CHECK-NEXT: bx lr +entry: + ret <8 x i16> +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmvnq_n_u32() { +; CHECK-LABEL: test_vmvnq_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmvn.i32 q0, #0xaa00 +; CHECK-NEXT: bx lr +entry: + ret <4 x i32> +} + +define arm_aapcs_vfpcc <8 x i16> @test_vmvnq_m_n_u16(<8 x i16> %inactive, i16 zeroext %p) { +; CHECK-LABEL: test_vmvnq_m_n_u16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmvnt.i16 q0, #0xaa00 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = select <8 x i1> %1, <8 x i16> , <8 x i16> %inactive + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <4 x i32> @test_vmvnq_m_n_u32(<4 x i32> %inactive, i16 zeroext %p) { +; CHECK-LABEL: test_vmvnq_m_n_u32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vmvnt.i32 q0, #0xaa00 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = select <4 x i1> %1, <4 x i32> , <4 x i32> %inactive + ret <4 x i32> %2 +} + +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <8 x i16> @llvm.arm.mve.bic.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <4 x i32> @llvm.arm.mve.bic.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) +declare <8 x i16> @llvm.arm.mve.orr.imm.predicated.v8i16.v8i1(<8 x i16>, i32, <8 x i1>) +declare <4 x i32> @llvm.arm.mve.orr.imm.predicated.v4i32.v4i1(<4 x i32>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll --- a/llvm/test/CodeGen/Thumb2/mve-masked-load.ll +++ b/llvm/test/CodeGen/Thumb2/mve-masked-load.ll @@ -998,21 +998,21 @@ define arm_aapcs_vfpcc <8 x i16> @zext8_masked_v8i16_align1_other(<8 x i8> *%dest, <8 x i8> %a) { ; CHECK-LE-LABEL: zext8_masked_v8i16_align1_other: ; CHECK-LE: @ %bb.0: @ %entry -; CHECK-LE-NEXT: vmovlb.u8 q1, q0 -; CHECK-LE-NEXT: vmovlb.s8 q0, q0 -; CHECK-LE-NEXT: vpt.s16 gt, q0, zr -; CHECK-LE-NEXT: vldrbt.u16 q0, [r0] -; CHECK-LE-NEXT: vpsel q0, q0, q1 +; CHECK-LE-NEXT: vmovlb.s8 q1, q0 +; CHECK-LE-NEXT: vpt.s16 gt, q1, zr +; CHECK-LE-NEXT: vldrbt.u16 q1, [r0] +; CHECK-LE-NEXT: vbic.i16 q0, #0xff00 +; CHECK-LE-NEXT: vpsel q0, q1, q0 ; CHECK-LE-NEXT: bx lr ; ; CHECK-BE-LABEL: zext8_masked_v8i16_align1_other: ; CHECK-BE: @ %bb.0: @ %entry ; CHECK-BE-NEXT: vrev64.16 q1, q0 -; CHECK-BE-NEXT: vmovlb.u8 q0, q1 -; CHECK-BE-NEXT: vmovlb.s8 q1, q1 -; CHECK-BE-NEXT: vpt.s16 gt, q1, zr -; CHECK-BE-NEXT: vldrbt.u16 q1, [r0] -; CHECK-BE-NEXT: vpsel q1, q1, q0 +; CHECK-BE-NEXT: vmovlb.s8 q0, q1 +; CHECK-BE-NEXT: vpt.s16 gt, q0, zr +; CHECK-BE-NEXT: vldrbt.u16 q0, [r0] +; CHECK-BE-NEXT: vbic.i16 q1, #0xff00 +; CHECK-BE-NEXT: vpsel q1, q0, q1 ; CHECK-BE-NEXT: vrev64.16 q0, q1 ; CHECK-BE-NEXT: bx lr entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-sext.ll b/llvm/test/CodeGen/Thumb2/mve-sext.ll --- a/llvm/test/CodeGen/Thumb2/mve-sext.ll +++ 
b/llvm/test/CodeGen/Thumb2/mve-sext.ll @@ -277,7 +277,7 @@ define arm_aapcs_vfpcc <8 x i16> @zext_v8i8_v8i16(<8 x i8> %src) { ; CHECK-LABEL: zext_v8i8_v8i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vbic.i16 q0, #0xff00 ; CHECK-NEXT: bx lr entry: %0 = zext <8 x i8> %src to <8 x i16> @@ -308,41 +308,41 @@ define arm_aapcs_vfpcc <16 x i16> @zext_v16i8_v16i16(<16 x i8> %src) { ; CHECK-LABEL: zext_v16i8_v16i16: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vmov.u8 r0, q0[0] +; CHECK-NEXT: vmov.16 q0[0], r0 +; CHECK-NEXT: vmov.u8 r0, q2[1] +; CHECK-NEXT: vmov.16 q0[1], r0 +; CHECK-NEXT: vmov.u8 r0, q2[2] +; CHECK-NEXT: vmov.16 q0[2], r0 +; CHECK-NEXT: vmov.u8 r0, q2[3] +; CHECK-NEXT: vmov.16 q0[3], r0 +; CHECK-NEXT: vmov.u8 r0, q2[4] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.u8 r0, q2[5] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: vmov.u8 r0, q2[6] +; CHECK-NEXT: vmov.16 q0[6], r0 +; CHECK-NEXT: vmov.u8 r0, q2[7] +; CHECK-NEXT: vmov.16 q0[7], r0 +; CHECK-NEXT: vmov.u8 r0, q2[8] ; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov.u8 r0, q0[8] -; CHECK-NEXT: vmovlb.u8 q2, q1 -; CHECK-NEXT: vmov.16 q1[0], r0 -; CHECK-NEXT: vmov.u8 r0, q0[9] +; CHECK-NEXT: vmov.u8 r0, q2[9] ; CHECK-NEXT: vmov.16 q1[1], r0 -; CHECK-NEXT: vmov.u8 r0, q0[10] +; CHECK-NEXT: vmov.u8 r0, q2[10] ; CHECK-NEXT: vmov.16 q1[2], r0 -; CHECK-NEXT: vmov.u8 r0, q0[11] +; CHECK-NEXT: vmov.u8 r0, q2[11] ; CHECK-NEXT: vmov.16 q1[3], r0 -; CHECK-NEXT: vmov.u8 r0, q0[12] +; CHECK-NEXT: vmov.u8 r0, q2[12] ; CHECK-NEXT: vmov.16 q1[4], r0 -; CHECK-NEXT: vmov.u8 r0, q0[13] +; CHECK-NEXT: vmov.u8 r0, q2[13] ; CHECK-NEXT: vmov.16 q1[5], r0 -; CHECK-NEXT: vmov.u8 r0, q0[14] +; CHECK-NEXT: vmov.u8 r0, q2[14] ; CHECK-NEXT: vmov.16 q1[6], r0 -; CHECK-NEXT: vmov.u8 r0, q0[15] +; CHECK-NEXT: vmov.u8 r0, q2[15] ; CHECK-NEXT: vmov.16 q1[7], r0 -; CHECK-NEXT: vmov q0, q2 -; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vbic.i16 q0, #0xff00 +; CHECK-NEXT: vbic.i16 q1, #0xff00 ; CHECK-NEXT: bx lr entry: %0 = zext <16 x i8> %src to <16 x i16> diff --git a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll --- a/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll +++ b/llvm/test/CodeGen/Thumb2/mve-shuffleext.ll @@ -73,7 +73,7 @@ define arm_aapcs_vfpcc <8 x i16> @zext_02468101214(<16 x i8> %src) { ; CHECK-LABEL: zext_02468101214: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vbic.i16 q0, #0xff00 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32> @@ -85,7 +85,7 @@ ; CHECK-LABEL: zext_13579111315: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vrev16.8 q0, q0 -; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vbic.i16 q0, #0xff00 ; CHECK-NEXT: bx lr entry: %strided.vec = shufflevector <16 x i8> %src, <16 x i8> undef, <8 x i32>
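The churn in mve-gather-ptrs.ll, mve-masked-load.ll, mve-sext.ll and mve-shuffleext.ll above is all the same effect: once the lowering changes allow MVE to form VBICIMM/VORRIMM, a zero-extend from i8 to i16 lanes, which lowers to an AND with a splat of 0x00FF, is now selected as vbic.i16 q0, #0xff00 instead of vmovlb.u8. A scalar sketch of why the two are equivalent, per 16-bit lane (illustrative helper, not part of the patch):

  #include <cstdint>

  // Clearing the top byte of a 16-bit lane whose low byte holds an i8
  // value is exactly zero-extension of that i8, which is what
  // "vbic.i16 q0, #0xff00" does across the whole vector.
  static inline uint16_t zext_i8_lane(uint16_t lane) {
    return (uint16_t)(lane & ~0xFF00u); // same result as (uint16_t)(uint8_t)lane
  }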