diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -9061,33 +9061,32 @@
   if (IsBoolTy)
     EltTy = IntegerType::get(getLLVMContext(), SVEBitsPerBlock / NumOpnds);

-  Address Alloca = CreateTempAlloca(llvm::ArrayType::get(EltTy, NumOpnds),
-                                    CharUnits::fromQuantity(16));
+  SmallVector<llvm::Value *, 16> VecOps;
   for (unsigned I = 0; I < NumOpnds; ++I)
-    Builder.CreateDefaultAlignedStore(
-        IsBoolTy ? Builder.CreateZExt(Ops[I], EltTy) : Ops[I],
-        Builder.CreateGEP(Alloca.getElementType(), Alloca.getPointer(),
-                          {Builder.getInt64(0), Builder.getInt64(I)}));
+    VecOps.push_back(Builder.CreateZExt(Ops[I], EltTy));
+  Value *Vec = BuildVector(VecOps);

   SVETypeFlags TypeFlags(Builtin->TypeModifier);
   Value *Pred = EmitSVEAllTruePred(TypeFlags);

   llvm::Type *OverloadedTy = getSVEVectorForElementType(EltTy);
-  Function *F = CGM.getIntrinsic(Intrinsic::aarch64_sve_ld1rq, OverloadedTy);
-  Value *Alloca0 = Builder.CreateGEP(
-      Alloca.getElementType(), Alloca.getPointer(),
-      {Builder.getInt64(0), Builder.getInt64(0)});
-  Value *LD1RQ = Builder.CreateCall(F, {Pred, Alloca0});
+  Value *InsertSubVec = Builder.CreateInsertVector(
+      OverloadedTy, UndefValue::get(OverloadedTy), Vec, Builder.getInt64(0));
+
+  Function *F =
+      CGM.getIntrinsic(Intrinsic::aarch64_sve_dupq_lane, OverloadedTy);
+  Value *DupQLane =
+      Builder.CreateCall(F, {InsertSubVec, Builder.getInt64(0)});

   if (!IsBoolTy)
-    return LD1RQ;
+    return DupQLane;

   // For svdupq_n_b* we need to add an additional 'cmpne' with '0'.
   F = CGM.getIntrinsic(NumOpnds == 2 ? Intrinsic::aarch64_sve_cmpne
                                      : Intrinsic::aarch64_sve_cmpne_wide,
                        OverloadedTy);
-  Value *Call =
-      Builder.CreateCall(F, {Pred, LD1RQ, EmitSVEDupX(Builder.getInt64(0))});
+  Value *Call = Builder.CreateCall(
+      F, {Pred, DupQLane, EmitSVEDupX(Builder.getInt64(0))});
   return EmitSVEPredicateCast(Call, cast<llvm::ScalableVectorType>(Ty));
 }

diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
--- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq-bfloat.c
@@ -24,16 +24,13 @@
 svbfloat16_t test_svdupq_n_bf16(bfloat16_t x0, bfloat16_t x1, bfloat16_t x2,
                                 bfloat16_t x3, bfloat16_t x4, bfloat16_t x5,
                                 bfloat16_t x6, bfloat16_t x7)
 {
   // CHECK-LABEL: test_svdupq_n_bf16
-  // CHECK: %[[ALLOCA:.*]] = alloca [8 x bfloat], align 16
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store bfloat %x0, bfloat* %[[BASE]], align 16
-  //
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x bfloat], [8 x bfloat]* %[[ALLOCA]], i64 0, i64 7
-  // CHECK: store bfloat %x7, bfloat* %[[GEP]], align 2
-  // CHECK-NOT: store
-  // CHECK: call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8bf16( %{{.*}}, bfloat* nonnull %[[BASE]])
-  // CHECK: ret %[[LOAD]]
+  // CHECK: insertelement <8 x bfloat> undef, bfloat %x0, i32 0
+  //
+  // CHECK: %[[VEC:.*]] = insertelement <8 x bfloat> %[[X:.*]], bfloat %x7, i32 7
+  // CHECK-NOT: insertelement
+  // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv8bf16.v8bf16( undef, <8 x bfloat> %[[VEC]], i64 0)
+  // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv8bf16( %[[INS]], i64 0)
+  // CHECK: ret %[[DUPQ]]
   // expected-warning@+1 {{implicit declaration of function 'svdupq_n_bf16'}}
   return SVE_ACLE_FUNC(svdupq, _n, _bf16, )(x0, x1, x2,
x3, x4, x5, x6, x7); } diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c --- a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c +++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq.c @@ -107,16 +107,13 @@ int8_t x12, int8_t x13, int8_t x14, int8_t x15) { // CHECK-LABEL: test_svdupq_n_s8 - // CHECK: %[[ALLOCA:.*]] = alloca [16 x i8], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i8 %x0, i8* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15 - // CHECK: store i8 %x15, i8* %[[GEP]], align 1 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv16i8( %[[PTRUE]], i8* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <16 x i8> undef, i8 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %x15, i32 15 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv16i8( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_s8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); } @@ -124,47 +121,38 @@ int16_t x4, int16_t x5, int16_t x6, int16_t x7) { // CHECK-LABEL: test_svdupq_n_s16 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [8 x i16], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i16 %x0, i16* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7 - // CHECK: store i16 %x7, i16* %[[GEP]], align 2 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8i16( %{{.*}}, i16* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <8 x i16> undef, i16 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %x7, i32 7 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv8i16( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_s16,)(x0, x1, x2, x3, x4, x5, x6, x7); } svint32_t test_svdupq_n_s32(int32_t x0, int32_t x1, int32_t x2, int32_t x3) { // CHECK-LABEL: test_svdupq_n_s32 - // CHECK: %[[ALLOCA:.*]] = alloca [4 x i32], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i32 %x0, i32* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3 - // CHECK: store i32 %x3, i32* %[[GEP]], align 4 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv4i32( %{{.*}}, i32* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <4 x i32> undef, i32 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %x3, i32 3 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call 
@llvm.experimental.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv4i32( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_s32,)(x0, x1, x2, x3); } svint64_t test_svdupq_n_s64(int64_t x0, int64_t x1) { // CHECK-LABEL: test_svdupq_n_s64 - // CHECK: %[[ALLOCA:.*]] = alloca [2 x i64], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16 - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1 - // CHECK: store i64 %x1, i64* %[[GEP]], align 8 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2i64( %{{.*}}, i64* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %x0, i32 0 + // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %x1, i32 1 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv2i64( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_s64,)(x0, x1); } @@ -174,16 +162,13 @@ uint8_t x12, uint8_t x13, uint8_t x14, uint8_t x15) { // CHECK-LABEL: test_svdupq_n_u8 - // CHECK: %[[ALLOCA:.*]] = alloca [16 x i8], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i8 %x0, i8* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15 - // CHECK: store i8 %x15, i8* %[[GEP]], align 1 - // CHECK-NOT: store - // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv16i8( %[[PTRUE]], i8* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <16 x i8> undef, i8 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %x15, i32 15 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv16i8( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_u8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); } @@ -191,47 +176,38 @@ uint16_t x4, uint16_t x5, uint16_t x6, uint16_t x7) { // CHECK-LABEL: test_svdupq_n_u16 - // CHECK: %[[ALLOCA:.*]] = alloca [8 x i16], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i16 %x0, i16* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7 - // CHECK: store i16 %x7, i16* %[[GEP]], align 2 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8i16( %{{.*}}, i16* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <8 x i16> undef, i16 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %x7, i32 7 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = 
call @llvm.aarch64.sve.dupq.lane.nxv8i16( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_u16,)(x0, x1, x2, x3, x4, x5, x6, x7); } svuint32_t test_svdupq_n_u32(uint32_t x0, uint32_t x1, uint32_t x2, uint32_t x3) { // CHECK-LABEL: test_svdupq_n_u32 - // CHECK: %[[ALLOCA:.*]] = alloca [4 x i32], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i32 %x0, i32* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3 - // CHECK: store i32 %x3, i32* %[[GEP]], align 4 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv4i32( %{{.*}}, i32* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <4 x i32> undef, i32 %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %x3, i32 3 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv4i32( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_u32,)(x0, x1, x2, x3); } svuint64_t test_svdupq_n_u64(uint64_t x0, uint64_t x1) { // CHECK-LABEL: test_svdupq_n_u64 - // CHECK: %[[ALLOCA:.*]] = alloca [2 x i64], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16 - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1 - // CHECK: store i64 %x1, i64* %[[GEP]], align 8 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2i64( %{{.*}}, i64* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %x0, i32 0 + // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %x1, i32 1 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv2i64( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_u64,)(x0, x1); } @@ -239,47 +215,38 @@ float16_t x4, float16_t x5, float16_t x6, float16_t x7) { // CHECK-LABEL: test_svdupq_n_f16 - // CHECK: %[[ALLOCA:.*]] = alloca [8 x half], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x half], [8 x half]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store half %x0, half* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x half], [8 x half]* %[[ALLOCA]], i64 0, i64 7 - // CHECK: store half %x7, half* %[[GEP]], align 2 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8f16( %{{.*}}, half* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <8 x half> undef, half %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <8 x half> %[[X:.*]], half %x7, i32 7 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv8f16.v8f16( undef, <8 x half> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv8f16( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_f16,)(x0, x1, x2, 
x3, x4, x5, x6, x7); } svfloat32_t test_svdupq_n_f32(float32_t x0, float32_t x1, float32_t x2, float32_t x3) { // CHECK-LABEL: test_svdupq_n_f32 - // CHECK: %[[ALLOCA:.*]] = alloca [4 x float], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x float], [4 x float]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store float %x0, float* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x float], [4 x float]* %[[ALLOCA]], i64 0, i64 3 - // CHECK: store float %x3, float* %[[GEP]], align 4 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv4f32( %{{.*}}, float* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: insertelement <4 x float> undef, float %x0, i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <4 x float> %[[X:.*]], float %x3, i32 3 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv4f32.v4f32( undef, <4 x float> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv4f32( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_f32,)(x0, x1, x2, x3); } svfloat64_t test_svdupq_n_f64(float64_t x0, float64_t x1) { // CHECK-LABEL: test_svdupq_n_f64 - // CHECK: %[[ALLOCA:.*]] = alloca [2 x double], align 16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x double], [2 x double]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store double %x0, double* %[[BASE]], align 16 - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x double], [2 x double]* %[[ALLOCA]], i64 0, i64 1 - // CHECK: store double %x1, double* %[[GEP]], align 8 - // CHECK-NOT: store - // CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2f64( %{{.*}}, double* nonnull %[[BASE]]) - // CHECK: ret %[[LOAD]] + // CHECK: %[[SVEC:.*]] = insertelement <2 x double> undef, double %x0, i32 0 + // CHECK: %[[VEC:.*]] = insertelement <2 x double> %[[SVEC]], double %x1, i32 1 + // CHECK-NOT: insertelement + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv2f64.v2f64( undef, <2 x double> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv2f64( %[[INS]], i64 0) + // CHECK: ret %[[DUPQ]] return SVE_ACLE_FUNC(svdupq,_n,_f64,)(x0, x1); } @@ -289,19 +256,17 @@ bool x12, bool x13, bool x14, bool x15) { // CHECK-LABEL: test_svdupq_n_b8 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [16 x i8], align 16 // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i8 // CHECK-DAG: %[[X15:.*]] = zext i1 %x15 to i8 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i8 %[[X0]], i8* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [16 x i8], [16 x i8]* %[[ALLOCA]], i64 0, i64 15 - // CHECK: store i8 %[[X15]], i8* %[[GEP]], align 1 - // CHECK-NOT: store + // CHECK: insertelement <16 x i8> undef, i8 %[[X0]], i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <16 x i8> %[[X:.*]], i8 %[[X15]], i32 15 + // CHECK-NOT: insertelement // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv16i8( %[[PTRUE]], i8* nonnull %[[BASE]]) + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, <16 x i8> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv16i8( %[[INS]], i64 0) // CHECK: %[[ZERO:.*]] = call 
@llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %[[PTRUE]], %[[LOAD]], %[[ZERO]]) + // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %[[PTRUE]], %[[DUPQ]], %[[ZERO]]) // CHECK: ret %[[CMP]] return SVE_ACLE_FUNC(svdupq,_n,_b8,)(x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15); } @@ -310,19 +275,17 @@ bool x4, bool x5, bool x6, bool x7) { // CHECK-LABEL: test_svdupq_n_b16 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [8 x i16], align 16 // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i16 // CHECK-DAG: %[[X7:.*]] = zext i1 %x7 to i16 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i16 %[[X0]], i16* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [8 x i16], [8 x i16]* %[[ALLOCA]], i64 0, i64 7 - // CHECK: store i16 %[[X7]], i16* %[[GEP]], align 2 - // CHECK-NOT: store + // CHECK: insertelement <8 x i16> undef, i16 %[[X0]], i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <8 x i16> %[[X:.*]], i16 %[[X7]], i32 7 + // CHECK-NOT: insertelement // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv8i16( %{{.*}}, i16* nonnull %[[BASE]]) + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, <8 x i16> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv8i16( %[[INS]], i64 0) // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %{{.*}}, %[[LOAD]], %[[ZERO]]) + // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %[[PTRUE]], %[[DUPQ]], %[[ZERO]]) // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %[[CMP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b16,)(x0, x1, x2, x3, x4, x5, x6, x7); @@ -331,20 +294,18 @@ svbool_t test_svdupq_n_b32(bool x0, bool x1, bool x2, bool x3) { // CHECK-LABEL: test_svdupq_n_b32 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [4 x i32], align 16 // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i32 // CHECK-DAG: %[[X3:.*]] = zext i1 %x3 to i32 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i32 %[[X0]], i32* %[[BASE]], align 16 - // - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [4 x i32], [4 x i32]* %[[ALLOCA]], i64 0, i64 3 - // CHECK: store i32 %[[X3]], i32* %[[GEP]], align 4 - // CHECK-NOT: store + // CHECK: insertelement <4 x i32> undef, i32 %[[X0]], i32 0 + // + // CHECK: %[[VEC:.*]] = insertelement <4 x i32> %[[X:.*]], i32 %[[X3]], i32 3 + // CHECK-NOT: insertelement // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv4i32( %{{.*}}, i32* nonnull %[[BASE]]) + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv4i32( %[[INS]], i64 0) // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[INTRINSIC:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %{{.*}}, %[[LOAD]], %[[ZERO]]) - // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %[[INTRINSIC]]) + // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %[[PTRUE]], %[[DUPQ]], %[[ZERO]]) + // CHECK: %[[CAST:.*]] = call 
@llvm.aarch64.sve.convert.to.svbool.nxv4i1( %[[CMP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b32,)(x0, x1, x2, x3); } @@ -352,41 +313,17 @@ svbool_t test_svdupq_n_b64(bool x0, bool x1) { // CHECK-LABEL: test_svdupq_n_b64 - // CHECK-DAG: %[[ALLOCA:.*]] = alloca [2 x i64], align 16 // CHECK-DAG: %[[X0:.*]] = zext i1 %x0 to i64 // CHECK-DAG: %[[X1:.*]] = zext i1 %x1 to i64 - // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0 - // CHECK-DAG: store i64 %[[X0]], i64* %[[BASE]], align 16 - // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1 - // CHECK: store i64 %[[X1]], i64* %[[GEP]], align 8 - // CHECK-NOT: store + // CHECK: %[[SVEC:.*]] = insertelement <2 x i64> undef, i64 %[[X0]], i32 0 + // CHECK: %[[VEC:.*]] = insertelement <2 x i64> %[[SVEC]], i64 %[[X1]], i32 1 + // CHECK-NOT: insertelement // CHECK: %[[PTRUE:.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) - // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2i64( %{{.*}}, i64* nonnull %[[BASE]]) + // CHECK: %[[INS:.*]] = call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, <2 x i64> %[[VEC]], i64 0) + // CHECK: %[[DUPQ:.*]] = call @llvm.aarch64.sve.dupq.lane.nxv2i64( %[[INS]], i64 0) // CHECK: %[[ZERO:.*]] = call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) - // CHECK: %[[INTRINSIC:.*]] = call @llvm.aarch64.sve.cmpne.nxv2i64( %{{.*}}, %[[LOAD]], %[[ZERO]]) - // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %[[INTRINSIC]]) + // CHECK: %[[CMP:.*]] = call @llvm.aarch64.sve.cmpne.nxv2i64( %[[PTRUE]], %[[DUPQ]], %[[ZERO]]) + // CHECK: %[[CAST:.*]] = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %[[CMP]]) // CHECK: ret %[[CAST]] return SVE_ACLE_FUNC(svdupq,_n,_b64,)(x0, x1); } - -// This test checks that the `alloca` is added to the entry-block. 
-svint64_t test_svdupq_control_flow(int64_t x0, int64_t x1, svint64_t Default, bool P)
-{
-  // CHECK-LABEL: test_svdupq_control_flow
-  // CHECK: entry:
-  // CHECK-DAG: %[[ALLOCA:.*]] = alloca [2 x i64], align 16
-  // CHECK-DAG: [[BR:.*]]:
-  // CHECK-DAG: %[[BASE:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 0
-  // CHECK-DAG: store i64 %x0, i64* %[[BASE]], align 16
-  // CHECK-DAG: %[[GEP:.*]] = getelementptr inbounds [2 x i64], [2 x i64]* %[[ALLOCA]], i64 0, i64 1
-  // CHECK: store i64 %x1, i64* %[[GEP]], align 8
-  // CHECK-NOT: store
-  // CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31)
-  // CHECK: %[[LOAD:.*]] = call @llvm.aarch64.sve.ld1rq.nxv2i64( %{{.*}}, i64* nonnull %[[BASE]])
-  // CHECK: [[END:.*]]:
-  // CHECK: %[[RETVAL:.*]] = phi [ %[[LOAD]], %if.end ], [ %Default, %entry ]
-  // CHECK: ret %[[RETVAL]]
-  if (P)
-    return Default;
-  return SVE_ACLE_FUNC(svdupq,_n,_s64,)(x0, x1);
-}
diff --git a/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq_const.c b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq_const.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/aarch64-sve-intrinsics/acle_sve_dupq_const.c
@@ -0,0 +1,36 @@
+// REQUIRES: aarch64-registered-target
+// RUN: %clang_cc1 -triple aarch64-none-linux-gnu -target-feature +sve -fallow-half-arguments-and-returns -S -O1 -Werror -Wall -o - %s | FileCheck %s
+
+#include <arm_sve.h>
+
+svbool_t test_svdupq_n_b8_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b8_const
+  // CHECK: ptrue p0.h
+  // CHECK-NEXT: ret
+  return svdupq_n_b8(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);
+}
+
+svbool_t test_svdupq_n_b16_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b16_const
+  // CHECK: ptrue p0.h
+  // CHECK-NEXT: ret
+  return svdupq_n_b16(1, 1, 1, 1, 1, 1, 1, 1);
+}
+
+svbool_t test_svdupq_n_b32_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b32_const
+  // CHECK: ptrue p0.s
+  // CHECK-NEXT: ret
+  return svdupq_n_b32(1, 1, 1, 1);
+}
+
+svbool_t test_svdupq_n_b64_const()
+{
+  // CHECK-LABEL: test_svdupq_n_b64_const
+  // CHECK: ptrue p0.d
+  // CHECK-NEXT: ret
+  return svdupq_n_b64(1, 1);
+}
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -1594,7 +1594,6 @@
 def int_aarch64_sve_dup : AdvSIMD_SVE_DUP_Intrinsic;
 def int_aarch64_sve_dup_x : AdvSIMD_SVE_DUP_Unpred_Intrinsic;
-
 def int_aarch64_sve_index : AdvSIMD_SVE_Index_Intrinsic;

 //
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -390,6 +390,111 @@
   return IC.replaceInstUsesWith(II, Insert);
 }

+static Optional<Instruction *> instCombineSVECmpNE(InstCombiner &IC,
+                                                   IntrinsicInst &II) {
+  LLVMContext &Ctx = II.getContext();
+  IRBuilder<> Builder(Ctx);
+  Builder.SetInsertPoint(&II);
+
+  // Check that the predicate is all active
+  auto *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(0));
+  if (!Pg || Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return None;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+  if (PTruePattern != AArch64SVEPredPattern::all)
+    return None;
+
+  // Check that we have a compare of zero..
+  auto *DupX = dyn_cast<IntrinsicInst>(II.getArgOperand(2));
+  if (!DupX || DupX->getIntrinsicID() != Intrinsic::aarch64_sve_dup_x)
+    return None;
+
+  auto *DupXArg = dyn_cast<ConstantInt>(DupX->getArgOperand(0));
+  if (!DupXArg || !DupXArg->isZero())
+    return None;
+
+  // ..against a dupq
+  auto *DupQLane = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
+  if (!DupQLane ||
+      DupQLane->getIntrinsicID() != Intrinsic::aarch64_sve_dupq_lane)
+    return None;
+
+  // Where the dupq is a lane 0 replicate of a vector insert
+  if (!cast<ConstantInt>(DupQLane->getArgOperand(1))->isZero())
+    return None;
+
+  auto *VecIns = dyn_cast<IntrinsicInst>(DupQLane->getArgOperand(0));
+  if (!VecIns ||
+      VecIns->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+    return None;
+
+  // Where the vector insert is a fixed constant vector insert into undef at
+  // index zero
+  if (!isa<UndefValue>(VecIns->getArgOperand(0)))
+    return None;
+
+  if (!cast<ConstantInt>(VecIns->getArgOperand(2))->isZero())
+    return None;
+
+  auto *ConstVec = dyn_cast<Constant>(VecIns->getArgOperand(1));
+  if (!ConstVec)
+    return None;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(ConstVec->getType());
+  auto *OutTy = dyn_cast<ScalableVectorType>(II.getType());
+  if (!VecTy || !OutTy || VecTy->getNumElements() != OutTy->getMinNumElements())
+    return None;
+
+  unsigned NumElts = VecTy->getNumElements();
+  unsigned PredicateBits = 0;
+
+  // Expand intrinsic operands to a 16-bit byte level predicate
+  for (unsigned I = 0; I < NumElts; ++I) {
+    auto *Arg = dyn_cast<ConstantInt>(ConstVec->getAggregateElement(I));
+    if (!Arg)
+      return None;
+    if (!Arg->isZero())
+      PredicateBits |= 1 << (I * (16 / NumElts));
+  }
+
+  // If all bits are zero bail early with an empty predicate
+  if (PredicateBits == 0) {
+    auto *PFalse = Constant::getNullValue(II.getType());
+    PFalse->takeName(&II);
+    return IC.replaceInstUsesWith(II, PFalse);
+  }
+
+  // Calculate largest predicate type used (where byte predicate is largest)
+  unsigned Mask = 8;
+  for (unsigned I = 0; I < 16; ++I)
+    if ((PredicateBits & (1 << I)) != 0)
+      Mask |= (I % 8);
+
+  unsigned PredSize = Mask & -Mask;
+  auto *PredType = ScalableVectorType::get(
+      Type::getInt1Ty(Ctx), AArch64::SVEBitsPerBlock / (PredSize * 8));
+
+  // Ensure all relevant bits are set
+  for (unsigned I = 0; I < 16; I += PredSize)
+    if ((PredicateBits & (1 << I)) == 0)
+      return None;
+
+  auto *PTruePat =
+      ConstantInt::get(Type::getInt32Ty(Ctx), AArch64SVEPredPattern::all);
+  auto *PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue,
+                                        {PredType}, {PTruePat});
+  auto *ConvertToSVBool = Builder.CreateIntrinsic(
+      Intrinsic::aarch64_sve_convert_to_svbool, {PredType}, {PTrue});
+  auto *ConvertFromSVBool =
+      Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool,
+                              {II.getType()}, {ConvertToSVBool});
+
+  ConvertFromSVBool->takeName(&II);
+  return IC.replaceInstUsesWith(II, ConvertFromSVBool);
+}
+
 static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
                                                   IntrinsicInst &II) {
   Value *Pg = II.getArgOperand(0);
@@ -498,6 +603,9 @@
     return instCombineConvertFromSVBool(IC, II);
   case Intrinsic::aarch64_sve_dup:
     return instCombineSVEDup(IC, II);
+  case Intrinsic::aarch64_sve_cmpne:
+  case Intrinsic::aarch64_sve_cmpne_wide:
+    return instCombineSVECmpNE(IC, II);
   case Intrinsic::aarch64_sve_rdffr:
     return instCombineRDFFR(IC, II);
   case Intrinsic::aarch64_sve_lasta:
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-cmpne.ll
@@ -0,0 +1,397 @@
+; RUN: opt -S -instcombine < %s |
FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +; DUPQ b8 + +define @dupq_b_0() #0 { +; CHECK-LABEL: @dupq_b_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_d() #0 { +; CHECK-LABEL: @dupq_b_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_w() #0 { +; CHECK-LABEL: @dupq_b_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_h() #0 { +; CHECK-LABEL: @dupq_b_h( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv8i1( %1) +; CHECK-NEXT: ret %2 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +define @dupq_b_b() #0 { +; CHECK-LABEL: @dupq_b_b( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv16i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv16i8.v16i8( undef, + <16 x i8> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv16i8( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv16i8( %1, %3, %4) + ret %5 +} + +; DUPQ b16 + +define @dupq_h_0() #0 { +; CHECK-LABEL: @dupq_h_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +define @dupq_h_d() #0 { +; CHECK-LABEL: @dupq_h_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + 
%3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +define @dupq_h_w() #0 { +; CHECK-LABEL: @dupq_h_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv4i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv8i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +define @dupq_h_h() #0 { +; CHECK-LABEL: @dupq_h_h( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv8i16.v8i16( undef, + <8 x i16> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv8i16( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv8i16( %1, %3, %4) + ret %5 +} + +; DUPQ b32 + +define @dupq_w_0() #0 { +; CHECK-LABEL: @dupq_w_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_w_d() #0 { +; CHECK-LABEL: @dupq_w_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: %2 = call @llvm.aarch64.sve.convert.to.svbool.nxv2i1( %1) +; CHECK-NEXT: %3 = call @llvm.aarch64.sve.convert.from.svbool.nxv4i1( %2) +; CHECK-NEXT: ret %3 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_w_w() #0 { +; CHECK-LABEL: @dupq_w_w( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +; DUPQ b64 + +define @dupq_d_0() #0 { +; CHECK-LABEL: @dupq_d_0( +; CHECK: ret zeroinitializer + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_d_d() #0 { +; CHECK-LABEL: @dupq_d_d( +; CHECK: %1 = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: ret %1 + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call 
@llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +; Cases that cannot be converted + +define @dupq_neg1() #0 { +; CHECK-LABEL: @dupq_neg1( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg2() #0 { +; CHECK-LABEL: @dupq_neg2( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg3() #0 { +; CHECK-LABEL: @dupq_neg3( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg4() #0 { +; CHECK-LABEL: @dupq_neg4( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg5() #0 { +; CHECK-LABEL: @dupq_neg5( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, + <4 x i32> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %3, %4) + ret %5 +} + +define @dupq_neg6(i1 %a) #0 { +; CHECK-LABEL: @dupq_neg6( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv4i1(i32 31) + %2 = zext i1 %a to i32 + %3 = insertelement <4 x i32> , i32 %2, i32 3 + %4 = tail call @llvm.experimental.vector.insert.nxv4i32.v4i32( undef, <4 x i32> %3, i64 0) + %5 = tail call @llvm.aarch64.sve.dupq.lane.nxv4i32( %4 , i64 0) + %6 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %7 = tail call @llvm.aarch64.sve.cmpne.wide.nxv4i32( %1, %5, %6) + ret %7 +} + +define @dupq_neg7() #0 { +; CHECK-LABEL: @dupq_neg7( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 1) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg8() #0 { +; CHECK-LABEL: @dupq_neg8( +; CHECK: cmpne +; CHECK-NEXT: ret + 
%1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 1) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg9( %x) #0 { +; CHECK-LABEL: @dupq_neg9( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( %x, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg10() #0 { +; CHECK-LABEL: @dupq_neg10( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 1) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg11( %pg) #0 { +; CHECK-LABEL: @dupq_neg11( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %2 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %1 , i64 0) + %3 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %4 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %pg, %2, %3) + ret %4 +} + +define @dupq_neg12() #0 { +; CHECK-LABEL: @dupq_neg12( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 15) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.dup.x.nxv2i64(i64 0) + %5 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %4) + ret %5 +} + +define @dupq_neg13( %x) #0 { +; CHECK-LABEL: @dupq_neg13( +; CHECK: cmpne +; CHECK-NEXT: ret + %1 = tail call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) + %2 = tail call @llvm.experimental.vector.insert.nxv2i64.v2i64( undef, + <2 x i64> , i64 0) + %3 = tail call @llvm.aarch64.sve.dupq.lane.nxv2i64( %2 , i64 0) + %4 = tail call @llvm.aarch64.sve.cmpne.nxv2i64( %1, %3, %x) + ret %4 +} + +declare @llvm.aarch64.sve.ptrue.nxv16i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv8i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv4i1(i32) +declare @llvm.aarch64.sve.ptrue.nxv2i1(i32) + +declare @llvm.experimental.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) +declare @llvm.experimental.vector.insert.nxv8i16.v8i16(, <8 x i16>, i64) +declare @llvm.experimental.vector.insert.nxv4i32.v4i32(, <4 x i32>, i64) +declare @llvm.experimental.vector.insert.nxv2i64.v2i64(, <2 x i64>, i64) + +declare @llvm.aarch64.sve.dupq.lane.nxv16i8(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv8i16(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv4i32(, i64) +declare @llvm.aarch64.sve.dupq.lane.nxv2i64(, i64) + +declare @llvm.aarch64.sve.cmpne.wide.nxv16i8(, , ) +declare @llvm.aarch64.sve.cmpne.wide.nxv8i16(, , ) +declare @llvm.aarch64.sve.cmpne.wide.nxv4i32(, , ) +declare @llvm.aarch64.sve.cmpne.nxv2i64(, , ) + +declare @llvm.aarch64.sve.dup.x.nxv2i64(i64) + +attributes #0 = { "target-features"="+sve" }