diff --git a/clang/include/clang/Basic/arm_mve.td b/clang/include/clang/Basic/arm_mve.td --- a/clang/include/clang/Basic/arm_mve.td +++ b/clang/include/clang/Basic/arm_mve.td @@ -453,6 +453,15 @@ VecOf, (args VecOf:$inactive, Vector:$a, PredOf:$pred), (IRInt<"vcvt_narrow_predicated"> $inactive, $a, halfconst, $pred)>; } // params = [f32], pnt = PNT_None + + let params = [f16], pnt = PNT_None in { + def vcvt#half#q_f32: Intrinsic, (args Vector:$a), + (IRInt<"vcvt_widen"> $a, halfconst)>; + defm vcvt#half#q: IntrinsicMX< + VecOf, (args Vector:$a, PredOf:$pred), + (IRInt<"vcvt_widen_predicated"> $inactive, $a, halfconst, $pred), + 1, "_f32">; + } // params = [f16], pnt = PNT_None } // loop over half = "b", "t" multiclass float_int_conversions { diff --git a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c --- a/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c +++ b/clang/test/CodeGen/arm-mve-intrinsics/vcvt.c @@ -697,3 +697,71 @@ { return vcvtq_x_n_u32_f32(a, 32, p); } + +// CHECK-LABEL: @test_vcvtbq_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 0) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vcvtbq_f32_f16(float16x8_t a) +{ + return vcvtbq_f32_f16(a); +} + +// CHECK-LABEL: @test_vcvttq_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> [[A:%.*]], i32 1) +// CHECK-NEXT: ret <4 x float> [[TMP0]] +// +float32x4_t test_vcvttq_f32_f16(float16x8_t a) +{ + return vcvttq_f32_f16(a); +} + +// CHECK-LABEL: @test_vcvtbq_m_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcvtbq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p) +{ + return vcvtbq_m_f32_f16(inactive, a, p); +} + +// CHECK-LABEL: @test_vcvttq_m_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> [[INACTIVE:%.*]], <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcvttq_m_f32_f16(float32x4_t inactive, float16x8_t a, mve_pred16_t p) +{ + return vcvttq_m_f32_f16(inactive, a, p); +} + +// CHECK-LABEL: @test_vcvtbq_x_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 0, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcvtbq_x_f32_f16(float16x8_t a, mve_pred16_t p) +{ + return vcvtbq_x_f32_f16(a, p); +} + +// CHECK-LABEL: @test_vcvttq_x_f32_f16( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> undef, <8 x half> [[A:%.*]], i32 1, <4 x i1> [[TMP1]]) +// CHECK-NEXT: ret <4 x float> [[TMP2]] +// +float32x4_t test_vcvttq_x_f32_f16(float16x8_t a, mve_pred16_t p) +{ + return vcvttq_x_f32_f16(a, p); +} diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -911,8 +911,22 @@ LLVMMatchType<0>, rets[0])], props>; } +// Intrinsic with a predicated and a non-predicated case. The predicated case +// has two additional parameters: inactive (the value for inactive lanes, can +// be undef) and predicate. +multiclass MVEMXPredicated rets, list flags, + list params, LLVMType inactive, + LLVMType predicate, + list props = [IntrNoMem]> { + def "": Intrinsic; + def _predicated: Intrinsic; +} + defm int_arm_mve_vcvt_narrow: MVEPredicated<[llvm_v8f16_ty], [llvm_v8f16_ty, llvm_v4f32_ty, llvm_i32_ty], llvm_v4i1_ty>; +defm int_arm_mve_vcvt_widen: MVEMXPredicated<[llvm_v4f32_ty], [], + [llvm_v8f16_ty, llvm_i32_ty], llvm_v4f32_ty, llvm_v4i1_ty>; defm int_arm_mve_vldr_gather_base: MVEPredicated< [llvm_anyvector_ty], [llvm_anyvector_ty, llvm_i32_ty], @@ -1044,18 +1058,6 @@ [llvm_anyvector_ty], [llvm_anyvector_ty, LLVMMatchType<1>, llvm_i32_ty], [IntrNoMem]>; -// Intrinsic with a predicated and a non-predicated case. The predicated case -// has two additional parameters: inactive (the value for inactive lanes, can -// be undef) and predicate. -multiclass MVEMXPredicated rets, list flags, - list params, LLVMType inactive, - LLVMType predicate, - list props = [IntrNoMem]> { - def "": Intrinsic; - def _predicated: Intrinsic; -} - // The first two parameters are compile-time constants: // * Halving: 0 means halving (vhcaddq), 1 means non-halving (vcaddq) // instruction. Note: the flag is inverted to match the corresonding diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td --- a/llvm/lib/Target/ARM/ARMInstrMVE.td +++ b/llvm/lib/Target/ARM/ARMInstrMVE.td @@ -4515,6 +4515,17 @@ multiclass MVE_VCVT_h2f_m { def "": MVE_VCVT_ff; + defvar Inst = !cast(NAME); + + let Predicates = [HasMVEFloat] in { + def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))), + (v4f32 (Inst (v8f16 MQPR:$Qm)))>; + def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated + (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half), + (v4i1 VCCR:$mask))), + (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen, + (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>; + } } defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>; diff --git a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll --- a/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll +++ b/llvm/test/CodeGen/Thumb2/mve-intrinsics/vcvt.ll @@ -6,6 +6,8 @@ declare <8 x half> @llvm.arm.mve.vcvt.narrow(<8 x half>, <4 x float>, i32) declare <8 x half> @llvm.arm.mve.vcvt.narrow.predicated(<8 x half>, <4 x float>, i32, <4 x i1>) +declare <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half>, i32) +declare <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float>, <8 x half>, i32, <4 x i1>) declare <8 x half> @llvm.arm.mve.vcvt.fix.v8f16.v8i16(i32, <8 x i16>, i32) declare <4 x float> @llvm.arm.mve.vcvt.fix.v4f32.v4i32(i32, <4 x i32>, i32) @@ -367,3 +369,51 @@ %2 = call <4 x i32> @llvm.arm.mve.vcvt.fix.predicated.v4i32.v4f32.v4i1(i32 1, <4 x i32> undef, <4 x float> %a, i32 32, <4 x i1> %1) ret <4 x i32> %2 } + +define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_f32_f16(<8 x half> %a) { +; CHECK-LABEL: test_vcvtbq_f32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcvtb.f32.f16 q0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 0) + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcvttq_f32_f16(<8 x half> %a) { +; CHECK-LABEL: test_vcvttq_f32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vcvtt.f32.f16 q0, q0 +; CHECK-NEXT: bx lr +entry: + %0 = tail call <4 x float> @llvm.arm.mve.vcvt.widen(<8 x half> %a, i32 1) + ret <4 x float> %0 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcvtbq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vcvtbq_m_f32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcvtbt.f32.f16 q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 0, <4 x i1> %1) + ret <4 x float> %2 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcvttq_m_f32_f16(<4 x float> %inactive, <8 x half> %a, i16 zeroext %p) { +; CHECK-LABEL: test_vcvttq_m_f32_f16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcvttt.f32.f16 q0, q1 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = tail call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %0) + %2 = tail call <4 x float> @llvm.arm.mve.vcvt.widen.predicated(<4 x float> %inactive, <8 x half> %a, i32 1, <4 x i1> %1) + ret <4 x float> %2 +}