Patch summary (free-form reviewer notes placed ahead of the first file header):
  * clang/include/clang/Basic/arm_cde.td declares the predicated "_m" forms of
    vcx1q, vcx1qa, vcx2q, vcx2qa, vcx3q and vcx3qa. The vcx2q/vcx3q forms are
    FunctionMacro wrappers that pass n (and m) through __arm_vreinterpretq_u8.
  * llvm/include/llvm/IR/IntrinsicsARM.td adds the "_predicated" and
    "a_predicated" intrinsics, overloaded on the result vector type and on the
    mask vector type.
  * llvm/lib/Target/ARM/ARMInstrCDE.td adds VCXPredicatedPat_m, which selects
    those intrinsics to the CDE_VCX*_vec instructions with ARMVCCThen. Every
    result dag names the instruction's immediate operand class
    (imm_12b / imm_7b / imm_4b) for $imm.
  * clang/test/CodeGen/arm-cde-vec.c and llvm/test/CodeGen/Thumb2/cde-vec.ll
    cover all six forms; the llc RUN line switches from +mve to +mve.fp.
  NOTE(review): several TableGen template argument lists look stripped in this
  copy (for example "ImmArg]>" and the VCXPredicated multiclass header);
  confirm against the original patch before applying.

diff --git a/clang/include/clang/Basic/arm_cde.td b/clang/include/clang/Basic/arm_cde.td --- a/clang/include/clang/Basic/arm_cde.td +++ b/clang/include/clang/Basic/arm_cde.td @@ -189,6 +189,40 @@ "__arm_vcx3qa_impl((cp), (acc), __arm_vreinterpretq_u8(n), " "__arm_vreinterpretq_u8(m), (imm))">; +class CDEIntrinsicMasked + : CDEIntrinsic + $cp, $inactive_or_acc), cgArgs, (? $imm, $pred))> { + let params = T.All; + let polymorphicOnly = 1; +} + +def vcx1q_m : CDEIntrinsicMasked<"vcx1q", (args), (args imm_12b:$imm), (?)>; +def vcx1qa_m : CDEIntrinsicMasked<"vcx1qa", (args), (args imm_12b:$imm), (?)>; + +multiclass VCXPredicated macroArgs, string macro> { + def _m_impl : CDEIntrinsicMasked; + def a_m_impl : CDEIntrinsicMasked; + + def _m: FunctionMacro< + !listconcat(["cp", "inactive"], macroArgs, ["imm", "pred"]), + "__arm_"#NAME#"_m_impl((cp), (inactive), "#macro#" (imm), (pred))">; + def a_m: FunctionMacro< + !listconcat(["cp", "acc"], macroArgs, ["imm", "pred"]), + "__arm_"#NAME#"a_m_impl((cp), (acc), "#macro#" (imm), (pred))">; +} + +defm vcx2q : + VCXPredicated<(args v16u8:$n), (args imm_7b:$imm), (? $n), ["n"], + "__arm_vreinterpretq_u8(n),">; +defm vcx3q : + VCXPredicated<(args v16u8:$n, v16u8:$m), (args imm_4b:$imm), (? 
$n, $m), + ["n", "m"], "__arm_vreinterpretq_u8(n), " + "__arm_vreinterpretq_u8(m),">; + // vreinterpretq intrinsics required by the ACLE CDE specification foreach desttype = [/* no u8 */ s8, u16, s16, u32, s32, u64, s64, f16, f32] in { diff --git a/clang/test/CodeGen/arm-cde-vec.c b/clang/test/CodeGen/arm-cde-vec.c --- a/clang/test/CodeGen/arm-cde-vec.c +++ b/clang/test/CodeGen/arm-cde-vec.c @@ -102,3 +102,75 @@ int8x16_t test_vcx3qa(int8x16_t acc, uint16x8_t n, float32x4_t m) { return __arm_vcx3qa(1, acc, n, m, 13); } + +// CHECK-LABEL: @test_vcx1q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> [[INACTIVE:%.*]], i32 1111, <8 x i1> [[TMP1]]) +// CHECK-NEXT: ret <8 x i16> [[TMP2]] +// +uint16x8_t test_vcx1q_m(uint16x8_t inactive, mve_pred16_t p) { + return __arm_vcx1q_m(0, inactive, 1111, p); +} + +// CHECK-LABEL: @test_vcx1qa_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP1:%.*]] = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 [[TMP0]]) +// CHECK-NEXT: [[TMP2:%.*]] = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> [[ACC:%.*]], i32 1112, <16 x i1> [[TMP1]]) +// CHECK-NEXT: ret <16 x i8> [[TMP2]] +// +uint8x16_t test_vcx1qa_m(uint8x16_t acc, mve_pred16_t p) { + return __arm_vcx1qa_m(1, acc, 1112, p); +} + +// CHECK-LABEL: @test_vcx2q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], i32 111, <4 x i1> [[TMP2]]) +// CHECK-NEXT: 
ret <4 x i32> [[TMP3]] +// +int32x4_t test_vcx2q_m(int32x4_t inactive, float32x4_t n, mve_pred16_t p) { + return __arm_vcx2q_m(0, inactive, n, 111, p); +} + +// CHECK-LABEL: @test_vcx2qa_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> [[ACC:%.*]], <16 x i8> [[TMP0]], i32 112, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret <4 x float> [[TMP3]] +// +float32x4_t test_vcx2qa_m(float32x4_t acc, float16x8_t n, mve_pred16_t p) { + return __arm_vcx2qa_m(0, acc, n, 112, p); +} + +// CHECK-LABEL: @test_vcx3q_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <4 x float> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP2:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP1]]) +// CHECK-NEXT: [[TMP3:%.*]] = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 1, <2 x i64> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[M:%.*]], i32 11, <4 x i1> [[TMP2]]) +// CHECK-NEXT: ret <2 x i64> [[TMP3]] +// +int64x2_t test_vcx3q_m(int64x2_t inactive, float32x4_t n, int8x16_t m, mve_pred16_t p) { + return __arm_vcx3q_m(1, inactive, n, m, 11, p); +} + +// CHECK-LABEL: @test_vcx3qa_m( +// CHECK-NEXT: entry: +// CHECK-NEXT: [[TMP0:%.*]] = bitcast <8 x half> [[N:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP1:%.*]] = bitcast <4 x i32> [[M:%.*]] to <16 x i8> +// CHECK-NEXT: [[TMP2:%.*]] = zext i16 [[P:%.*]] to i32 +// CHECK-NEXT: [[TMP3:%.*]] = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 [[TMP2]]) +// CHECK-NEXT: [[TMP4:%.*]] = call <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 1, <4 x float> [[INACTIVE:%.*]], <16 x i8> [[TMP0]], <16 x i8> [[TMP1]], i32 12, <4 x i1> [[TMP3]]) +// CHECK-NEXT: [[TMP5:%.*]] = 
bitcast <4 x float> [[TMP4]] to <8 x half> +// CHECK-NEXT: ret <8 x half> [[TMP5]] +// +float16x8_t test_vcx3qa_m(float32x4_t inactive, float16x8_t n, uint32x4_t m, mve_pred16_t p) { + return __arm_vcx3qa_m(1, inactive, n, m, 12, p); +} diff --git a/llvm/include/llvm/IR/IntrinsicsARM.td b/llvm/include/llvm/IR/IntrinsicsARM.td --- a/llvm/include/llvm/IR/IntrinsicsARM.td +++ b/llvm/include/llvm/IR/IntrinsicsARM.td @@ -1332,6 +1332,17 @@ !listconcat([llvm_i32_ty /* coproc */, llvm_v16i8_ty /* acc */], args, [llvm_i32_ty /* imm */]), [IntrNoMem, ImmArg<0>, ImmArg]>; + + def _predicated : Intrinsic< + [llvm_anyvector_ty], + !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* inactive */], + args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; + def a_predicated : Intrinsic< + [llvm_anyvector_ty], + !listconcat([llvm_i32_ty /* coproc */, LLVMMatchType<0> /* acc */], + args, [llvm_i32_ty /* imm */, llvm_anyvector_ty /* mask */]), + [IntrNoMem, ImmArg<0>, ImmArg]>; } defm int_arm_cde_vcx1q : CDEVCXVecIntrinsics<[]>; diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td --- a/llvm/lib/Target/ARM/ARMInstrCDE.td +++ b/llvm/lib/Target/ARM/ARMInstrCDE.td @@ -606,3 +606,61 @@ (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m, imm_4b:$imm))>; } + +multiclass VCXPredicatedPat_m { + def : Pat<(VTI.Vec (int_arm_cde_vcx1q_predicated timm:$coproc, + (VTI.Vec MQPR:$inactive), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + imm_12b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + + def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc, + (VTI.Vec MQPR:$inactive), + (v16i8 MQPR:$n), 
timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n), + imm_7b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), imm_7b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; + + def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc, + (VTI.Vec MQPR:$inactive), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n), + (v16i8 MQPR:$m), + imm_4b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred), + (VTI.Vec MQPR:$inactive)))>; + def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc, + (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), timm:$imm, + (VTI.Pred VCCR:$pred))), + (VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc), + (v16i8 MQPR:$n), (v16i8 MQPR:$m), + imm_4b:$imm, ARMVCCThen, + (VTI.Pred VCCR:$pred)))>; +} + +let Predicates = [HasCDE, HasMVEInt] in + foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in + defm : VCXPredicatedPat_m; + +let Predicates = [HasCDE, HasMVEFloat] in + foreach VTI = [ MVE_v8f16, MVE_v4f32 ] in + defm : VCXPredicatedPat_m; diff --git a/llvm/test/CodeGen/Thumb2/cde-vec.ll b/llvm/test/CodeGen/Thumb2/cde-vec.ll --- a/llvm/test/CodeGen/Thumb2/cde-vec.ll +++ b/llvm/test/CodeGen/Thumb2/cde-vec.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve -verify-machineinstrs -o - %s | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -mattr=+cdecp0 -mattr=+cdecp1 -mattr=+mve.fp -verify-machineinstrs -o - %s | FileCheck %s declare <16 x i8> @llvm.arm.cde.vcx1q(i32 immarg, i32 immarg) declare <16 x i8> @llvm.arm.cde.vcx1qa(i32 immarg, <16 x 
i8>, i32 immarg) @@ -112,3 +112,103 @@ %2 = call <16 x i8> @llvm.arm.cde.vcx3qa(i32 1, <16 x i8> %acc, <16 x i8> %0, <16 x i8> %1, i32 13) ret <16 x i8> %2 } + +declare <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32) +declare <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32) +declare <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32) +declare <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 immarg, <8 x i16>, i32 immarg, <8 x i1>) +declare <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 immarg, <16 x i8>, i32 immarg, <16 x i1>) +declare <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 immarg, <4 x i32>, <16 x i8>, i32 immarg, <4 x i1>) +declare <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, i32 immarg, <4 x i1>) +declare <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 immarg, <2 x i64>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) +declare <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 immarg, <4 x float>, <16 x i8>, <16 x i8>, i32 immarg, <4 x i1>) + +define arm_aapcs_vfpcc <8 x i16> @test_vcx1q_m(<8 x i16> %inactive, i16 zeroext %p) { +; CHECK-LABEL: test_vcx1q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx1t p0, q0, #1111 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <8 x i1> @llvm.arm.mve.pred.i2v.v8i1(i32 %0) + %2 = call <8 x i16> @llvm.arm.cde.vcx1q.predicated.v8i16.v8i1(i32 0, <8 x i16> %inactive, i32 1111, <8 x i1> %1) + ret <8 x i16> %2 +} + +define arm_aapcs_vfpcc <16 x i8> @test_vcx1qa_m(<16 x i8> %acc, i16 zeroext %p) { +; CHECK-LABEL: test_vcx1qa_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx1at p1, q0, #1112 +; CHECK-NEXT: bx lr +entry: + %0 = zext i16 %p to i32 + %1 = call <16 x i1> @llvm.arm.mve.pred.i2v.v16i1(i32 %0) + %2 = call <16 x i8> @llvm.arm.cde.vcx1qa.predicated.v16i8.v16i1(i32 1, <16 x i8> %acc, i32 1112, <16 x i1> %1) + ret <16 x i8> %2 +} + +define 
arm_aapcs_vfpcc <4 x i32> @test_vcx2q_m(<4 x i32> %inactive, <4 x float> %n, i16 zeroext %p) { +; CHECK-LABEL: test_vcx2q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx2t p0, q0, q1, #111 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x i32> @llvm.arm.cde.vcx2q.predicated.v4i32.v4i1(i32 0, <4 x i32> %inactive, <16 x i8> %0, i32 111, <4 x i1> %2) + ret <4 x i32> %3 +} + +define arm_aapcs_vfpcc <4 x float> @test_vcx2qa_m(<4 x float> %acc, <8 x half> %n, i16 zeroext %p) { +; CHECK-LABEL: test_vcx2qa_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx2at p0, q0, q1, #112 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x half> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <4 x float> @llvm.arm.cde.vcx2qa.predicated.v4f32.v4i1(i32 0, <4 x float> %acc, <16 x i8> %0, i32 112, <4 x i1> %2) + ret <4 x float> %3 +} + +define arm_aapcs_vfpcc <2 x i64> @test_vcx3q_m(<2 x i64> %inactive, <4 x float> %n, <16 x i8> %m, i16 zeroext %p) { +; CHECK-LABEL: test_vcx3q_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx3t p0, q0, q1, q2, #11 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <4 x float> %n to <16 x i8> + %1 = zext i16 %p to i32 + %2 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %1) + %3 = call <2 x i64> @llvm.arm.cde.vcx3q.predicated.v2i64.v4i1(i32 0, <2 x i64> %inactive, <16 x i8> %0, <16 x i8> %m, i32 11, <4 x i1> %2) + ret <2 x i64> %3 +} + +define arm_aapcs_vfpcc <8 x half> @test_vcx3qa_m(<4 x float> %inactive, <8 x half> %n, <4 x i32> %m, i16 zeroext %p) { +; CHECK-LABEL: test_vcx3qa_m: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmsr p0, r0 +; CHECK-NEXT: vpst +; CHECK-NEXT: vcx3at p0, q0, q1, q2, #12 +; CHECK-NEXT: bx lr +entry: + %0 = bitcast <8 x 
half> %n to <16 x i8> + %1 = bitcast <4 x i32> %m to <16 x i8> + %2 = zext i16 %p to i32 + %3 = call <4 x i1> @llvm.arm.mve.pred.i2v.v4i1(i32 %2) + %4 = call <4 x float> @llvm.arm.cde.vcx3qa.predicated.v4f32.v4i1(i32 0, <4 x float> %inactive, <16 x i8> %0, <16 x i8> %1, i32 12, <4 x i1> %3) + %5 = bitcast <4 x float> %4 to <8 x half> + ret <8 x half> %5 +}