diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2780,6 +2780,28 @@
                             [llvm_nxv4f32_ty, llvm_nxv4f32_ty],
                             [IntrNoMem]>;
 
+  class SME2_CVT_FtoI_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
+                            [llvm_anyvector_ty, LLVMMatchType<0>],
+                            [IntrNoMem]>;
+
+  class SME2_CVT_ItoF_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
+                            [IntrNoMem]>;
+
+  class SME2_CVT_FtoI_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>,
+                             LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
+                            [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [IntrNoMem]>;
+
+  class SME2_CVT_ItoF_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
+                            [LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>,
+                             LLVMVectorOfBitcastsToInt<0>, LLVMVectorOfBitcastsToInt<0>],
+                            [IntrNoMem]>;
+
   //
   // Multi-vector fused multiply-add/subtract
   //
@@ -2838,4 +2860,18 @@
   //
   def int_aarch64_sve_fcvtn_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
   def int_aarch64_sve_bfcvtn_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+
+  //
+  // Multi-vector convert to/from floating-point.
+  //
+  def int_aarch64_sve_fcvt_x2  : SME2_CVT_VG2_SINGLE_Intrinsic;
+  def int_aarch64_sve_bfcvt_x2 : SME2_CVT_VG2_SINGLE_BF16_Intrinsic;
+  def int_aarch64_sve_fcvts_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
+  def int_aarch64_sve_fcvtu_x2 : SME2_CVT_FtoI_VG2_Intrinsic;
+  def int_aarch64_sve_scvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
+  def int_aarch64_sve_ucvtf_x2 : SME2_CVT_ItoF_VG2_Intrinsic;
+  def int_aarch64_sve_fcvts_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
+  def int_aarch64_sve_fcvtu_x4 : SME2_CVT_FtoI_VG4_Intrinsic;
+  def int_aarch64_sve_scvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
+  def int_aarch64_sve_ucvtf_x4 : SME2_CVT_ItoF_VG4_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -357,6 +357,7 @@
                              unsigned Opc_rr, unsigned Opc_ri,
                              bool IsIntr = false);
   void SelectWhilePair(SDNode *N, unsigned Opc);
+  void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
 
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
@@ -1747,6 +1748,22 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AArch64DAGToDAGISel::SelectCVTIntrinsic(SDNode *N, unsigned NumVecs,
+                                             unsigned Opcode) {
+  EVT VT = N->getValueType(0);
+  SmallVector<SDValue, 4> Regs(N->op_begin() + 1, N->op_begin() + 1 + NumVecs);
+  SDValue Ops = createZTuple(Regs);
+  SDLoc DL(N);
+  SDNode *Intrinsic = CurDAG->getMachineNode(Opcode, DL, MVT::Untyped, Ops);
+  SDValue SuperReg = SDValue(Intrinsic, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+                                   AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  CurDAG->RemoveDeadNode(N);
+  return;
+}
+
 void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
                                                unsigned Scale, unsigned Opc_ri,
                                                unsigned Opc_rr, bool IsIntr) {
@@ -4732,6 +4749,30 @@
                        AArch64::WHILELT_2PXX_S, AArch64::WHILELT_2PXX_D}))
         SelectWhilePair(Node, Op);
       return;
+    case Intrinsic::aarch64_sve_fcvts_x2:
+      SelectCVTIntrinsic(Node, 2, AArch64::FCVTZS_2Z2Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_scvtf_x2:
+      SelectCVTIntrinsic(Node, 2, AArch64::SCVTF_2Z2Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_fcvtu_x2:
+      SelectCVTIntrinsic(Node, 2, AArch64::FCVTZU_2Z2Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_ucvtf_x2:
+      SelectCVTIntrinsic(Node, 2, AArch64::UCVTF_2Z2Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_fcvts_x4:
+      SelectCVTIntrinsic(Node, 4, AArch64::FCVTZS_4Z4Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_scvtf_x4:
+      SelectCVTIntrinsic(Node, 4, AArch64::SCVTF_4Z4Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_fcvtu_x4:
+      SelectCVTIntrinsic(Node, 4, AArch64::FCVTZU_4Z4Z_StoS);
+      return;
+    case Intrinsic::aarch64_sve_ucvtf_x4:
+      SelectCVTIntrinsic(Node, 4, AArch64::UCVTF_4Z4Z_StoS);
+      return;
     }
     break;
   }
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -363,9 +363,9 @@
 defm UMLSL_VG2_M2Z2Z : sme2_int_mla_long_array_vg2_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x2>;
 defm UMLSL_VG4_M4Z4Z : sme2_int_mla_long_array_vg4_multi<"umlsl", 0b11, int_aarch64_sme_umlsl_vg2x4>;
 
-defm FCVT_Z2Z_StoH   : sme2_cvt_vg2_single<"fcvt",   0b0000, nxv8f16,  nxv4f32, null_frag>;
+defm FCVT_Z2Z_StoH   : sme2_cvt_vg2_single<"fcvt",   0b0000, nxv8f16,  nxv4f32, int_aarch64_sve_fcvt_x2>;
 defm FCVTN_Z2Z_StoH  : sme2_cvt_vg2_single<"fcvtn",  0b0001, nxv8f16,  nxv4f32, int_aarch64_sve_fcvtn_x2>;
-defm BFCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"bfcvt",  0b1000, nxv8bf16, nxv4f32, null_frag>;
+defm BFCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"bfcvt",  0b1000, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvt_x2>;
 defm BFCVTN_Z2Z_StoH : sme2_cvt_vg2_single<"bfcvtn", 0b1001, nxv8bf16, nxv4f32, int_aarch64_sve_bfcvtn_x2>;
 
 defm SQCVT_Z2Z_StoH  : sme2_cvt_vg2_single<"sqcvt",  0b0110, nxv8i16,  nxv4i32, null_frag>;
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -2036,7 +2036,7 @@
 // SME2 multi-vec FP to int convert four registers
 // SME2 multi-vec int to FP four registers
 multiclass sme2_fp_cvt_vg4_multi<string mnemonic, bits<7> op> {
-  def _S : sme2_frint_zip_cvt_vg4_multi<0b00, op, ZZZZ_s_mul_r, ZZZZ_s_mul_r, mnemonic>;
+  def NAME : sme2_frint_zip_cvt_vg4_multi<0b00, op, ZZZZ_s_mul_r, ZZZZ_s_mul_r, mnemonic>;
 }
 
 // SME2 multi-vec quadwords ZIP four registers
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-cvt.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+;
+; FCVT
+;
+
+define <vscale x 8 x half> @multi_vector_cvt_x2_f16(<vscale x 8 x half> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vector_cvt_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    fcvt z0.h, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret <vscale x 8 x half> %res
+}
+
+;
+; BFCVT
+;
+
+define <vscale x 8 x bfloat> @multi_vector_cvt_x2_bf16(<vscale x 8 x bfloat> %unused, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2) {
+; CHECK-LABEL: multi_vector_cvt_x2_bf16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    bfcvt z0.h, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2)
+  ret <vscale x 8 x bfloat> %res
+}
+
+;
+; FCVTZS
+;
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_f32_s32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_f32_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    fcvtzs { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_f32_s32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_f32_s32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    fcvtzs { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res
+}
+
+;
+; FCVTZU
+;
+define {<vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x2_f32_u32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_f32_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    fcvtzu { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>} %res
+}
+
+define {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @multi_vector_cvt_x4_f32_u32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_f32_u32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    fcvtzu { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3)
+  ret {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} %res
+}
+
+;
+; SCVTF
+;
+define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_s32_f32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_s32_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    scvtf { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_s32_f32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_s32_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    scvtf { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+;
+; UCVTF
+;
+define {<vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x2_u32_f32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1) {
+; CHECK-LABEL: multi_vector_cvt_x2_u32_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    ucvtf { z0.s, z1.s }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @multi_vector_cvt_x4_u32_f32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3) {
+; CHECK-LABEL: multi_vector_cvt_x4_u32_f32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z7.d, z4.d
+; CHECK-NEXT:    mov z6.d, z3.d
+; CHECK-NEXT:    mov z5.d, z2.d
+; CHECK-NEXT:    mov z4.d, z1.d
+; CHECK-NEXT:    ucvtf { z0.s - z3.s }, { z4.s - z7.s }
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3)
+  ret {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} %res
+}
+
+declare <vscale x 8 x half> @llvm.aarch64.sve.fcvt.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 8 x bfloat> @llvm.aarch64.sve.bfcvt.x2(<vscale x 4 x float>, <vscale x 4 x float>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x2.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x2.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvts.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare {<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>} @llvm.aarch64.sve.fcvtu.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.scvtf.x4.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>} @llvm.aarch64.sve.ucvtf.x4.nxv4f32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)