Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2678,4 +2678,24 @@
   def int_aarch64_sme_set_tpidr2
       : DefaultAttrsIntrinsic<[], [llvm_i64_ty],
                               [IntrNoMem, IntrHasSideEffects]>;
+  // Clamp
+  //
+
+  def int_aarch64_sve_sclamp : AdvSIMD_3VectorArg_Intrinsic;
+  def int_aarch64_sve_uclamp : AdvSIMD_3VectorArg_Intrinsic;
+
+  //
+  // Reversal
+  //
+
+  def int_aarch64_sve_revd : AdvSIMD_Merged1VectorArg_Intrinsic;
+
+  //
+  // Predicate selection
+  //
+
+  def int_aarch64_sve_psel
+      : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+                              [LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+                               LLVMMatchType<0>, llvm_i32_ty]>;
 }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -408,6 +408,7 @@
 
   // SME
   RDSVL,
+  REVD_MERGE_PASSTHRU,
 
   // Asserts that a function argument (i32) is zero-extended to i8 by
   // the caller
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -208,6 +208,7 @@
   case AArch64ISD::BSWAP_MERGE_PASSTHRU:
   case AArch64ISD::REVH_MERGE_PASSTHRU:
   case AArch64ISD::REVW_MERGE_PASSTHRU:
+  case AArch64ISD::REVD_MERGE_PASSTHRU:
   case AArch64ISD::CTLZ_MERGE_PASSTHRU:
   case AArch64ISD::CTPOP_MERGE_PASSTHRU:
   case AArch64ISD::DUP_MERGE_PASSTHRU:
@@ -2251,6 +2252,7 @@
     MAKE_CASE(AArch64ISD::BSWAP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::REVH_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::REVW_MERGE_PASSTHRU)
+    MAKE_CASE(AArch64ISD::REVD_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::CTLZ_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::CTPOP_MERGE_PASSTHRU)
     MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
@@ -4569,6 +4571,9 @@
   case Intrinsic::aarch64_sve_revw:
     return DAG.getNode(AArch64ISD::REVW_MERGE_PASSTHRU, dl, Op.getValueType(),
                        Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
+  case Intrinsic::aarch64_sve_revd:
+    return DAG.getNode(AArch64ISD::REVD_MERGE_PASSTHRU, dl, Op.getValueType(),
+                       Op.getOperand(2), Op.getOperand(3), Op.getOperand(1));
   case Intrinsic::aarch64_sve_sxtb:
     return DAG.getNode(
         AArch64ISD::SIGN_EXTEND_INREG_MERGE_PASSTHRU, dl, Op.getValueType(),
Index: llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -148,11 +148,11 @@
 // SVE2 instructions
 //===----------------------------------------------------------------------===//
 
-def REVD_ZPmZ : sve2_int_perm_revd<"revd">;
+defm REVD_ZPmZ : sve2_int_perm_revd<"revd", AArch64revd_mt>;
 
-defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0>;
-defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1>;
+defm SCLAMP_ZZZ : sve2_clamp<"sclamp", 0b0, int_aarch64_sve_sclamp>;
+defm UCLAMP_ZZZ : sve2_clamp<"uclamp", 0b1, int_aarch64_sve_uclamp>;
 
-defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel">;
+defm PSEL_PPPRI : sve2_int_perm_sel_p<"psel", int_aarch64_sve_psel>;
 
 } // End let Predicates = [HasSME]
Index: llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -233,6 +233,7 @@
 def AArch64revb_mt : SDNode<"AArch64ISD::BSWAP_MERGE_PASSTHRU", SDT_AArch64Arith>;
 def AArch64revh_mt : SDNode<"AArch64ISD::REVH_MERGE_PASSTHRU", SDT_AArch64Arith>;
 def AArch64revw_mt : SDNode<"AArch64ISD::REVW_MERGE_PASSTHRU", SDT_AArch64Arith>;
+def AArch64revd_mt : SDNode<"AArch64ISD::REVD_MERGE_PASSTHRU", SDT_AArch64Arith>;
 
 // These are like the above but we don't yet have need for ISD nodes. They allow
 // a single pattern to match intrinsic and ISD operand layouts.
Index: llvm/lib/Target/AArch64/SMEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -1010,6 +1010,15 @@
   let ElementSize = ZPR128.ElementSize;
 }
 
+multiclass sve2_int_perm_revd<string asm, SDPatternOperator op> {
+  def NAME : sve2_int_perm_revd<asm>;
+
+  def : SVE_1_Op_Passthru_Pat<nxv16i8, op, nxv16i1, nxv16i8, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Passthru_Pat<nxv8i16, op, nxv8i1,  nxv8i16, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Passthru_Pat<nxv4i32, op, nxv4i1,  nxv4i32, !cast<Instruction>(NAME)>;
+  def : SVE_1_Op_Passthru_Pat<nxv2i64, op, nxv2i1,  nxv2i64, !cast<Instruction>(NAME)>;
+}
+
 class sve2_clamp<string asm, bits<2> sz, bit U, ZPRRegOp zpr_ty>
     : I<(outs zpr_ty:$Zd), (ins zpr_ty:$Zn, zpr_ty:$Zm, zpr_ty:$_Zd),
         asm, "\t$Zd, $Zn, $Zm", "", []>,
@@ -1031,11 +1040,16 @@
   let ElementSize = zpr_ty.ElementSize;
 }
 
-multiclass sve2_clamp<string asm, bit U> {
+multiclass sve2_clamp<string asm, bit U, SDPatternOperator op> {
   def _B : sve2_clamp<asm, 0b00, U, ZPR8>;
   def _H : sve2_clamp<asm, 0b01, U, ZPR16>;
   def _S : sve2_clamp<asm, 0b10, U, ZPR32>;
   def _D : sve2_clamp<asm, 0b11, U, ZPR64>;
+
+  def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+  def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+  def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+  def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
 }
 
 class sve2_int_perm_sel_p<string asm, PPRRegOp ppr_ty, Operand imm_ty>
@@ -1058,7 +1072,7 @@
   let Inst{3-0} = Pd;
 }
 
-multiclass sve2_int_perm_sel_p<string asm> {
+multiclass sve2_int_perm_sel_p<string asm, SDPatternOperator op> {
   def _B : sve2_int_perm_sel_p<asm, PPR8, sme_elm_idx0_15> {
     bits<4> imm;
     let Inst{23-22} = imm{3-2};
@@ -1082,4 +1096,32 @@
     let Inst{22} = 0b1;
     let Inst{20-18} = 0b000;
   }
+
+  def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+             MatrixIndexGPR32Op12_15:$idx)),
+            (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, 0)>;
+  def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+             MatrixIndexGPR32Op12_15:$idx)),
+            (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, 0)>;
+  def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+             MatrixIndexGPR32Op12_15:$idx)),
+            (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, 0)>;
+  def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+             MatrixIndexGPR32Op12_15:$idx)),
+            (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, 0)>;
+
+  let AddedComplexity = 1 in {
+    def : Pat<(nxv16i1 (op (nxv16i1 PPRAny:$Pn), (nxv16i1 PPRAny:$Pm),
+               (i32 (tileslice8 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_15:$imm)))),
+              (!cast<Instruction>(NAME # _B) $Pn, $Pm, $idx, $imm)>;
+    def : Pat<(nxv8i1 (op (nxv8i1 PPRAny:$Pn), (nxv8i1 PPRAny:$Pm),
+               (i32 (tileslice16 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_7:$imm)))),
+              (!cast<Instruction>(NAME # _H) $Pn, $Pm, $idx, $imm)>;
+    def : Pat<(nxv4i1 (op (nxv4i1 PPRAny:$Pn), (nxv4i1 PPRAny:$Pm),
+               (i32 (tileslice32 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_3:$imm)))),
+              (!cast<Instruction>(NAME # _S) $Pn, $Pm, $idx, $imm)>;
+    def : Pat<(nxv2i1 (op (nxv2i1 PPRAny:$Pn), (nxv2i1 PPRAny:$Pm),
+               (i32 (tileslice64 MatrixIndexGPR32Op12_15:$idx, sme_elm_idx0_1:$imm)))),
+              (!cast<Instruction>(NAME # _D) $Pn, $Pm, $idx, $imm)>;
+  }
 }
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve2-intrinsics-psel.ll
@@ -0,0 +1,91 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 16 x i1> @psel_b(<vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.b[w12, 0]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv16i1(<vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, i32 %idx)
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 16 x i1> @psel_b_imm(<vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_b_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.b[w12, 15]
+; CHECK-NEXT:    ret
+  %add = add i32 %idx, 15
+  %res = call <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv16i1(<vscale x 16 x i1> %p1, <vscale x 16 x i1> %p2, i32 %add)
+  ret <vscale x 16 x i1> %res
+}
+
+define <vscale x 8 x i1> @psel_h(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.h[w12, 0]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx)
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 8 x i1> @psel_h_imm(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_h_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.h[w12, 7]
+; CHECK-NEXT:    ret
+  %add = add i32 %idx, 7
+  %res = call <vscale x 8 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1> %p1, <vscale x 8 x i1> %p2, i32 %add)
+  ret <vscale x 8 x i1> %res
+}
+
+define <vscale x 4 x i1> @psel_s(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.s[w12, 0]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx)
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 4 x i1> @psel_s_imm(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_s_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.s[w12, 3]
+; CHECK-NEXT:    ret
+  %add = add i32 %idx, 3
+  %res = call <vscale x 4 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1> %p1, <vscale x 4 x i1> %p2, i32 %add)
+  ret <vscale x 4 x i1> %res
+}
+
+define <vscale x 2 x i1> @psel_d(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.d[w12, 0]
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx)
+  ret <vscale x 2 x i1> %res
+}
+
+define <vscale x 2 x i1> @psel_d_imm(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %idx) {
+; CHECK-LABEL: psel_d_imm:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w0
+; CHECK-NEXT:    psel p0, p0, p1.d[w12, 1]
+; CHECK-NEXT:    ret
+  %add = add i32 %idx, 1
+  %res = call <vscale x 2 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1> %p1, <vscale x 2 x i1> %p2, i32 %add)
+  ret <vscale x 2 x i1> %res
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.psel.nxv16i1(<vscale x 16 x i1>, <vscale x 16 x i1>, i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.psel.nxv8i1(<vscale x 8 x i1>, <vscale x 8 x i1>, i32)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.psel.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>, i32)
+declare <vscale x 2 x i1> @llvm.aarch64.sve.psel.nxv2i1(<vscale x 2 x i1>, <vscale x 2 x i1>, i32)
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-revd.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve2-intrinsics-revd.ll
@@ -0,0 +1,43 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 16 x i8> @test_revd_i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_revd_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revd z0.q, p0/m, z1.q
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %b)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_revd_i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_revd_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revd z0.q, p0/m, z1.q
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i1> %pg, <vscale x 8 x i16> %b)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @test_revd_i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_revd_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revd z0.q, p0/m, z1.q
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i1> %pg, <vscale x 4 x i32> %b)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @test_revd_i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b) {
+; CHECK-LABEL: test_revd_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    revd z0.q, p0/m, z1.q
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i1> %pg, <vscale x 2 x i64> %b)
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.revd.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.revd.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.revd.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i1>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.revd.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i1>, <vscale x 2 x i64>)
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-sclamp.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve2-intrinsics-sclamp.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 16 x i8> @test_sclamp_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
+; CHECK-LABEL: test_sclamp_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z2.b, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.sclamp.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_sclamp_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
+; CHECK-LABEL: test_sclamp_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z2.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.sclamp.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @test_sclamp_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
+; CHECK-LABEL: test_sclamp_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z2.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.sclamp.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @test_sclamp_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
+; CHECK-LABEL: test_sclamp_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sclamp z2.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.sclamp.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.sclamp.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.sclamp.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.sclamp.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.sclamp.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)
Index: llvm/test/CodeGen/AArch64/sve2-intrinsics-uclamp.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve2-intrinsics-uclamp.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define <vscale x 16 x i8> @test_uclamp_i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c) {
+; CHECK-LABEL: test_uclamp_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z2.b, z0.b, z1.b
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 16 x i8> @llvm.aarch64.sve.uclamp.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b, <vscale x 16 x i8> %c)
+  ret <vscale x 16 x i8> %res
+}
+
+define <vscale x 8 x i16> @test_uclamp_i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c) {
+; CHECK-LABEL: test_uclamp_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z2.h, z0.h, z1.h
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 8 x i16> @llvm.aarch64.sve.uclamp.nxv8i16(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b, <vscale x 8 x i16> %c)
+  ret <vscale x 8 x i16> %res
+}
+
+define <vscale x 4 x i32> @test_uclamp_i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c) {
+; CHECK-LABEL: test_uclamp_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z2.s, z0.s, z1.s
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 4 x i32> @llvm.aarch64.sve.uclamp.nxv4i32(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b, <vscale x 4 x i32> %c)
+  ret <vscale x 4 x i32> %res
+}
+
+define <vscale x 2 x i64> @test_uclamp_i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c) {
+; CHECK-LABEL: test_uclamp_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uclamp z2.d, z0.d, z1.d
+; CHECK-NEXT:    mov z0.d, z2.d
+; CHECK-NEXT:    ret
+  %res = call <vscale x 2 x i64> @llvm.aarch64.sve.uclamp.nxv2i64(<vscale x 2 x i64> %a, <vscale x 2 x i64> %b, <vscale x 2 x i64> %c)
+  ret <vscale x 2 x i64> %res
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.uclamp.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.uclamp.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 4 x i32> @llvm.aarch64.sve.uclamp.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 2 x i64> @llvm.aarch64.sve.uclamp.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>)