[AArch64][SME2] Add multi-vector signed/unsigned unpack intrinsics

Defines int_aarch64_sve_{s,u}unpk_x{2,4}. Each returns 2 or 4 vectors of the
overloaded type and takes 1 (x2) or 2 (x4) vectors with half-width elements
and twice the element count (LLVMSubdivide2VectorType<0>).

The DAG-to-DAG selector picks the {S,U}UNPK_VG2_2ZZ_{H,S,D} and
{S,U}UNPK_VG4_4Z2Z_{H,S,D} opcode from the first result type; the leading 0
in each opcode table leaves the first slot without an opcode, so that slot
is not selected. The x4 forms pass their two inputs as a tuple
(IsTupleInput=true).

NOTE(review): this copy was rebuilt from a whitespace-collapsed paste in which
angle-bracket spans had been stripped. The <vscale x N x T> types in the test
are derived from the intrinsic name suffixes, the %unused parameter type is
assumed to match %a, and the <SelectTypeKind::Int> template argument is
assumed -- confirm against the tree. Exact indentation of context lines is
likewise reconstructed.
NOTE(review): the test file is named sve2p1-* but runs with -mattr=+sme2;
consider an sme2-* file name.

Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2844,6 +2844,15 @@
                             [LLVMMatchType<0>, LLVMMatchType<0>,
                              LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 
+  class SME2_VG2_Unpk_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>], [IntrNoMem]>;
+
+  class SME2_VG4_Unpk_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                             LLVMMatchType<0>, LLVMMatchType<0>],
+                            [LLVMSubdivide2VectorType<0>, LLVMSubdivide2VectorType<0>],
+                            [IntrNoMem]>;
 
   //
   // Multi-vector fused multiply-add/subtract
@@ -2972,4 +2981,12 @@
   def int_aarch64_sve_uzpq_x2 : SVE2_VG2_ZipUzp_Intrinsic;
   def int_aarch64_sve_uzp_x4 : SVE2_VG4_ZipUzp_Intrinsic;
   def int_aarch64_sve_uzpq_x4 : SVE2_VG4_ZipUzp_Intrinsic;
+
+  //
+  // Signed/unsigned multi-vector unpacks
+  //
+  def int_aarch64_sve_sunpk_x2 : SME2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x2 : SME2_VG2_Unpk_Intrinsic;
+  def int_aarch64_sve_sunpk_x4 : SME2_VG4_Unpk_Intrinsic;
+  def int_aarch64_sve_uunpk_x4 : SME2_VG4_Unpk_Intrinsic;
 }
Index: llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -4944,6 +4944,34 @@
       SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true,
                                 AArch64::UZP_VG4_4Z4Z_Q);
       return;
+    case Intrinsic::aarch64_sve_sunpk_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+              Node->getValueType(0),
+              {0, AArch64::SUNPK_VG2_2ZZ_H, AArch64::SUNPK_VG2_2ZZ_S,
+               AArch64::SUNPK_VG2_2ZZ_D}))
+        SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+      return;
+    case Intrinsic::aarch64_sve_uunpk_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+              Node->getValueType(0),
+              {0, AArch64::UUNPK_VG2_2ZZ_H, AArch64::UUNPK_VG2_2ZZ_S,
+               AArch64::UUNPK_VG2_2ZZ_D}))
+        SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+      return;
+    case Intrinsic::aarch64_sve_sunpk_x4:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+              Node->getValueType(0),
+              {0, AArch64::SUNPK_VG4_4Z2Z_H, AArch64::SUNPK_VG4_4Z2Z_S,
+               AArch64::SUNPK_VG4_4Z2Z_D}))
+        SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+      return;
+    case Intrinsic::aarch64_sve_uunpk_x4:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::Int>(
+              Node->getValueType(0),
+              {0, AArch64::UUNPK_VG4_4Z2Z_H, AArch64::UUNPK_VG4_4Z2Z_S,
+               AArch64::UUNPK_VG4_4Z2Z_D}))
+        SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+      return;
     }
     break;
   }
Index: llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/AArch64/sve2p1-intrinsics-unpk.ll
@@ -0,0 +1,146 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+
+; == 2 vectors ==
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_s16_x2(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_unpk_s16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpk { z0.h, z1.h }, z1.b
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x2.nxv8i16(<vscale x 16 x i8> %a)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_s32_x2(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_unpk_s32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpk { z0.s, z1.s }, z1.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x2.nxv4i32(<vscale x 8 x i16> %a)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_s64_x2(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_unpk_s64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    sunpk { z0.d, z1.d }, z1.s
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x2.nxv2i64(<vscale x 4 x i32> %a)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_u16_x2(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a) {
+; CHECK-LABEL: test_unpk_u16_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpk { z0.h, z1.h }, z1.b
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x2.nxv8i16(<vscale x 16 x i8> %a)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_u32_x2(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a) {
+; CHECK-LABEL: test_unpk_u32_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpk { z0.s, z1.s }, z1.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x2.nxv4i32(<vscale x 8 x i16> %a)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_u64_x2(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a) {
+; CHECK-LABEL: test_unpk_u64_x2:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uunpk { z0.d, z1.d }, z1.s
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x2.nxv2i64(<vscale x 4 x i32> %a)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+
+; == 4 vectors ==
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_s16_x4(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_unpk_s16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sunpk { z0.h - z3.h }, { z2.b, z3.b }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_s32_x4(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_unpk_s32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sunpk { z0.s - z3.s }, { z2.h, z3.h }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x4.nxv4i32(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_s64_x4(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_unpk_s64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    sunpk { z0.d - z3.d }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x4.nxv2i64(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @test_unpk_u16_x4(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
+; CHECK-LABEL: test_unpk_u16_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    uunpk { z0.h - z3.h }, { z2.b, z3.b }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x4.nxv8i16(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @test_unpk_u32_x4(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %a, <vscale x 8 x i16> %b) {
+; CHECK-LABEL: test_unpk_u32_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    uunpk { z0.s - z3.s }, { z2.h, z3.h }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x4.nxv4i32(<vscale x 8 x i16> %a, <vscale x 8 x i16> %b)
+  ret { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } %res
+}
+
+define { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @test_unpk_u64_x4(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %a, <vscale x 4 x i32> %b) {
+; CHECK-LABEL: test_unpk_u64_x4:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov z3.d, z2.d
+; CHECK-NEXT:    mov z2.d, z1.d
+; CHECK-NEXT:    uunpk { z0.d - z3.d }, { z2.s, z3.s }
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x4.nxv2i64(<vscale x 4 x i32> %a, <vscale x 4 x i32> %b)
+  ret { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } %res
+}
+
+
+
+; == 2 vectors ==
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x2.nxv8i16(<vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x2.nxv4i32(<vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x2.nxv2i64(<vscale x 4 x i32>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x2.nxv8i16(<vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x2.nxv4i32(<vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x2.nxv2i64(<vscale x 4 x i32>)
+
+; == 4 vectors ==
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.sunpk.x4.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.sunpk.x4.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.sunpk.x4.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uunpk.x4.nxv8i16(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.uunpk.x4.nxv4i32(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.uunpk.x4.nxv2i64(<vscale x 4 x i32>, <vscale x 4 x i32>)