diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2956,6 +2956,16 @@
                 LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
 
+  class SVE2_VG2_ZipUzp_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
+  class SVE2_VG4_ZipUzp_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                             LLVMMatchType<0>, LLVMMatchType<0>],
+                            [LLVMMatchType<0>, LLVMMatchType<0>,
+                             LLVMMatchType<0>, LLVMMatchType<0>], [IntrNoMem]>;
+
   //
   // Multi-vector fused multiply-add/subtract
 
@@ -3199,4 +3209,14 @@
   def int_aarch64_sme_fdot_lane_za32_vg1x2 : SME2_Matrix_ArrayVector_VG2_Multi_Index_Intrinsic;
   def int_aarch64_sme_fdot_lane_za32_vg1x4 : SME2_Matrix_ArrayVector_VG4_Multi_Index_Intrinsic;
+
+  // Multi-vector zip and unzips
+  def int_aarch64_sve_zip_x2  : SVE2_VG2_ZipUzp_Intrinsic;
+  def int_aarch64_sve_zipq_x2 : SVE2_VG2_ZipUzp_Intrinsic;
+  def int_aarch64_sve_zip_x4  : SVE2_VG4_ZipUzp_Intrinsic;
+  def int_aarch64_sve_zipq_x4 : SVE2_VG4_ZipUzp_Intrinsic;
+  def int_aarch64_sve_uzp_x2  : SVE2_VG2_ZipUzp_Intrinsic;
+  def int_aarch64_sve_uzpq_x2 : SVE2_VG2_ZipUzp_Intrinsic;
+  def int_aarch64_sve_uzp_x4  : SVE2_VG4_ZipUzp_Intrinsic;
+  def int_aarch64_sve_uzpq_x4 : SVE2_VG4_ZipUzp_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -365,6 +365,8 @@
   void SelectWhilePair(SDNode *N, unsigned Opc);
   void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
   void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
+  void SelectUnaryMultiIntrinsic(SDNode *N, unsigned NumOutVecs,
+                                 bool IsTupleInput, unsigned Opc);
 
   template <unsigned MaxIdx, unsigned Scale>
   void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
@@ -1720,6 +1722,7 @@
     Int1 = 0,
     Int = 1,
     FP = 2,
+    AnyType = 3,
   };
 
   /// This function selects an opcode from a list of opcodes, which is
@@ -1733,6 +1736,8 @@
   EVT EltVT = VT.getVectorElementType();
   switch (Kind) {
+  case SelectTypeKind::AnyType:
+    break;
   case SelectTypeKind::Int:
     if (EltVT != MVT::i8 && EltVT != MVT::i16 && EltVT != MVT::i32 &&
         EltVT != MVT::i64)
@@ -1952,6 +1957,36 @@
   CurDAG->RemoveDeadNode(N);
 }
 
+void AArch64DAGToDAGISel::SelectUnaryMultiIntrinsic(SDNode *N,
+                                                    unsigned NumOutVecs,
+                                                    bool IsTupleInput,
+                                                    unsigned Opc) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  unsigned NumInVecs = N->getNumOperands() - 1;
+
+  SmallVector<SDValue> Ops;
+  if (IsTupleInput) {
+    assert((NumInVecs == 2 || NumInVecs == 4) &&
+           "Don't know how to handle multi-register input!");
+    SmallVector<SDValue> Regs(N->op_begin() + 1,
+                              N->op_begin() + 1 + NumInVecs);
+    Ops.push_back(createZMulTuple(Regs));
+  } else {
+    // All intrinsic nodes have the ID as the first operand, hence the "1 + I".
+    for (unsigned I = 0; I < NumInVecs; I++)
+      Ops.push_back(N->getOperand(1 + I));
+  }
+
+  SDNode *Res = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+  SDValue SuperReg = SDValue(Res, 0);
+
+  for (unsigned I = 0; I < NumOutVecs; I++)
+    ReplaceUses(SDValue(N, I), CurDAG->getTargetExtractSubreg(
+                                   AArch64::zsub0 + I, DL, VT, SuperReg));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -5244,6 +5279,50 @@
                AArch64::ADD_VG4_4ZZ_S, AArch64::ADD_VG4_4ZZ_D}))
         SelectDestructiveMultiIntrinsic(Node, 4, false, Op);
       return;
+    case Intrinsic::aarch64_sve_zip_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0),
+              {AArch64::ZIP_VG2_2ZZZ_B, AArch64::ZIP_VG2_2ZZZ_H,
+               AArch64::ZIP_VG2_2ZZZ_S, AArch64::ZIP_VG2_2ZZZ_D}))
+        SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+      return;
+    case Intrinsic::aarch64_sve_zipq_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false,
+                                AArch64::ZIP_VG2_2ZZZ_Q);
+      return;
+    case Intrinsic::aarch64_sve_zip_x4:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0),
+              {AArch64::ZIP_VG4_4Z4Z_B, AArch64::ZIP_VG4_4Z4Z_H,
+               AArch64::ZIP_VG4_4Z4Z_S, AArch64::ZIP_VG4_4Z4Z_D}))
+        SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+      return;
+    case Intrinsic::aarch64_sve_zipq_x4:
+      SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true,
+                                AArch64::ZIP_VG4_4Z4Z_Q);
+      return;
+    case Intrinsic::aarch64_sve_uzp_x2:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0),
+              {AArch64::UZP_VG2_2ZZZ_B, AArch64::UZP_VG2_2ZZZ_H,
+               AArch64::UZP_VG2_2ZZZ_S, AArch64::UZP_VG2_2ZZZ_D}))
+        SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false, Op);
+      return;
+    case Intrinsic::aarch64_sve_uzpq_x2:
+      SelectUnaryMultiIntrinsic(Node, 2, /*IsTupleInput=*/false,
+                                AArch64::UZP_VG2_2ZZZ_Q);
+      return;
+    case Intrinsic::aarch64_sve_uzp_x4:
+      if (auto Op = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+              Node->getValueType(0),
+              {AArch64::UZP_VG4_4Z4Z_B, AArch64::UZP_VG4_4Z4Z_H,
+               AArch64::UZP_VG4_4Z4Z_S, AArch64::UZP_VG4_4Z4Z_D}))
+        SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
+      return;
+    case Intrinsic::aarch64_sve_uzpq_x4:
+      SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true,
+                                AArch64::UZP_VG4_4Z4Z_Q);
+      return;
     }
     break;
   }
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx2.ll
@@ -0,0 +1,172 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s
+
+; == 8 to 64-bit elements ==
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @uzp_x2_i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm) nounwind {
+; CHECK-LABEL: uzp_x2_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp { z0.b, z1.b }, z0.b, z1.b
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> %zn, <vscale x 16 x i8> %zm)
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @uzp_x2_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm) nounwind {
+; CHECK-LABEL: uzp_x2_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.uzp.x2.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i16> %zm)
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 8 x half>, <vscale x 8 x half> } @uzp_x2_f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm) nounwind {
+; CHECK-LABEL: uzp_x2_f16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    uzp { z0.h, z1.h }, z0.h, z1.h
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x half>, <vscale x 8 x half> } @llvm.aarch64.sve.uzp.x2.nxv8f16(<vscale x 8 x half> %zn, <vscale x 8 x half> %zm)
+  ret { <vscale x 8 x half>, <vscale x 8 x half> } %res
+}
+
+define { <vscale x 8 x bfloat>, <vscale x 8 x bfloat> } @uzp_x2_bf16(<vscale x 8 x bfloat> %zn, <vscale x 8 x bfloat> %zm) nounwind {
+; CHECK-LABEL: uzp_x2_bf16:
+;
CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.h, z1.h }, z0.h, z1.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) + ret { , } %res +} + +define { , } @uzp_x2_i32( %zn, %zm) nounwind { +; CHECK-LABEL: uzp_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) + ret { , } %res +} + +define { , } @uzp_x2_f32( %zn, %zm) nounwind { +; CHECK-LABEL: uzp_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.s, z1.s }, z0.s, z1.s +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) + ret { , } %res +} + +define { , } @uzp_x2_i64( %zn, %zm) nounwind { +; CHECK-LABEL: uzp_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) + ret { , } %res +} + +define { , } @uzp_x2_f64( %zn, %zm) nounwind { +; CHECK-LABEL: uzp_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.d, z1.d }, z0.d, z1.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) + ret { , } %res +} + + +; == 128-bit elements == + +define { , } @uzpq_x2_i8( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_i16( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_f16( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_bf16( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_i32( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_f32( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_i64( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( %zn, %zm) + ret { , } %res +} + +define { , } @uzpq_x2_f64( %zn, %zm) nounwind { +; CHECK-LABEL: uzpq_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: uzp { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( %zn, %zm) + ret { , } %res +} + + +; == 8 to 64-bit elements == +declare { , } @llvm.aarch64.sve.uzp.x2.nxv16i8(, ) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv8i16( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv4i32( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv2i64( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv8f16( %zn, %zm) +declare { , } 
@llvm.aarch64.sve.uzp.x2.nxv8bf16( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv4f32( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzp.x2.nxv2f64( %zn, %zm) + +; == 128-bit elements == +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv16i8( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv8i16( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv4i32( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv2i64( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv8f16( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv8bf16( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv4f32( %zn, %zm) +declare { , } @llvm.aarch64.sve.uzpq.x2.nxv2f64( %zn, %zm) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-uzpx4.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , , , } @uzp_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.b - z3.b }, { z0.b - z3.b } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.h - z3.h }, { z0.h - z3.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } 
%res +} + +define { , , , } @uzp_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.s - z3.s }, { z0.s - z3.s } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @uzp_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: uzp_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.d - z3.d }, { z0.d - z3.d } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + + +; == 128-bit elements == + +define { , , , } @zipq_x4_i8( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; 
CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_bf16( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i32( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f32( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i64( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f64( %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // 
kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: uzp { z0.q - z3.q }, { z0.q - z3.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + + +; == 8 to 64-bit elements == +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzp.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + +; == 128-bit elements == +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.uzpq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx2.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , } @zip_x2_i8( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.b, z1.b }, z0.b, z1.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_i16( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_f16( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8f16( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_bf16( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.h, z1.h }, z0.h, z1.h +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_i32( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_f32( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.s, z1.s }, z0.s, z1.s +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv4f32( %zn, %zm) + ret { , } %res +} + +define 
{ , } @zip_x2_i64( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_f64( %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.d, z1.d }, z0.d, z1.d +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv2f64( %zn, %zm) + ret { , } %res +} + +define { , } @zip_x2_i8_not_tied( %unused, %zn, %zm) nounwind { +; CHECK-LABEL: zip_x2_i8_not_tied: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.b, z1.b }, z1.b, z2.b +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) + ret { , } %res +} + + +; == 128-bit elements == + +define { , } @zipq_x2_i8( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_i16( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_f16( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_bf16( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_i32( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_f32( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_i64( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_f64( %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z0.q, z1.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( %zn, %zm) + ret { , } %res +} + +define { , } @zipq_x2_i8_not_tied( %unused, %zn, %zm) nounwind { +; CHECK-LABEL: zipq_x2_i8_not_tied: +; CHECK: // %bb.0: +; CHECK-NEXT: zip { z0.q, z1.q }, z1.q, z2.q +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( %zn, %zm) + ret { , } %res +} + + +; == 8 to 64-bit elements == +declare { , } @llvm.aarch64.sve.zip.x2.nxv16i8( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv8i16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv4i32( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv2i64( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv8f16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv8bf16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zip.x2.nxv4f32( %zn, %zm) +declare { , } 
@llvm.aarch64.sve.zip.x2.nxv2f64( %zn, %zm) + +; == 128-bit elements == +declare { , } @llvm.aarch64.sve.zipq.x2.nxv16i8( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv8i16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv4i32( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv2i64( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv8f16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv8bf16( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv4f32( %zn, %zm) +declare { , } @llvm.aarch64.sve.zipq.x2.nxv2f64( %zn, %zm) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx4.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx4.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-zipx4.ll @@ -0,0 +1,236 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; == 8 to 64-bit elements == + +define { , , , } @zip_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.b - z3.b }, { z4.b - z7.b } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.h - z3.h }, { z4.h - z7.h } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_i32( %unusued, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.s - z3.s }, { z4.s - z7.s } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } 
@zip_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zip_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zip_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.d - z3.d }, { z4.d - z7.d } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + + +; == 128-bit elements == + +define { , , , } @zipq_x4_i8( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_bf16( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i32( %unusued, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f32( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_i64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov 
z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + +define { , , , } @zipq_x4_f64( %unused, %zn1, %zn2, %zn3, %zn4) nounwind { +; CHECK-LABEL: zipq_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: zip { z0.q - z3.q }, { z4.q - z7.q } +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + ret { , , , } %res +} + + +; == 8 to 64-bit elements == +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zip.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4) + +; == 128-bit elements == +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv16i8( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv8i16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv4i32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv2i64( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv8f16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv8bf16( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv4f32( %zn1, %zn2, %zn3, %zn4) +declare { , , , } @llvm.aarch64.sve.zipq.x4.nxv2f64( %zn1, %zn2, %zn3, %zn4)
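
Note: the following is an illustrative IR-level sketch, not part of the patch. The function name @zip_then_uzp and the round-trip structure are hypothetical; the sketch only shows how the two-vector results returned by the new intrinsics are consumed with extractvalue and fed into a follow-up call. With the selection code above, each call is expected to lower to a single multi-vector ZIP/UZP instruction.

define { <vscale x 16 x i8>, <vscale x 16 x i8> } @zip_then_uzp(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b) {
  ; Interleave the two inputs into an even/odd pair.
  %z = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8> %a, <vscale x 16 x i8> %b)
  %lo = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %z, 0
  %hi = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %z, 1
  ; De-interleave the pair again, which should recover the original layout.
  %u = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8> %lo, <vscale x 16 x i8> %hi)
  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %u
}

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.zip.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.uzp.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>)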