diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2820,6 +2820,54 @@
                 [llvm_anyvector_ty, LLVMMatchType<0>, LLVMMatchType<0>, LLVMMatchType<0>],
                 [IntrNoMem]>;
 
+  class SME2_ZA_ArrayVector_Read_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [llvm_i32_ty],
+                []>;
+
+  class SME2_ZA_ArrayVector_Read_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [llvm_i32_ty],
+                []>;
+
+  class SME2_Matrix_TileVector_Read_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                [llvm_i32_ty, llvm_i32_ty],
+                []>;
+
+  class SME2_Matrix_TileVector_Read_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [llvm_i32_ty, llvm_i32_ty],
+                []>;
+
+  class SME2_ZA_ArrayVector_Write_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>],
+                []>;
+
+  class SME2_ZA_ArrayVector_Write_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                []>;
+
+  class SME2_Matrix_TileVector_Write_VG2_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty, llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>],
+                [ImmArg<ArgIndex<0>>]>;
+
+  class SME2_Matrix_TileVector_Write_VG4_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+                [llvm_i32_ty, llvm_i32_ty,
+                 llvm_anyvector_ty, LLVMMatchType<0>,
+                 LLVMMatchType<0>, LLVMMatchType<0>],
+                [ImmArg<ArgIndex<0>>]>;
+
   class SME2_VG2_Multi_Single_Single_Intrinsic
     : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
                 [LLVMMatchType<0>, LLVMMatchType<0>,
@@ -2962,4 +3010,26 @@
     def int_aarch64_sme_ # intr # _ # za # _vg1x4 : SME2_ZA_Write_VG4_Intrinsic;
   }
 }
+
+  //
+  // Move multi-vectors to/from ZA
+  //
+
+  def int_aarch64_sme_read_hor_vg2 : SME2_Matrix_TileVector_Read_VG2_Intrinsic;
+  def int_aarch64_sme_read_hor_vg4 : SME2_Matrix_TileVector_Read_VG4_Intrinsic;
+
+  def int_aarch64_sme_read_ver_vg2 : SME2_Matrix_TileVector_Read_VG2_Intrinsic;
+  def int_aarch64_sme_read_ver_vg4 : SME2_Matrix_TileVector_Read_VG4_Intrinsic;
+
+  def int_aarch64_sme_read_vg1x2 : SME2_ZA_ArrayVector_Read_VG2_Intrinsic;
+  def int_aarch64_sme_read_vg1x4 : SME2_ZA_ArrayVector_Read_VG4_Intrinsic;
+
+  def int_aarch64_sme_write_hor_vg2 : SME2_Matrix_TileVector_Write_VG2_Intrinsic;
+  def int_aarch64_sme_write_hor_vg4 : SME2_Matrix_TileVector_Write_VG4_Intrinsic;
+
+  def int_aarch64_sme_write_ver_vg2 : SME2_Matrix_TileVector_Write_VG2_Intrinsic;
+  def int_aarch64_sme_write_ver_vg4 : SME2_Matrix_TileVector_Write_VG4_Intrinsic;
+
+  def int_aarch64_sme_write_vg1x2 : SME2_ZA_ArrayVector_Write_VG2_Intrinsic;
+  def int_aarch64_sme_write_vg1x4 : SME2_ZA_ArrayVector_Write_VG4_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -364,6 +364,10 @@
   void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
   void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
 
+  template <unsigned MaxIdx, unsigned Scale>
+  void SelectMultiVectorMove(SDNode *N, unsigned NumVecs, unsigned BaseReg,
+                             unsigned Op);
+
   bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
   /// SVE Reg+Imm addressing mode.
 template <int64_t Min, int64_t Max>
@@ -1847,6 +1851,68 @@
   return;
 }
 
+bool SelectSMETile(unsigned &BaseReg, unsigned TileNum) {
+  switch (BaseReg) {
+  default:
+    return false;
+  case AArch64::ZA:
+  case AArch64::ZAB0:
+    if (TileNum == 0)
+      break;
+    return false;
+  case AArch64::ZAH0:
+    if (TileNum <= 1)
+      break;
+    return false;
+  case AArch64::ZAS0:
+    if (TileNum <= 3)
+      break;
+    return false;
+  case AArch64::ZAD0:
+    if (TileNum <= 7)
+      break;
+    return false;
+  }
+
+  BaseReg += TileNum;
+  return true;
+}
+
+template <unsigned MaxIdx, unsigned Scale>
+void AArch64DAGToDAGISel::SelectMultiVectorMove(SDNode *N, unsigned NumVecs,
+                                                unsigned BaseReg, unsigned Op) {
+  unsigned TileNum = 0;
+  if (BaseReg != AArch64::ZA)
+    TileNum = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+
+  if (!SelectSMETile(BaseReg, TileNum))
+    return;
+
+  SDValue SliceBase, Base, Offset;
+  if (BaseReg == AArch64::ZA)
+    SliceBase = N->getOperand(2);
+  else
+    SliceBase = N->getOperand(3);
+
+  if (!SelectSMETileSlice(SliceBase, MaxIdx, Base, Offset, Scale))
+    return;
+
+  SDLoc DL(N);
+  SDValue SubReg = CurDAG->getRegister(BaseReg, MVT::Other);
+  SDValue Ops[] = {SubReg, Base, Offset, /*Chain*/ N->getOperand(0)};
+  SDNode *Mov = CurDAG->getMachineNode(Op, DL, {MVT::Untyped, MVT::Other}, Ops);
+
+  EVT VT = N->getValueType(0);
+  for (unsigned I = 0; I < NumVecs; ++I)
+    ReplaceUses(SDValue(N, I),
+                CurDAG->getTargetExtractSubreg(AArch64::zsub0 + I, DL, VT,
+                                               SDValue(Mov, 0)));
+  // Copy chain
+  unsigned ChainIdx = NumVecs;
+  ReplaceUses(SDValue(N, ChainIdx), SDValue(Mov, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
                                       unsigned Opc) {
   SDLoc dl(N);
@@ -4680,6 +4746,100 @@
       }
       break;
     }
+    case Intrinsic::aarch64_sme_read_hor_vg2: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMove<14, 2>(Node, 2, AArch64::ZAB0,
+                                     AArch64::MOVA_2ZMXI_H_B);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMove<6, 2>(Node, 2, AArch64::ZAH0,
+                                    AArch64::MOVA_2ZMXI_H_H);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMove<2, 2>(Node, 2, AArch64::ZAS0,
+                                    AArch64::MOVA_2ZMXI_H_S);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMove<0, 2>(Node, 2, AArch64::ZAD0,
+                                    AArch64::MOVA_2ZMXI_H_D);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_read_ver_vg2: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMove<14, 2>(Node, 2, AArch64::ZAB0,
+                                     AArch64::MOVA_2ZMXI_V_B);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMove<6, 2>(Node, 2, AArch64::ZAH0,
+                                    AArch64::MOVA_2ZMXI_V_H);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMove<2, 2>(Node, 2, AArch64::ZAS0,
+                                    AArch64::MOVA_2ZMXI_V_S);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMove<0, 2>(Node, 2, AArch64::ZAD0,
+                                    AArch64::MOVA_2ZMXI_V_D);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_read_hor_vg4: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMove<12, 4>(Node, 4, AArch64::ZAB0,
+                                     AArch64::MOVA_4ZMXI_H_B);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMove<4, 4>(Node, 4, AArch64::ZAH0,
+                                    AArch64::MOVA_4ZMXI_H_H);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMove<0, 2>(Node, 4, AArch64::ZAS0,
+                                    AArch64::MOVA_4ZMXI_H_S);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMove<0, 2>(Node, 4, AArch64::ZAD0,
+                                    AArch64::MOVA_4ZMXI_H_D);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_read_ver_vg4: {
+      if (VT == MVT::nxv16i8) {
+        SelectMultiVectorMove<12, 4>(Node, 4, AArch64::ZAB0,
+                                     AArch64::MOVA_4ZMXI_V_B);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectMultiVectorMove<4, 4>(Node, 4, AArch64::ZAH0,
+                                    AArch64::MOVA_4ZMXI_V_H);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectMultiVectorMove<0, 4>(Node, 4, AArch64::ZAS0,
+                                    AArch64::MOVA_4ZMXI_V_S);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectMultiVectorMove<0, 4>(Node, 4, AArch64::ZAD0,
+                                    AArch64::MOVA_4ZMXI_V_D);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sme_read_vg1x2: {
+      SelectMultiVectorMove<7, 1>(Node, 2, AArch64::ZA,
+                                  AArch64::MOVA_VG2_2ZMXI);
+      return;
+    }
+    case Intrinsic::aarch64_sme_read_vg1x4: {
+      SelectMultiVectorMove<7, 1>(Node, 4, AArch64::ZA,
+                                  AArch64::MOVA_VG4_4ZMXI);
+      return;
+    }
     case Intrinsic::swift_async_context_addr: {
       SDLoc DL(Node);
       SDValue Chain = Node->getOperand(0);
diff --git a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SMEInstrInfo.td
@@ -609,13 +609,13 @@
 defm FRINTP_2Z2Z: sme2_frint_vector_vg2_multi<"frintp", 0b10010>;
 defm FRINTP_4Z4Z: sme2_frint_vector_vg4_multi<"frintp", 0b1001000>;
 
-defm MOVA_MXI2Z : sme2_mova_vec_to_tile_vg2_multi<"mova">;
-defm MOVA_MXI4Z : sme2_mova_vec_to_tile_vg4_multi<"mova">;
+defm MOVA_MXI2Z : sme2_mova_vec_to_tile_vg2_multi<"mova", int_aarch64_sme_write_hor_vg2, int_aarch64_sme_write_ver_vg2>;
+defm MOVA_MXI4Z : sme2_mova_vec_to_tile_vg4_multi<"mova", int_aarch64_sme_write_hor_vg4, int_aarch64_sme_write_ver_vg4>;
 defm MOVA_2ZMXI : sme2_mova_tile_to_vec_vg2_multi<"mova">;
 defm MOVA_4ZMXI : sme2_mova_tile_to_vec_vg4_multi<"mova">;
-defm MOVA_VG2_MXI2Z : sme2_mova_vec_to_array_vg2_multi<"mova">;
-defm MOVA_VG4_MXI4Z : sme2_mova_vec_to_array_vg4_multi<"mova">;
+defm MOVA_VG2_MXI2Z : sme2_mova_vec_to_array_vg2_multi<"mova", int_aarch64_sme_write_vg1x2>;
+defm MOVA_VG4_MXI4Z : sme2_mova_vec_to_array_vg4_multi<"mova", int_aarch64_sme_write_vg1x4>;
 defm MOVA_VG2_2ZMXI : sme2_mova_array_to_vec_vg2_multi<0b000, "mova">;
 defm MOVA_VG4_4ZMXI : sme2_mova_array_to_vec_vg4_multi<0b1000, "mova">;
 
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -24,6 +24,12 @@
 def tileslicerange3s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<14, 2>", []>;
 def tileslicerange2s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<6,  2>", []>;
+def tileslicerange1s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<2,  2>", []>;
+def tileslicerange0s2 : ComplexPattern<i32, 2, "SelectSMETileSlice<0,  2>", []>;
+
+def tileslicerange2s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<12, 4>", []>;
+def tileslicerange1s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<4,  4>", []>;
+def tileslicerange0s4 : ComplexPattern<i32, 2, "SelectSMETileSlice<0,  4>", []>;
 
 def am_sme_indexed_b4 :ComplexPattern<iPTR, 2, "SelectAddrModeIndexedSVE<0, 15>", [], [SDNPWantRoot]>;
 
@@ -84,6 +90,13 @@
   let usesCustomInserter = 1;
 }
 
+class sme2_move_to_tile_pseudo<string name, Operand tile_imm, Operand imm_ty,
+                               RegisterOperand multi_vector_ty,
+                               SMEMatrixTypeEnum za_flag>
+    : SMEPseudo2Instr<name, 0>,
+      Pseudo<(outs), (ins tile_imm:$tile, MatrixIndexGPR32Op12_15:$Rs, imm_ty:$imm, multi_vector_ty:$Zn), []> {
+  let SMEMatrixType = za_flag;
+  let usesCustomInserter = 1;
+}
+
 //===----------------------------------------------------------------------===//
 // SME pattern match helpers.
//===----------------------------------------------------------------------===// @@ -152,6 +165,14 @@ : Pat<(intrinsic (i32 (tileslice MatrixIndexGPR32Op8_11:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), (!cast(name # _PSEUDO) $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>; +class SME2_Tile_VG2_Multi_Pat + : Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2), + (!cast(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR2Mul2, vt:$Zn1, zsub0, vt:$Zn2, zsub1))>; + +class SME2_Tile_VG4_Multi_Pat + : Pat<(intrinsic tile_imm:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$base, index_ty:$offset)), vt:$Zn1, vt:$Zn2, vt:$Zn3, vt:$Zn4), + (!cast(name # _PSEUDO) $tile, $base, $offset, (REG_SEQUENCE ZPR4Mul4, vt:$Zn1, zsub0, vt:$Zn2, zsub1, vt:$Zn3, zsub2, vt:$Zn4, zsub3))>; + //===----------------------------------------------------------------------===// // SME Outer Products //===----------------------------------------------------------------------===// @@ -811,15 +832,10 @@ Operand offset_ty, SDPatternOperator op, ComplexPattern tileslice> { - def : Pat<(op imm_ty:$tile, MatrixIndexGPR32Op12_15:$idx, + def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, + offset_ty:$imm)), (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), - (inst imm_ty:$tile, $idx, 0, $pg, $zn)>; - let AddedComplexity = 1 in { - def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, - offset_ty:$imm)), - (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)), - (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>; - } + (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>; } class sme_mova_insert_pseudo @@ -3052,13 +3068,13 @@ } // SME2 move vector to tile, two registers -multiclass sme2_mova_vec_to_tile_vg2_multi_base { +multiclass sme2_mova_vec_to_tile_vg2_multi_base { def _B : sme2_mova_vec_to_tile_vg2_multi_base<0b00, v, !if(v, TileVectorOpV8, TileVectorOpH8), uimm3s2range, ZZ_b_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<3> imm; let Inst{2-0} = imm; } @@ -3067,7 +3083,7 @@ !if(v, TileVectorOpV16, TileVectorOpH16), uimm2s2range, ZZ_h_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<1> ZAd; bits<2> imm; let Inst{2} = ZAd; @@ -3078,7 +3094,7 @@ !if(v, TileVectorOpV32, TileVectorOpH32), uimm1s2range, ZZ_s_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<2> ZAd; bits<1> imm; let Inst{2-1} = ZAd; @@ -3089,11 +3105,25 @@ !if(v, TileVectorOpV64, TileVectorOpH64), uimm0s2range, ZZ_d_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<3> ZAd; let Inst{2-0} = ZAd; } + def NAME # _B_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _H_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _S_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _D_PSEUDO : sme2_move_to_tile_pseudo; + + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + def : SME2_Tile_VG2_Multi_Pat; + defm : sme2_mova_vec_to_tile_or_array_aliases<1, !cast(NAME # _B), !if(v, TileVectorOpV8, TileVectorOpH8), @@ -3170,9 +3200,10 @@ "mova">; } -multiclass sme2_mova_vec_to_tile_vg2_multi{ - defm _H : sme2_mova_vec_to_tile_vg2_multi_base<0b0, mnemonic>; - defm _V : sme2_mova_vec_to_tile_vg2_multi_base<0b1, mnemonic>; +multiclass sme2_mova_vec_to_tile_vg2_multi{ + defm _H : sme2_mova_vec_to_tile_vg2_multi_base<0b0, mnemonic, 
int_h>; + defm _V : sme2_mova_vec_to_tile_vg2_multi_base<0b1, mnemonic, int_v>; } class sme2_mova_vec_to_tile_vg4_multi_base sz, bit v, bits<3> op, @@ -3201,13 +3232,13 @@ } // SME2 move vector to tile, four registers -multiclass sme2_mova_vec_to_tile_vg4_multi_base { +multiclass sme2_mova_vec_to_tile_vg4_multi_base { def _B : sme2_mova_vec_to_tile_vg4_multi_base<0b00, v, {0,?,?}, !if(v, TileVectorOpV8, TileVectorOpH8), uimm2s4range, ZZZZ_b_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<2> imm; let Inst{1-0} = imm; } @@ -3216,7 +3247,7 @@ !if(v, TileVectorOpV16, TileVectorOpH16), uimm1s4range, ZZZZ_h_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<1> ZAd; bits<1> imm; let Inst{1} = ZAd; @@ -3227,7 +3258,7 @@ !if(v, TileVectorOpV32, TileVectorOpH32), uimm0s4range, ZZZZ_s_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<2> ZAd; let Inst{1-0} = ZAd; } @@ -3236,11 +3267,25 @@ !if(v, TileVectorOpV64, TileVectorOpH64), uimm0s4range, ZZZZ_d_mul_r, - mnemonic> { + mnemonic>, SMEPseudo2Instr { bits<3> ZAd; let Inst{2-0} = ZAd; } + def NAME # _B_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _H_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _S_PSEUDO : sme2_move_to_tile_pseudo; + def NAME # _D_PSEUDO : sme2_move_to_tile_pseudo; + + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + def : SME2_Tile_VG4_Multi_Pat; + defm : sme2_mova_vec_to_tile_or_array_aliases<1, !cast(NAME # _B), !if(v, TileVectorOpV8, TileVectorOpH8), @@ -3293,9 +3338,10 @@ } -multiclass sme2_mova_vec_to_tile_vg4_multi{ - defm _H : sme2_mova_vec_to_tile_vg4_multi_base<0b0, mnemonic>; - defm _V : sme2_mova_vec_to_tile_vg4_multi_base<0b1, mnemonic>; +multiclass sme2_mova_vec_to_tile_vg4_multi{ + defm _H : sme2_mova_vec_to_tile_vg4_multi_base<0b0, mnemonic, int_h>; + defm _V : sme2_mova_vec_to_tile_vg4_multi_base<0b1, mnemonic, int_v>; } // SME Move into Array @@ -3321,13 +3367,18 @@ } // MOVA (vector to array, two registers) -multiclass sme2_mova_vec_to_array_vg2_multi { +multiclass sme2_mova_vec_to_array_vg2_multi { def NAME : sme2_mova_vec_to_array_vg24_multi<{0,?,?,?,?}, MatrixOp64, - ZZ_d_mul_r, mnemonic, "vgx2">{ + ZZ_d_mul_r, mnemonic, "vgx2">, SMEPseudo2Instr { bits<4> Zn; let Inst{9-6} = Zn; } + def NAME # _PSEUDO : sme2_move_to_za_pseudo; + + def : SME2_ZA_VG1x2_Multi_Pat; + def : SME2_ZA_VG1x2_Multi_Pat; + defm : sme2_mova_vec_to_tile_or_array_aliases<0, !cast(NAME), MatrixOp8, MatrixIndexGPR32Op8_11, @@ -3409,13 +3460,18 @@ } // MOVA (vector to array, four registers) -multiclass sme2_mova_vec_to_array_vg4_multi { +multiclass sme2_mova_vec_to_array_vg4_multi { def NAME : sme2_mova_vec_to_array_vg24_multi<{1,?,?,?,0}, MatrixOp64, - ZZZZ_d_mul_r, mnemonic, "vgx4"> { + ZZZZ_d_mul_r, mnemonic, "vgx4">, SMEPseudo2Instr { bits<3> Zn; let Inst{9-7} = Zn; } + def NAME # _PSEUDO : sme2_move_to_za_pseudo; + + def : SME2_ZA_VG1x4_Multi_Pat; + def : SME2_ZA_VG1x4_Multi_Pat; + defm : sme2_mova_vec_to_tile_or_array_aliases<0, !cast(NAME), MatrixOp8, MatrixIndexGPR32Op8_11, diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-extract-mova.ll @@ -0,0 +1,496 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu 
-mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; +; Move Multi-Vector From Tile (Read) x2 +; + +; Horizontal + +define { , } @za_read_horiz_vg2_b(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 0:1] +; CHECK-NEXT: mov { z0.b, z1.b }, za0h.b[w12, 14:15] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice) + %slice.14 = add i32 %slice, 14 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32 0, i32 %slice.14) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_h(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_f16(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_bf16(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0h.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1h.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_s(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1] +; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 0, i32 %slice) + %slice.2 = add i32 %slice, 2 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32 3, i32 %slice.2) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_f32(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s, z1.s }, za0h.s[w12, 0:1] +; CHECK-NEXT: mov { z0.s, z1.s }, za3h.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 0, i32 %slice) + %slice.2 = add i32 %slice, 2 + %res2 = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32 3, i32 %slice.2) + ret { , } %res2 +} + +define { , } @za_read_horiz_vg2_d(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32 0, i32 %slice) + ret { , } %res +} + +define { , } @za_read_horiz_vg2_f64(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d, z1.d }, za0h.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32 0, i32 %slice) + ret { 
, } %res +} + +; Vertical + +define { , } @za_read_vert_vg2_b(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 0:1] +; CHECK-NEXT: mov { z0.b, z1.b }, za0v.b[w12, 14:15] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice) + %slice.14 = add i32 %slice, 14 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32 0, i32 %slice.14) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_h(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_f16(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_bf16(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h, z1.h }, za0v.h[w12, 0:1] +; CHECK-NEXT: mov { z0.h, z1.h }, za1v.h[w12, 6:7] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 0, i32 %slice) + %slice.6 = add i32 %slice, 6 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32 1, i32 %slice.6) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_s(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1] +; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 0, i32 %slice) + %slice.2 = add i32 %slice, 2 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32 3, i32 %slice.2) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_f32(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s, z1.s }, za0v.s[w12, 0:1] +; CHECK-NEXT: mov { z0.s, z1.s }, za3v.s[w12, 2:3] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 0, i32 %slice) + %slice.2 = add i32 %slice, 2 + %res2 = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32 3, i32 %slice.2) + ret { , } %res2 +} + +define { , } @za_read_vert_vg2_d(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32 0, i32 %slice) + ret { , } %res +} + +define { , } @za_read_vert_vg2_f64(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d, z1.d }, za0v.d[w12, 0:1] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32 0, i32 %slice) + ret { , } %res +} + +; +; Move Multi-Vector From Tile (Read) x4 +; + +; Horizontal + +define { , , , } 
@za_read_horiz_vg4_b(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 0:3] +; CHECK-NEXT: mov { z0.b - z3.b }, za0h.b[w12, 12:15] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice) + %slice.12 = add i32 %slice, 12 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32 0, i32 %slice.12) + ret { , , , } %res2 +} + +define { , , , } @za_read_horiz_vg4_h(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_horiz_vg4_f16(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_horiz_vg4_bf16(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0h.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1h.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_horiz_vg4_s(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_horiz_vg4_f32(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s - z3.s }, za0h.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_horiz_vg4_d(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_horiz_vg4_f64(i32 %slice) { +; CHECK-LABEL: za_read_horiz_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za0h.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32 0, i32 %slice) + ret { , , , } %res +} + +; Vertical + +define { , , , } @za_read_vert_vg4_b(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 0:3] +; CHECK-NEXT: mov { z0.b - z3.b }, za0v.b[w12, 12:15] +; CHECK-NEXT: ret + %res = call { , , , } 
@llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice) + %slice.12 = add i32 %slice, 12 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32 0, i32 %slice.12) + ret { , , , } %res2 +} + +define { , , , } @za_read_vert_vg4_h(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_vert_vg4_f16(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_vert_vg4_bf16(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.h - z3.h }, za0v.h[w12, 0:3] +; CHECK-NEXT: mov { z0.h - z3.h }, za1v.h[w12, 4:7] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 0, i32 %slice) + %slice.4 = add i32 %slice, 4 + %res2 = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32 1, i32 %slice.4) + ret { , , , } %res2 +} + +define { , , , } @za_read_vert_vg4_s(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_vert_vg4_f32(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.s - z3.s }, za0v.s[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_vert_vg4_d(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32 0, i32 %slice) + ret { , , , } %res +} + +define { , , , } @za_read_vert_vg4_f64(i32 %slice) { +; CHECK-LABEL: za_read_vert_vg4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za0v.d[w12, 0:3] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32 0, i32 %slice) + ret { , , , } %res +} + +; Move Multi-Vector From ZA (Read) x2 + +define { , } @za_read_vg1x2_d(i32 %slice) { +; CHECK-LABEL: za_read_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice) + %slice.7 = add i32 %slice, 7 + %res2 = call { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice.7) + ret { , } %res2 +} + +define { , } @za_read_vg1x2_f64(i32 %slice) { +; CHECK-LABEL: za_read_vg1x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; 
CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 0, vgx2] +; CHECK-NEXT: mov { z0.d, z1.d }, za.d[w8, 7, vgx2] +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice) + %slice.7 = add i32 %slice, 7 + %res2 = call { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32 %slice.7) + ret { , } %res2 +} + +; Move Multi-Vector From ZA (Read) x4 + +define { , , , } @za_read_vg1x4_d(i32 %slice) { +; CHECK-LABEL: za_read_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice) + %slice.7 = add i32 %slice, 7 + %res2 = call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2i64(i32 %slice.7) + ret { , , , } %res2 +} + +define { , , , } @za_read_vg1x4_f64(i32 %slice) { +; CHECK-LABEL: za_read_vg1x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 0, vgx4] +; CHECK-NEXT: mov { z0.d - z3.d }, za.d[w8, 7, vgx4] +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice) + %slice.7 = add i32 %slice, 7 + %res2 = call { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32 %slice.7) + ret { , , , } %res2 +} + +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv16i8(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv8i16(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv8f16(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv8bf16(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv4i32(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv4f32(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv2i64(i32, i32) +declare { , } @llvm.aarch64.sme.read.hor.vg2.nxv2f64(i32, i32) + +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv16i8(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8i16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8f16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv8bf16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4i32(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv4f32(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2i64(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.hor.vg4.nxv2f64(i32, i32) + +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv16i8(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv8i16(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv8f16(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv8bf16(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv4i32(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv4f32(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv2i64(i32, i32) +declare { , } @llvm.aarch64.sme.read.ver.vg2.nxv2f64(i32, i32) + +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv16i8(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8i16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8f16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv8bf16(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4i32(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv4f32(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2i64(i32, i32) +declare { , , , } @llvm.aarch64.sme.read.ver.vg4.nxv2f64(i32, i32) + +declare { , } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32) +declare { , } @llvm.aarch64.sme.read.vg1x2.nxv2f64(i32) + +declare { , , , } 
@llvm.aarch64.sme.read.vg1x4.nxv2i64(i32) +declare { , , , } @llvm.aarch64.sme.read.vg1x4.nxv2f64(i32) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-insert-mova.ll @@ -0,0 +1,608 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s + +; +; Move Multi-Vector To Tile (Write) x 2 +; + +; Horizontal + +define void @za_write_vg2_horiz_b(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.b[w12, 0:1], { z0.b, z1.b } +; CHECK-NEXT: mov za0h.b[w12, 14:15], { z0.b, z1.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice, %zn1, %zn2) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32 0, i32 %slice.14, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_h(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32 1, i32 %slice.6, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_f16(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32 1, i32 %slice.6, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_bf16(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1h.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32 1, i32 %slice.6, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_s(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s } +; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 0, i32 
%slice, %zn1, %zn2) + %slice.2 = add i32 %slice, 2 + call void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32 3, i32 %slice.2, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_f32(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.s[w12, 0:1], { z0.s, z1.s } +; CHECK-NEXT: mov za3h.s[w12, 2:3], { z0.s, z1.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 0, i32 %slice, %zn1, %zn2) + %slice.2 = add i32 %slice, 2 + call void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32 3, i32 %slice.2, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_d(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32 0, i32 %slice, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_horiz_f64(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_horiz_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0h.d[w12, 0:1], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32 0, i32 %slice, %zn1, %zn2) + ret void +} + +; Vertical + +define void @za_write_vg2_vert_b(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.b[w12, 0:1], { z0.b, z1.b } +; CHECK-NEXT: mov za0v.b[w12, 14:15], { z0.b, z1.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice, %zn1, %zn2) + %slice.14 = add i32 %slice, 14 + call void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32 0, i32 %slice.14, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_h(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32 1, i32 %slice.6, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_f16(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32 1, i32 %slice.6, %zn1, %zn2) + ret 
void +} + +define void @za_write_vg2_vert_bf16(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.h[w12, 0:1], { z0.h, z1.h } +; CHECK-NEXT: mov za1v.h[w12, 6:7], { z0.h, z1.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 0, i32 %slice, %zn1, %zn2) + %slice.6 = add i32 %slice, 6 + call void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32 1, i32 %slice.6, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_s(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s } +; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 0, i32 %slice, %zn1, %zn2) + %slice.2 = add i32 %slice, 2 + call void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32 3, i32 %slice.2, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_f32(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.s[w12, 0:1], { z0.s, z1.s } +; CHECK-NEXT: mov za3v.s[w12, 2:3], { z0.s, z1.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 0, i32 %slice, %zn1, %zn2) + %slice.2 = add i32 %slice, 2 + call void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32 3, i32 %slice.2, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_d(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32 0, i32 %slice, %zn1, %zn2) + ret void +} + +define void @za_write_vg2_vert_f64(i32 %slice, %zn1, %zn2) { +; CHECK-LABEL: za_write_vg2_vert_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za0v.d[w12, 0:1], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32 0, i32 %slice, %zn1, %zn2) + ret void +} + +; +; Move Multi-Vector To Tile (Write) x 4 +; + +; Horizontal + +define void @za_write_vg4_horiz_b(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_b: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.b[w12, 0:3], { z0.b - z3.b } +; CHECK-NEXT: mov za0h.b[w12, 12:15], { z0.b - z3.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + 
%slice.12 = add i32 %slice, 12 + call void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32 0, i32 %slice.12, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_h(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_h: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_f16(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_bf16(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1h.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_s(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_f32(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def 
$z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.s[w12, 0:3], { z0.s - z3.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_d(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_horiz_f64(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_horiz_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0h.d[w12, 0:3], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +; Vertical + +define void @za_write_vg4_vert_b(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_b: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.b[w12, 0:3], { z0.b - z3.b } +; CHECK-NEXT: mov za0v.b[w12, 12:15], { z0.b - z3.b } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.12 = add i32 %slice, 12 + call void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32 0, i32 %slice.12, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_h(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_h: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_f16(i32 %slice, %zn1, %zn2, %zn3, 
%zn4) { +; CHECK-LABEL: za_write_vg4_vert_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_bf16(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.h[w12, 0:3], { z0.h - z3.h } +; CHECK-NEXT: mov za1v.h[w12, 4:7], { z0.h - z3.h } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + %slice.4 = add i32 %slice, 4 + call void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32 1, i32 %slice.4, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_s(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_s: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_f32(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.s[w12, 0:3], { z0.s - z3.s } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_d(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32 0, 
i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +define void @za_write_vg4_vert_f64(i32 %slice, %zn1, %zn2, %zn3, %zn4) { +; CHECK-LABEL: za_write_vg4_vert_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za0v.d[w12, 0:3], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32 0, i32 %slice, %zn1, %zn2, %zn3, %zn4) + ret void +} + +; +; Move Multi-Vector To ZA (Write) x2 +; + +define void @za_write_vg1x2_d(i32 %slice, %za1, %za2) { +; CHECK-LABEL: za_write_vg1x2_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } +; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, %za1, %za2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice.7, %za1, %za2) + ret void +} + +define void @za_write_vg1x2_f64(i32 %slice, %za1, %za2) { +; CHECK-LABEL: za_write_vg1x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1 +; CHECK-NEXT: mov za.d[w8, 0, vgx2], { z0.d, z1.d } +; CHECK-NEXT: mov za.d[w8, 7, vgx2], { z0.d, z1.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice, %za1, %za2) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32 %slice.7, %za1, %za2) + ret void +} + +; +; Move Multi-Vector To ZA (Write) x4 +; + +define void @za_write_vg1x4_d(i32 %slice, %za1, %za2, %za3, %za4) { +; CHECK-LABEL: za_write_vg1x4_d: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } +; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice, %za1, %za2, %za3, %za4) + %slice.7 = add i32 %slice, 7 + call void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32 %slice.7, %za1, %za2, %za3, %za4) + ret void +} + +define void @za_write_vg1x4_f64(i32 %slice, %za1, %za2, %za3, %za4) { +; CHECK-LABEL: za_write_vg1x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $z3 killed $z3 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov w8, w0 +; CHECK-NEXT: // kill: def $z2 killed $z2 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1_z2_z3 def $z0_z1_z2_z3 +; CHECK-NEXT: mov za.d[w8, 0, vgx4], { z0.d - z3.d } +; CHECK-NEXT: mov za.d[w8, 7, vgx4], { z0.d - z3.d } +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice, %za1, %za2, %za3, %za4) + %slice.7 = add i32 %slice, 7 + call void 
@llvm.aarch64.sme.write.vg1x4.nxv2f64(i32 %slice.7, %za1, %za2, %za3, %za4) + ret void +} + +declare void @llvm.aarch64.sme.write.hor.vg2.nxv16i8(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv8i16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv8f16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv8bf16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv4i32(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv4f32(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv2i64(i32, i32, , ) +declare void @llvm.aarch64.sme.write.hor.vg2.nxv2f64(i32, i32, , ) + +declare void @llvm.aarch64.sme.write.ver.vg2.nxv16i8(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv8i16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv8f16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv8bf16(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv4i32(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv4f32(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv2i64(i32, i32, , ) +declare void @llvm.aarch64.sme.write.ver.vg2.nxv2f64(i32, i32, , ) + +declare void @llvm.aarch64.sme.write.hor.vg4.nxv16i8(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv8i16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv8f16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv8bf16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv4i32(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv4f32(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv2i64(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.hor.vg4.nxv2f64(i32, i32, , , , ) + +declare void @llvm.aarch64.sme.write.ver.vg4.nxv16i8(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv8i16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv8f16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv8bf16(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv4i32(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv4f32(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv2i64(i32, i32, , , , ) +declare void @llvm.aarch64.sme.write.ver.vg4.nxv2f64(i32, i32, , , , ) + +declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, , ) +declare void @llvm.aarch64.sme.write.vg1x2.nxv2f64(i32, , ) + +declare void @llvm.aarch64.sme.write.vg1x4.nxv2i64(i32, , , , ) +declare void @llvm.aarch64.sme.write.vg1x4.nxv2f64(i32, , , , )
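
For reference, the new write and read intrinsics compose directly. A minimal sketch follows (not part of the patch; it assumes the same llc -mattr=+sme2 invocation as the tests above, and the function name is only illustrative): a pair of vectors is written into a za.d vector group and the same group is read straight back, which should select the vgx2 mov-to-ZA and mov-from-ZA forms added in this patch.

declare void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32, <vscale x 2 x i64>, <vscale x 2 x i64>)
declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32)

define { <vscale x 2 x i64>, <vscale x 2 x i64> } @za_roundtrip_vg1x2_d(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2) {
  ; Write the pair { %zn1, %zn2 } to the ZA array at vector group %slice.
  call void @llvm.aarch64.sme.write.vg1x2.nxv2i64(i32 %slice, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2)
  ; Read the same vector group back into a two-element result.
  %res = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sme.read.vg1x2.nxv2i64(i32 %slice)
  ret { <vscale x 2 x i64>, <vscale x 2 x i64> } %res
}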