diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2631,4 +2631,22 @@
                 [], [llvm_i32_ty, llvm_ptr_ty]>;
   def int_aarch64_sme_str : DefaultAttrsIntrinsic<
                 [], [llvm_i32_ty, llvm_ptr_ty]>;
+
+  class SME_TileToVector_Intrinsic
+      : DefaultAttrsIntrinsic<[llvm_anyvector_ty],
+          [LLVMMatchType<0>, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>, llvm_i64_ty, llvm_i32_ty]>;
+  class SME_VectorToTile_Intrinsic
+      : DefaultAttrsIntrinsic<[],
+          [llvm_i64_ty, llvm_i32_ty, LLVMScalarOrSameVectorWidth<0, llvm_i1_ty>,
+           llvm_anyvector_ty]>;
+
+  def int_aarch64_sme_read_horiz : SME_TileToVector_Intrinsic;
+  def int_aarch64_sme_read_vert : SME_TileToVector_Intrinsic;
+  def int_aarch64_sme_write_horiz : SME_VectorToTile_Intrinsic;
+  def int_aarch64_sme_write_vert : SME_VectorToTile_Intrinsic;
+
+  def int_aarch64_sme_readq_horiz : SME_TileToVector_Intrinsic;
+  def int_aarch64_sme_readq_vert : SME_TileToVector_Intrinsic;
+  def int_aarch64_sme_writeq_horiz : SME_VectorToTile_Intrinsic;
+  def int_aarch64_sme_writeq_vert : SME_VectorToTile_Intrinsic;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -561,6 +561,10 @@
                                  MachineBasicBlock *BB) const;
   MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
+                                            MachineInstr &MI,
+                                            MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2357,6 +2357,24 @@
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitInsertVectorToTile(unsigned Opc, unsigned BaseReg,
+                                              MachineInstr &MI,
+                                              MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm());
+  MIB.add(MI.getOperand(1)); // Slice index register
+  MIB.add(MI.getOperand(2)); // Slice index offset
+  MIB.add(MI.getOperand(3)); // pg
+  MIB.add(MI.getOperand(4)); // zn
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
@@ -2409,6 +2427,36 @@
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
   case AArch64::LDR_ZA_PSEUDO:
     return EmitFill(MI, BB);
+  case AArch64::INSERT_MXIPZ_H_PSEUDO_B:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_B, AArch64::ZAB0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_H_PSEUDO_H:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_H, AArch64::ZAH0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_H_PSEUDO_S:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_S, AArch64::ZAS0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_H_PSEUDO_D:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_D, AArch64::ZAD0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_H_PSEUDO_Q:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_H_Q, AArch64::ZAQ0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_V_PSEUDO_B:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_B, AArch64::ZAB0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_V_PSEUDO_H:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_H, AArch64::ZAH0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_V_PSEUDO_S:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_S, AArch64::ZAS0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_V_PSEUDO_D:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_D, AArch64::ZAD0, MI,
+                                  BB);
+  case AArch64::INSERT_MXIPZ_V_PSEUDO_Q:
+    return EmitInsertVectorToTile(AArch64::INSERT_MXIPZ_V_Q, AArch64::ZAQ0, MI,
+                                  BB);
   }
 }
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -622,6 +622,30 @@
                   (inst tile_ty:$ZAd, MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm,
                         PPR3bAny:$Pg, zpr_ty:$Zn), 1>;
 }
 
+multiclass sme_vector_to_tile_patterns<Instruction inst, ValueType zpr_vt,
+                                       ValueType ppr_vt, Operand imm_ty,
+                                       Operand offset_ty,
+                                       SDPatternOperator op,
+                                       ComplexPattern tileslice> {
+  def : Pat<(op imm_ty:$tile, MatrixIndexGPR32Op12_15:$idx,
+                (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)),
+            (inst imm_ty:$tile, $idx, 0, $pg, $zn)>;
+  let AddedComplexity = 1 in {
+    def : Pat<(op imm_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+                                                offset_ty:$imm)),
+                  (ppr_vt PPR3bAny:$pg), (zpr_vt ZPRAny:$zn)),
+              (inst imm_ty:$tile, $idx, $imm, $pg, $zn)>;
+  }
+}
+
+class sme_mova_insert_pseudo
+    : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
+                          i64imm:$imm, PPR3bAny:$pg, ZPRAny:$zn), []>,
+      Sched<[]> {
+  // Translated to the actual instructions in AArch64ISelLowering.cpp
+  let usesCustomInserter = 1;
+}
+
 multiclass sme_vector_v_to_tile<string mnemonic, bit is_col> {
   def _B : sme_vector_to_tile_inst<0b0, 0b00,
                                    !if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -661,6 +685,14 @@
     let Inst{3-0} = ZAd;
   }
 
+  // Pseudo instructions for lowering intrinsics, using immediates instead of
+  // tile registers.
+  def _PSEUDO_B : sme_mova_insert_pseudo;
+  def _PSEUDO_H : sme_mova_insert_pseudo;
+  def _PSEUDO_S : sme_mova_insert_pseudo;
+  def _PSEUDO_D : sme_mova_insert_pseudo;
+  def _PSEUDO_Q : sme_mova_insert_pseudo;
+
   defm : sme_vector_to_tile_aliases<!cast<Instruction>(NAME # _B),
                                     !if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -681,6 +713,62 @@
                                     !if(is_col, TileVectorOpV128, TileVectorOpH128),
                                     ZPR128, sme_elm_idx0_0>;
+
+  defvar op = !if(is_col, int_aarch64_sme_write_vert,
+                          int_aarch64_sme_write_horiz);
+
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
+                                     nxv16i8, nxv16i1, sme_elm_idx0_0, imm0_15,
+                                     op, tileslice8>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+                                     nxv8i16, nxv8i1, sme_elm_idx0_1, imm0_7,
+                                     op, tileslice16>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+                                     nxv8f16, nxv8i1, sme_elm_idx0_1, imm0_7,
+                                     op, tileslice16>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+                                     nxv8bf16, nxv8i1, sme_elm_idx0_1, imm0_7,
+                                     op, tileslice16>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+                                     nxv4i32, nxv4i1, sme_elm_idx0_3, imm0_3,
+                                     op, tileslice32>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+                                     nxv4f32, nxv4i1, sme_elm_idx0_3, imm0_3,
+                                     op, tileslice32>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+                                     nxv2i64, nxv2i1, sme_elm_idx0_7, imm0_1,
+                                     op, tileslice64>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+                                     nxv2f64, nxv2i1, sme_elm_idx0_7, imm0_1,
+                                     op, tileslice64>;
+
+  defvar opq = !if(is_col, int_aarch64_sme_writeq_vert,
+                           int_aarch64_sme_writeq_horiz);
+
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv16i8, nxv16i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv8i16, nxv8i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv8f16, nxv8i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv8bf16, nxv8i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv4i32, nxv4i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv4f32, nxv4i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv2i64, nxv2i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
+  defm : sme_vector_to_tile_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                     nxv2f64, nxv2i1, sme_elm_idx0_15,
+                                     sme_elm_idx0_0, opq, tileslice128>;
 }
 
 multiclass sme_vector_to_tile<string mnemonic> {
@@ -722,6 +810,23 @@
                   (inst zpr_ty:$Zd, PPR3bAny:$Pg, tile_ty:$ZAn,
                         MatrixIndexGPR32Op12_15:$Rv, imm_ty:$imm), 1>;
 }
 
+multiclass sme_tile_to_vector_patterns<Instruction inst, ValueType zpr_vt,
+                                       ValueType ppr_vt, Operand offset_ty,
+                                       ComplexPattern imm2tile,
+                                       ComplexPattern tileslice,
+                                       SDPatternOperator op> {
+  def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg),
+                        (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx)),
+            (inst $passthru, $pg, $tile, $idx, 0)>;
+  let AddedComplexity = 1 in {
+    def : Pat<(zpr_vt (op (zpr_vt ZPRAny:$passthru), (ppr_vt PPR3bAny:$pg),
+                          (imm2tile untyped:$tile),
+                          (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+                                          offset_ty:$imm)))),
+              (inst $passthru, $pg, $tile, $idx, $imm)>;
+  }
+}
+
 multiclass sme_tile_to_vector_v<string mnemonic, bit is_col> {
   def _B : sme_tile_to_vector_inst<0b0, 0b00, ZPR8,
                                    !if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -775,6 +880,62 @@
   defm : sme_tile_to_vector_aliases<!cast<Instruction>(NAME # _Q), ZPR128,
                                     !if(is_col, TileVectorOpV128, TileVectorOpH128), sme_elm_idx0_0>;
+
+  defvar op = !if(is_col, int_aarch64_sme_read_vert,
+
int_aarch64_sme_read_horiz); + + defm : sme_tile_to_vector_patterns(NAME # _B), + nxv16i8, nxv16i1, imm0_15, + imm_to_tile8, tileslice8, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8i16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8f16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _H), + nxv8bf16, nxv8i1, imm0_7, + imm_to_tile16, tileslice16, op>; + defm : sme_tile_to_vector_patterns(NAME # _S), + nxv4i32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns(NAME # _S), + nxv4f32, nxv4i1, imm0_3, + imm_to_tile32, tileslice32, op>; + defm : sme_tile_to_vector_patterns(NAME # _D), + nxv2i64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + defm : sme_tile_to_vector_patterns(NAME # _D), + nxv2f64, nxv2i1, imm0_1, + imm_to_tile64, tileslice64, op>; + + defvar opq = !if(is_col, int_aarch64_sme_readq_vert, + int_aarch64_sme_readq_horiz); + + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv16i8, nxv16i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8i16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8f16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv8bf16, nxv8i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv4i32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv4f32, nxv4i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv2i64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; + defm : sme_tile_to_vector_patterns(NAME # _Q), + nxv2f64, nxv2i1, sme_elm_idx0_0, + imm_to_tile128, tileslice128, opq>; } multiclass sme_tile_to_vector { diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-extract.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-extract.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-extract.ll @@ -0,0 +1,470 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +define @extract_row_b( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_row_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z1.b, p0/m, za0h.b[w12, 0] +; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 2] +; CHECK-NEXT: mov z3.b, p0/m, za0h.b[w12, 4] +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.b, p0/m, za0h.b[w12, 6] +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z2.b, p0/m, za0h.b[w12, 8] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z3.b, p0/m, za0h.b[w12, 10] +; CHECK-NEXT: mov z4.b, p0/m, za0h.b[w12, 12] +; CHECK-NEXT: mov z0.b, p0/m, za0h.b[w12, 14] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.2 = add i32 %tileslice, 2 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.2) + %tileslice.4 = add i32 %tileslice, 4 + %z2 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.4) + 
%tileslice.6 = add i32 %tileslice, 6 + %z3 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.6) + %tileslice.8 = add i32 %tileslice, 8 + %z4 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.8) + %tileslice.10 = add i32 %tileslice, 10 + %z5 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.10) + %tileslice.12 = add i32 %tileslice, 12 + %z6 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.12) + %tileslice.14 = add i32 %tileslice, 14 + %z7 = call @llvm.aarch64.sme.read.horiz.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.14) + ret %z0 +} + +define @extract_col_b( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_col_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z1.b, p0/m, za0v.b[w12, 1] +; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 3] +; CHECK-NEXT: mov z3.b, p0/m, za0v.b[w12, 5] +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.b, p0/m, za0v.b[w12, 7] +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z2.b, p0/m, za0v.b[w12, 9] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z3.b, p0/m, za0v.b[w12, 11] +; CHECK-NEXT: mov z4.b, p0/m, za0v.b[w12, 13] +; CHECK-NEXT: mov z0.b, p0/m, za0v.b[w12, 15] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tileslice.1 = add i32 %tileslice, 1 + %z0 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.1) + %tileslice.3 = add i32 %tileslice, 3 + %z1 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.3) + %tileslice.5 = add i32 %tileslice, 5 + %z2 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.5) + %tileslice.7 = add i32 %tileslice, 7 + %z3 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.7) + %tileslice.9 = add i32 %tileslice, 9 + %z4 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.9) + %tileslice.11 = add i32 %tileslice, 11 + %z5 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.11) + %tileslice.13 = add i32 %tileslice, 13 + %z6 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.13) + %tileslice.15 = add i32 %tileslice, 15 + %z7 = call @llvm.aarch64.sme.read.vert.nxv16i8( %zd, %pg, i64 0, i32 %tileslice.15) + ret %z0 +} + +define @extract_row_h( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_row_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0] +; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 2] +; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 4] +; CHECK-NEXT: mov z0.h, p0/m, za0h.h[w12, 6] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv8i16( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.2 = add i32 %tileslice, 2 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv8i16( %zd, %pg, i64 0, i32 %tileslice.2) + %tileslice.4 = add i32 %tileslice, 4 + %z2 = call @llvm.aarch64.sme.read.horiz.nxv8i16( %zd, %pg, i64 0, i32 %tileslice.4) + %tileslice.6 = add i32 %tileslice, 6 + %z3 = call @llvm.aarch64.sme.read.horiz.nxv8i16( %zd, %pg, i64 0, i32 %tileslice.6) + ret %z0 +} + +define @extract_col_h( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_col_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: 
mov z3.d, z0.d +; CHECK-NEXT: mov z1.h, p0/m, za1v.h[w12, 1] +; CHECK-NEXT: mov z2.h, p0/m, za1v.h[w12, 3] +; CHECK-NEXT: mov z3.h, p0/m, za1v.h[w12, 5] +; CHECK-NEXT: mov z0.h, p0/m, za1v.h[w12, 7] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tileslice.1 = add i32 %tileslice, 1 + %z0 = call @llvm.aarch64.sme.read.vert.nxv8i16( %zd, %pg, i64 1, i32 %tileslice.1) + %tileslice.3 = add i32 %tileslice, 3 + %z1 = call @llvm.aarch64.sme.read.vert.nxv8i16( %zd, %pg, i64 1, i32 %tileslice.3) + %tileslice.5 = add i32 %tileslice, 5 + %z2 = call @llvm.aarch64.sme.read.vert.nxv8i16( %zd, %pg, i64 1, i32 %tileslice.5) + %tileslice.7 = add i32 %tileslice, 7 + %z3 = call @llvm.aarch64.sme.read.vert.nxv8i16( %zd, %pg, i64 1, i32 %tileslice.7) + ret %z0 +} + +define @extract_f16( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0] +; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1] +; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 2] +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.h, p0/m, za0v.h[w12, 3] +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 5] +; CHECK-NEXT: mov z4.h, p0/m, za0v.h[w12, 6] +; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv8f16( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.1 = add i32 %tileslice, 1 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.1) + %tileslice.2 = add i32 %tileslice, 2 + %z2 = call @llvm.aarch64.sme.read.vert.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.2) + %tileslice.3 = add i32 %tileslice, 3 + %z3 = call @llvm.aarch64.sme.read.vert.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.3) + %tileslice.4 = add i32 %tileslice, 4 + %z4 = call @llvm.aarch64.sme.read.horiz.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.4) + %tileslice.5 = add i32 %tileslice, 5 + %z5 = call @llvm.aarch64.sme.read.horiz.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.5) + %tileslice.6 = add i32 %tileslice, 6 + %z6 = call @llvm.aarch64.sme.read.vert.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.6) + %tileslice.7 = add i32 %tileslice, 7 + %z7 = call @llvm.aarch64.sme.read.vert.nxv8f16( %zd, %pg, i64 0, i32 %tileslice.7) + ret %z0 +} + +define @extract_bf16( %zd, %pg, i32 %tileslice, *%ptr) { +; CHECK-LABEL: extract_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z1.h, p0/m, za0h.h[w12, 0] +; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 1] +; CHECK-NEXT: mov z3.h, p0/m, za0v.h[w12, 2] +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z4.h, p0/m, za0v.h[w12, 3] +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z2.h, p0/m, za0h.h[w12, 4] +; CHECK-NEXT: mov z4.d, z0.d +; CHECK-NEXT: mov z3.h, p0/m, za0h.h[w12, 5] +; CHECK-NEXT: mov z4.h, p0/m, za0v.h[w12, 6] +; CHECK-NEXT: mov z0.h, p0/m, za0v.h[w12, 7] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.1 = add i32 %tileslice, 1 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.1) + %tileslice.2 = add i32 %tileslice, 2 + %z2 = call @llvm.aarch64.sme.read.vert.nxv8bf16( %zd, %pg, 
i64 0, i32 %tileslice.2) + %tileslice.3 = add i32 %tileslice, 3 + %z3 = call @llvm.aarch64.sme.read.vert.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.3) + %tileslice.4 = add i32 %tileslice, 4 + %z4 = call @llvm.aarch64.sme.read.horiz.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.4) + %tileslice.5 = add i32 %tileslice, 5 + %z5 = call @llvm.aarch64.sme.read.horiz.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.5) + %tileslice.6 = add i32 %tileslice, 6 + %z6 = call @llvm.aarch64.sme.read.vert.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.6) + %tileslice.7 = add i32 %tileslice, 7 + %z7 = call @llvm.aarch64.sme.read.vert.nxv8bf16( %zd, %pg, i64 0, i32 %tileslice.7) + ret %z0 +} + +define @extract_row_s( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_row_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0] +; CHECK-NEXT: mov z0.s, p0/m, za0h.s[w12, 2] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv4i32( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.2 = add i32 %tileslice, 2 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv4i32( %zd, %pg, i64 0, i32 %tileslice.2) + ret %z0 +} + +define @extract_col_s( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_col_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z1.s, p0/m, za3v.s[w12, 1] +; CHECK-NEXT: mov z0.s, p0/m, za3v.s[w12, 3] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %tileslice.1 = add i32 %tileslice, 1 + %z0 = call @llvm.aarch64.sme.read.vert.nxv4i32( %zd, %pg, i64 3, i32 %tileslice.1) + %tileslice.3 = add i32 %tileslice, 3 + %z1 = call @llvm.aarch64.sme.read.vert.nxv4i32( %zd, %pg, i64 3, i32 %tileslice.3) + ret %z0 +} + +define @extract_f32( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z2.d, z0.d +; CHECK-NEXT: mov z3.d, z0.d +; CHECK-NEXT: mov z1.s, p0/m, za0h.s[w12, 0] +; CHECK-NEXT: mov z2.s, p0/m, za0h.s[w12, 1] +; CHECK-NEXT: mov z3.s, p0/m, za0v.s[w12, 2] +; CHECK-NEXT: mov z0.s, p0/m, za0v.s[w12, 3] +; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv4f32( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.1 = add i32 %tileslice, 1 + %z1 = call @llvm.aarch64.sme.read.horiz.nxv4f32( %zd, %pg, i64 0, i32 %tileslice.1) + %tileslice.2 = add i32 %tileslice, 2 + %z2 = call @llvm.aarch64.sme.read.vert.nxv4f32( %zd, %pg, i64 0, i32 %tileslice.2) + %tileslice.3 = add i32 %tileslice, 3 + %z3 = call @llvm.aarch64.sme.read.vert.nxv4f32( %zd, %pg, i64 0, i32 %tileslice.3) + ret %z0 +} + +define @extract_row_d( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_row_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z0.d, p0/m, za0h.d[w12, 0] +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv2i64( %zd, %pg, i64 0, i32 %tileslice) + ret %z0 +} + +define @extract_col_d( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_col_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z0.d, p0/m, za1v.d[w12, 1] +; CHECK-NEXT: ret + %tileslice.1 = add i32 %tileslice, 1 + %z0 = call @llvm.aarch64.sme.read.vert.nxv2i64( %zd, %pg, i64 1, i32 %tileslice.1) + ret %z0 +} + +define @extract_f64( %zd, %pg, i32 %tileslice) { +; CHECK-LABEL: extract_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov z1.d, z0.d +; CHECK-NEXT: mov z1.d, p0/m, za0h.d[w12, 0] +; CHECK-NEXT: mov z0.d, p0/m, za0v.d[w12, 1] 
+; CHECK-NEXT: mov z0.d, z1.d +; CHECK-NEXT: ret + %z0 = call @llvm.aarch64.sme.read.horiz.nxv2f64( %zd, %pg, i64 0, i32 %tileslice) + %tileslice.1 = add i32 %tileslice, 1 + %z1 = call @llvm.aarch64.sme.read.vert.nxv2f64( %zd, %pg, i64 0, i32 %tileslice.1) + ret %z0 +} + +define @extract_row_q_v16i18( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v16i18: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv16i8( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v8i16( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv8i16( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v8f16( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv8f16( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v4i32( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv4i32( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v4f32( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv4f32( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v2i64( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv2i64( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_row_q_v2f64( %zd, %pg) { +; CHECK-LABEL: extract_row_q_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za0h.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.horiz.nxv2f64( %zd, %pg, i64 0, i32 0) + ret %res +} + +define @extract_col_q_v16i8( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv16i8( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v8i16( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv8i16( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v8f16( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv8f16( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v4i32( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv4i32( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v4f32( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: 
mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv4f32( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v2i64( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv2i64( %zd, %pg, i64 15, i32 0) + ret %res +} + +define @extract_col_q_v2f64( %zd, %pg) { +; CHECK-LABEL: extract_col_q_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov z0.q, p0/m, za15v.q[w12, 0] +; CHECK-NEXT: ret + %res = call @llvm.aarch64.sme.readq.vert.nxv2f64( %zd, %pg, i64 15, i32 0) + ret %res +} + +declare @llvm.aarch64.sme.read.horiz.nxv16i8(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv8i16(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv8f16(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv8bf16(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv4i32(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv4f32(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv2i64(, , i64, i32) +declare @llvm.aarch64.sme.read.horiz.nxv2f64(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv16i8(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv8i16(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv8f16(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv8bf16(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv4i32(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv4f32(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv2i64(, , i64, i32) +declare @llvm.aarch64.sme.read.vert.nxv2f64(, , i64, i32) + +declare @llvm.aarch64.sme.readq.horiz.nxv16i8(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv8i16(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv8f16(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv8bf16(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv4i32(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv4f32(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv2i64(, , i64, i32) +declare @llvm.aarch64.sme.readq.horiz.nxv2f64(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv16i8(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv8i16(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv8f16(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv8bf16(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv4i32(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv4f32(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv2i64(, , i64, i32) +declare @llvm.aarch64.sme.readq.vert.nxv2f64(, , i64, i32) diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-insert.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-insert.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-mova-insert.ll @@ -0,0 +1,474 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s + +define void @insert_row_b(i32 %tileslice, %pg, +; CHECK-LABEL: insert_row_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.b[w12, 0], p0/m, z0.b +; CHECK-NEXT: mov za0h.b[w12, 2], p0/m, z1.b +; CHECK-NEXT: mov za0h.b[w12, 4], p0/m, z2.b +; CHECK-NEXT: mov za0h.b[w12, 6], p0/m, z3.b +; CHECK-NEXT: mov za0h.b[w12, 8], p0/m, z4.b +; CHECK-NEXT: mov za0h.b[w12, 10], p0/m, z5.b +; CHECK-NEXT: mov za0h.b[w12, 12], p0/m, z6.b +; CHECK-NEXT: mov za0h.b[w12, 14], 
p0/m, z7.b +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.2, %pg, %z1) + %tileslice.4 = add i32 %tileslice, 4 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.4, %pg, %z2) + %tileslice.6 = add i32 %tileslice, 6 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.6, %pg, %z3) + %tileslice.8 = add i32 %tileslice, 8 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.8, %pg, %z4) + %tileslice.10 = add i32 %tileslice, 10 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.10, %pg, %z5) + %tileslice.12 = add i32 %tileslice, 12 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.12, %pg, %z6) + %tileslice.14 = add i32 %tileslice, 14 + call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %tileslice.14, %pg, %z7) + ret void +} + +define void @insert_col_b(i32 %tileslice, %pg, +; CHECK-LABEL: insert_col_b: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0v.b[w12, 1], p0/m, z0.b +; CHECK-NEXT: mov za0v.b[w12, 3], p0/m, z1.b +; CHECK-NEXT: mov za0v.b[w12, 5], p0/m, z2.b +; CHECK-NEXT: mov za0v.b[w12, 7], p0/m, z3.b +; CHECK-NEXT: mov za0v.b[w12, 9], p0/m, z4.b +; CHECK-NEXT: mov za0v.b[w12, 11], p0/m, z5.b +; CHECK-NEXT: mov za0v.b[w12, 13], p0/m, z6.b +; CHECK-NEXT: mov za0v.b[w12, 15], p0/m, z7.b +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.1, %pg, %z0) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.3, %pg, %z1) + %tileslice.5 = add i32 %tileslice, 5 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.5, %pg, %z2) + %tileslice.7 = add i32 %tileslice, 7 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.7, %pg, %z3) + %tileslice.9 = add i32 %tileslice, 9 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.9, %pg, %z4) + %tileslice.11 = add i32 %tileslice, 11 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.11, %pg, %z5) + %tileslice.13 = add i32 %tileslice, 13 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.13, %pg, %z6) + %tileslice.15 = add i32 %tileslice, 15 + call void @llvm.aarch64.sme.write.vert.nxv16i8(i64 0, i32 %tileslice.15, %pg, %z7) + ret void +} + +define void @insert_row_h(i32 %tileslice, %pg, +; CHECK-LABEL: insert_row_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.h[w12, 0], p0/m, z0.h +; CHECK-NEXT: mov za0h.h[w12, 2], p0/m, z2.h +; CHECK-NEXT: mov za0h.h[w12, 4], p0/m, z4.h +; CHECK-NEXT: mov za0h.h[w12, 6], p0/m, z6.h +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + call void @llvm.aarch64.sme.write.horiz.nxv8i16(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.horiz.nxv8i16(i64 0, i32 %tileslice.2, %pg, %z2) + %tileslice.4 = add i32 %tileslice, 4 + call void @llvm.aarch64.sme.write.horiz.nxv8i16(i64 0, i32 %tileslice.4, %pg, %z4) + %tileslice.6 = add i32 %tileslice, 6 + call void @llvm.aarch64.sme.write.horiz.nxv8i16(i64 0, i32 %tileslice.6, %pg, %z6) + ret void +} + +define void @insert_col_h(i32 %tileslice, %pg, +; CHECK-LABEL: 
insert_col_h: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za1v.h[w12, 1], p0/m, z1.h +; CHECK-NEXT: mov za1v.h[w12, 3], p0/m, z3.h +; CHECK-NEXT: mov za1v.h[w12, 5], p0/m, z5.h +; CHECK-NEXT: mov za1v.h[w12, 7], p0/m, z7.h +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.vert.nxv8i16(i64 1, i32 %tileslice.1, %pg, %z1) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv8i16(i64 1, i32 %tileslice.3, %pg, %z3) + %tileslice.5 = add i32 %tileslice, 5 + call void @llvm.aarch64.sme.write.vert.nxv8i16(i64 1, i32 %tileslice.5, %pg, %z5) + %tileslice.7 = add i32 %tileslice, 7 + call void @llvm.aarch64.sme.write.vert.nxv8i16(i64 1, i32 %tileslice.7, %pg, %z7) + ret void +} + +define void @insert_f16(i32 %tileslice, %pg, +; CHECK-LABEL: insert_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.h[w12, 0], p0/m, z0.h +; CHECK-NEXT: mov za0h.h[w12, 1], p0/m, z1.h +; CHECK-NEXT: mov za0v.h[w12, 2], p0/m, z2.h +; CHECK-NEXT: mov za0v.h[w12, 3], p0/m, z3.h +; CHECK-NEXT: mov za0h.h[w12, 4], p0/m, z4.h +; CHECK-NEXT: mov za0h.h[w12, 5], p0/m, z5.h +; CHECK-NEXT: mov za0v.h[w12, 6], p0/m, z6.h +; CHECK-NEXT: mov za0v.h[w12, 7], p0/m, z7.h +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + call void @llvm.aarch64.sme.write.horiz.nxv8f16(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.horiz.nxv8f16(i64 0, i32 %tileslice.1, %pg, %z1) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.vert.nxv8f16(i64 0, i32 %tileslice.2, %pg, %z2) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv8f16(i64 0, i32 %tileslice.3, %pg, %z3) + %tileslice.4 = add i32 %tileslice, 4 + call void @llvm.aarch64.sme.write.horiz.nxv8f16(i64 0, i32 %tileslice.4, %pg, %z4) + %tileslice.5 = add i32 %tileslice, 5 + call void @llvm.aarch64.sme.write.horiz.nxv8f16(i64 0, i32 %tileslice.5, %pg, %z5) + %tileslice.6 = add i32 %tileslice, 6 + call void @llvm.aarch64.sme.write.vert.nxv8f16(i64 0, i32 %tileslice.6, %pg, %z6) + %tileslice.7 = add i32 %tileslice, 7 + call void @llvm.aarch64.sme.write.vert.nxv8f16(i64 0, i32 %tileslice.7, %pg, %z7) + ret void +} + +define void @insert_bf16(i32 %tileslice, %pg, +; CHECK-LABEL: insert_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.h[w12, 0], p0/m, z0.h +; CHECK-NEXT: mov za0h.h[w12, 1], p0/m, z1.h +; CHECK-NEXT: mov za0v.h[w12, 2], p0/m, z2.h +; CHECK-NEXT: mov za0v.h[w12, 3], p0/m, z3.h +; CHECK-NEXT: mov za0h.h[w12, 4], p0/m, z4.h +; CHECK-NEXT: mov za0h.h[w12, 5], p0/m, z5.h +; CHECK-NEXT: mov za0v.h[w12, 6], p0/m, z6.h +; CHECK-NEXT: mov za0v.h[w12, 7], p0/m, z7.h +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3, + %z4, %z5, + %z6, %z7) { + call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i64 0, i32 %tileslice.1, %pg, %z1) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.vert.nxv8bf16(i64 0, i32 %tileslice.2, %pg, %z2) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv8bf16(i64 0, i32 %tileslice.3, %pg, %z3) + %tileslice.4 = add i32 %tileslice, 4 + call void @llvm.aarch64.sme.write.horiz.nxv8bf16(i64 0, i32 %tileslice.4, %pg, %z4) + %tileslice.5 = add i32 %tileslice, 5 + call void 
@llvm.aarch64.sme.write.horiz.nxv8bf16(i64 0, i32 %tileslice.5, %pg, %z5) + %tileslice.6 = add i32 %tileslice, 6 + call void @llvm.aarch64.sme.write.vert.nxv8bf16(i64 0, i32 %tileslice.6, %pg, %z6) + %tileslice.7 = add i32 %tileslice, 7 + call void @llvm.aarch64.sme.write.vert.nxv8bf16(i64 0, i32 %tileslice.7, %pg, %z7) + ret void +} + +define void @insert_row_s(i32 %tileslice, %pg, +; CHECK-LABEL: insert_row_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.s[w12, 0], p0/m, z0.s +; CHECK-NEXT: mov za0h.s[w12, 2], p0/m, z2.s +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3) { + call void @llvm.aarch64.sme.write.horiz.nxv4i32(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.horiz.nxv4i32(i64 0, i32 %tileslice.2, %pg, %z2) + ret void +} + +define void @insert_col_s(i32 %tileslice, %pg, +; CHECK-LABEL: insert_col_s: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za3v.s[w12, 1], p0/m, z1.s +; CHECK-NEXT: mov za3v.s[w12, 3], p0/m, z3.s +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3) { + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.vert.nxv4i32(i64 3, i32 %tileslice.1, %pg, %z1) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv4i32(i64 3, i32 %tileslice.3, %pg, %z3) + ret void +} + +define void @insert_f32(i32 %tileslice, %pg, +; CHECK-LABEL: insert_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.s[w12, 0], p0/m, z0.s +; CHECK-NEXT: mov za0h.s[w12, 1], p0/m, z1.s +; CHECK-NEXT: mov za0v.s[w12, 2], p0/m, z2.s +; CHECK-NEXT: mov za0v.s[w12, 3], p0/m, z3.s +; CHECK-NEXT: ret + %z0, %z1, + %z2, %z3) { + call void @llvm.aarch64.sme.write.horiz.nxv4f32(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.horiz.nxv4f32(i64 0, i32 %tileslice.1, %pg, %z1) + %tileslice.2 = add i32 %tileslice, 2 + call void @llvm.aarch64.sme.write.vert.nxv4f32(i64 0, i32 %tileslice.2, %pg, %z2) + %tileslice.3 = add i32 %tileslice, 3 + call void @llvm.aarch64.sme.write.vert.nxv4f32(i64 0, i32 %tileslice.3, %pg, %z3) + ret void +} + +define void @insert_row_d(i32 %tileslice, %pg, +; CHECK-LABEL: insert_row_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.d[w12, 0], p0/m, z0.d +; CHECK-NEXT: ret + %z0, %z1) { + call void @llvm.aarch64.sme.write.horiz.nxv2i64(i64 0, i32 %tileslice, %pg, %z0) + ret void +} + +define void @insert_col_d(i32 %tileslice, %pg, +; CHECK-LABEL: insert_col_d: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za7v.d[w12, 1], p0/m, z1.d +; CHECK-NEXT: ret + %z0, %z1) { + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.vert.nxv2i64(i64 7, i32 %tileslice.1, %pg, %z1) + ret void +} + +define void @insert_f64(i32 %tileslice, %pg, +; CHECK-LABEL: insert_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, w0 +; CHECK-NEXT: mov za0h.d[w12, 0], p0/m, z0.d +; CHECK-NEXT: mov za0v.d[w12, 1], p0/m, z1.d +; CHECK-NEXT: ret + %z0, %z1) { + call void @llvm.aarch64.sme.write.horiz.nxv2f64(i64 0, i32 %tileslice, %pg, %z0) + %tileslice.1 = add i32 %tileslice, 1 + call void @llvm.aarch64.sme.write.vert.nxv2f64(i64 0, i32 %tileslice.1, %pg, %z1) + ret void +} + +define void @insert_row_q_v16i8( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i64 0, i32 0, %pg, %zn) + 
ret void +} + +define void @insert_row_q_v8i16( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v8f16( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v8bf16( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v4i32( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v4f32( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v2i64( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_row_q_v2f64( %pg, %zn) { +; CHECK-LABEL: insert_row_q_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za0h.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i64 0, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v16i8( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv16i8(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v8i16( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv8i16(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v8f16( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv8f16(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v8bf16( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v4i32( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv4i32(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v4f32( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v4f32: +; CHECK: // 
%bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv4f32(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v2i64( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv2i64(i64 15, i32 0, %pg, %zn) + ret void +} + +define void @insert_col_q_v2f64( %pg, %zn) { +; CHECK-LABEL: insert_col_q_v2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w12, wzr +; CHECK-NEXT: mov za15v.q[w12, 0], p0/m, z0.q +; CHECK-NEXT: ret + call void @llvm.aarch64.sme.writeq.vert.nxv2f64(i64 15, i32 0, %pg, %zn) + ret void +} + + +declare void @llvm.aarch64.sme.write.horiz.nxv16i8(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv8i16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv8f16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv8bf16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv4i32(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv4f32(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv2i64(i64, i32, , ) +declare void @llvm.aarch64.sme.write.horiz.nxv2f64(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv16i8(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv8i16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv8f16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv8bf16(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv4i32(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv4f32(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv2i64(i64, i32, , ) +declare void @llvm.aarch64.sme.write.vert.nxv2f64(i64, i32, , ) + +declare void @llvm.aarch64.sme.writeq.horiz.nxv16i8(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv8i16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv8f16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv8bf16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv4i32(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv4f32(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv2i64(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.horiz.nxv2f64(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv16i8(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv8i16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv8f16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv8bf16(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv4i32(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv4f32(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv2i64(i64, i32, , ) +declare void @llvm.aarch64.sme.writeq.vert.nxv2f64(i64, i32, , )
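
Usage note, not part of the patch itself: a minimal sketch of the slice-offset folding that the tileslice* patterns above are meant to provide. The function name below is made up for illustration; the intrinsic signature matches the declarations in the new tests, and the expected output follows the insert_row_b checks (run through llc -mtriple=aarch64-linux-gnu -mattr=+sme as in the tests).

; The constant offset feeding the slice operand should be folded into the
; immediate field of the MOVA, i.e. this is expected to produce
; "mov w12, w0" followed by "mov za0h.b[w12, 3], p0/m, z0.b".
define void @write_slice_plus_3(i32 %slice, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn) {
  %slice.3 = add i32 %slice, 3
  call void @llvm.aarch64.sme.write.horiz.nxv16i8(i64 0, i32 %slice.3, <vscale x 16 x i1> %pg, <vscale x 16 x i8> %zn)
  ret void
}

declare void @llvm.aarch64.sme.write.horiz.nxv16i8(i64, i32, <vscale x 16 x i1>, <vscale x 16 x i8>)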