Index: llvm/include/llvm/IR/IntrinsicsAArch64.td
===================================================================
--- llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2625,4 +2625,10 @@
   def int_aarch64_sme_st1w_vert : SME_Load_Store_S_Intrinsic;
   def int_aarch64_sme_st1d_vert : SME_Load_Store_D_Intrinsic;
   def int_aarch64_sme_st1q_vert : SME_Load_Store_Q_Intrinsic;
+
+  // Spill + fill
+  def int_aarch64_sme_ldr : DefaultAttrsIntrinsic<
+      [], [llvm_i32_ty, llvm_ptr_ty]>;
+  def int_aarch64_sme_str : DefaultAttrsIntrinsic<
+      [], [llvm_i32_ty, llvm_ptr_ty]>;
 }
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -576,6 +576,7 @@
   MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
                                   MachineInstr &MI,
                                   MachineBasicBlock *BB) const;
+  MachineBasicBlock *EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const;
 
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2338,6 +2338,22 @@
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitFill(MachineInstr &MI, MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB =
+      BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(AArch64::LDR_ZA));
+
+  MIB.addReg(AArch64::ZA, RegState::Define);
+  MIB.add(MI.getOperand(0)); // vector select register
+  MIB.add(MI.getOperand(1)); // vector select offset
+  MIB.add(MI.getOperand(2)); // base
+  MIB.add(MI.getOperand(1)); // offset, same as vector select offset
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
@@ -2388,6 +2404,8 @@
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
   case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
     return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
+  case AArch64::LDR_ZA_PSEUDO:
+    return EmitFill(MI, BB);
   }
 }
 
Index: llvm/lib/Target/AArch64/AArch64InstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -3206,6 +3206,10 @@
 def am_indexed64 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed64", []>;
 def am_indexed128 : ComplexPattern<iPTR, 2, "SelectAddrModeIndexed128", []>;
 
+// (unsigned immediate)
+// Indexed for 8-bit registers. offset is in range [0,15].
+def am_indexed8_4b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<1,15>", []>;
+
 // (unsigned immediate)
 // Indexed for 8-bit registers. offset is in range [0,63].
 def am_indexed8_6b : ComplexPattern<iPTR, 2, "SelectAddrModeIndexedUImm<1,63>", []>;
Index: llvm/lib/Target/AArch64/SMEInstrFormats.td
===================================================================
--- llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -509,7 +509,7 @@
 // SME Save and Restore Array
 //===----------------------------------------------------------------------===//
 
-class sme_spill_fill_inst<bit isStore, dag outs, dag ins, string opcodestr>
+class sme_spill_fill_base<bit isStore, dag outs, dag ins, string opcodestr>
     : I<outs, ins, opcodestr, "\t$ZAt[$Rv, $imm4], [$Rn, $offset, mul vl]", "",
         []>,
       Sched<[]> {
@@ -524,33 +524,63 @@
   let Inst{9-5}   = Rn;
   let Inst{4}     = 0b0;
   let Inst{3-0}   = imm4;
-
-  let mayLoad = !not(isStore);
-  let mayStore = isStore;
 }
 
-multiclass sme_spill_fill<bit isStore, dag outs, dag ins, string opcodestr> {
-  def NAME : sme_spill_fill_inst<isStore, outs, ins, opcodestr>;
-
+let mayStore = 1 in
+class sme_spill_inst<string opcodestr>
+    : sme_spill_fill_base<0b1, (outs),
+                          (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
+                               sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
+                               imm0_15:$offset),
+                          opcodestr>;
+
+let mayLoad = 1 in
+class sme_fill_inst<string opcodestr>
+    : sme_spill_fill_base<0b0, (outs MatrixOp:$ZAt),
+                          (ins MatrixIndexGPR32Op12_15:$Rv,
+                               sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
+                               imm0_15:$offset),
+                          opcodestr>;
+
+multiclass sme_spill<string opcodestr> {
+  def NAME : sme_spill_inst<opcodestr>;
   def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
                   (!cast<Instruction>(NAME) MatrixOp:$ZAt,
                    MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
-}
-
-multiclass sme_spill<string opcodestr> {
-  defm NAME : sme_spill_fill<0b1, (outs),
-                             (ins MatrixOp:$ZAt, MatrixIndexGPR32Op12_15:$Rv,
-                                  sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
-                                  imm0_15:$offset),
-                             opcodestr>;
+  // base
+  def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
+            (!cast<Instruction>(NAME) ZA, $idx, 0, $base, 0)>;
+  // scalar + immediate (mul vl)
+  let AddedComplexity = 2 in {
+    def : Pat<(int_aarch64_sme_str MatrixIndexGPR32Op12_15:$idx,
                                    (am_indexed8_4b GPR64sp:$base,
                                                    imm0_15:$offset)),
+              (!cast<Instruction>(NAME) ZA, $idx, 0, $base, $offset)>;
+  }
 }
 
 multiclass sme_fill<string opcodestr> {
-  defm NAME : sme_spill_fill<0b0, (outs MatrixOp:$ZAt),
-                             (ins MatrixIndexGPR32Op12_15:$Rv,
-                                  sme_elm_idx0_15:$imm4, GPR64sp:$Rn,
-                                  imm0_15:$offset),
-                             opcodestr>;
+  def NAME : sme_fill_inst<opcodestr>;
+  def : InstAlias<opcodestr # "\t$ZAt[$Rv, $imm4], [$Rn]",
+                  (!cast<Instruction>(NAME) MatrixOp:$ZAt,
+                   MatrixIndexGPR32Op12_15:$Rv, sme_elm_idx0_15:$imm4, GPR64sp:$Rn, 0), 1>;
+  def NAME # _PSEUDO
+      : Pseudo<(outs),
+               (ins MatrixIndexGPR32Op12_15:$idx, imm0_15:$imm4,
+                    GPR64sp:$base), []>,
+        Sched<[]> {
+    // Translated to actual instruction in AArch64ISelLowering.cpp
+    let usesCustomInserter = 1;
+    let mayLoad = 1;
+  }
+  // base
+  def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx, GPR64sp:$base),
+            (!cast<Instruction>(NAME # _PSEUDO) $idx, 0, $base)>;
+  // scalar + immediate (mul vl)
+  let AddedComplexity = 2 in {
+    def : Pat<(int_aarch64_sme_ldr MatrixIndexGPR32Op12_15:$idx,
+                                   (am_indexed8_4b GPR64sp:$base,
+                                                   imm0_15:$imm4)),
+              (!cast<Instruction>(NAME # _PSEUDO) $idx, $imm4, $base)>;
+  }
 }
 
 //===----------------------------------------------------------------------===//
Index: llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
===================================================================
--- llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
+++ llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
@@ -246,6 +246,27 @@
   ret void;
 }
 
+define void @ldr(i8* %ptr) {
+; CHECK-LABEL: ldr:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: ldr za[w12, 0], [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.ldr(i32 0, i8* %ptr)
+  ret void;
+}
+
+define void @ldr_with_imm_offset(i8* %ptr) {
+; CHECK-LABEL: ldr_with_imm_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: ldr za[w12, 15], [x0, #15, mul vl]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %ptr, i64 15
+  call void @llvm.aarch64.sme.ldr(i32 0, i8* %base)
+  ret void;
+}
+
 declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
 declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 8 x i1>, i16*, i64, i32)
 declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 4 x i1>, i32*, i64, i32)
@@ -256,3 +277,5 @@
 declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 4 x i1>, i32*, i64, i32)
 declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 2 x i1>, i64*, i64, i32)
 declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 1 x i1>, i128*, i64, i32)
+
+declare void @llvm.aarch64.sme.ldr(i32, i8*)
Index: llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
===================================================================
--- llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
+++ llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
@@ -246,6 +246,27 @@
   ret void;
 }
 
+define void @str(i8* %ptr) {
+; CHECK-LABEL: str:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: str za[w12, 0], [x0]
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sme.str(i32 0, i8* %ptr)
+  ret void;
+}
+
+define void @str_with_imm_offset(i8* %ptr) {
+; CHECK-LABEL: str_with_imm_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: mov w12, wzr
+; CHECK-NEXT: str za[w12, 0], [x0, #15, mul vl]
+; CHECK-NEXT: ret
+  %base = getelementptr i8, i8* %ptr, i64 15
+  call void @llvm.aarch64.sme.str(i32 0, i8* %base)
+  ret void;
+}
+
 declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
 declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 8 x i1>, i16*, i64, i32)
 declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 4 x i1>, i32*, i64, i32)
@@ -256,3 +277,5 @@
 declare void @llvm.aarch64.sme.st1w.vert(<vscale x 4 x i1>, i32*, i64, i32)
 declare void @llvm.aarch64.sme.st1d.vert(<vscale x 2 x i1>, i64*, i64, i32)
 declare void @llvm.aarch64.sme.st1q.vert(<vscale x 1 x i1>, i128*, i64, i32)
+
+declare void @llvm.aarch64.sme.str(i32, i8*)