diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -245,6 +245,7 @@
 def llvm_i16_ty : LLVMType<i16>;
 def llvm_i32_ty : LLVMType<i32>;
 def llvm_i64_ty : LLVMType<i64>;
+def llvm_i128_ty : LLVMType<i128>;
 def llvm_half_ty : LLVMType<f16>;
 def llvm_bfloat_ty : LLVMType<bf16>;
 def llvm_float_ty : LLVMType<f32>;
diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2583,3 +2583,46 @@ def int_aarch64_sve_whilewr_h : SVE2_CONFLICT_DETECT_Intrinsic;
 def int_aarch64_sve_whilewr_s : SVE2_CONFLICT_DETECT_Intrinsic;
 def int_aarch64_sve_whilewr_d : SVE2_CONFLICT_DETECT_Intrinsic;
+
+// Scalable Matrix Extension (SME) Intrinsics
+let TargetPrefix = "aarch64" in {
+  class SME_Load_Store_B_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+        [llvm_nxv16i1_ty, llvm_ptr_ty, llvm_i64_ty, llvm_i32_ty], []>;
+  class SME_Load_Store_H_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i16_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+  class SME_Load_Store_S_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i32_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+  class SME_Load_Store_D_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i64_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+  class SME_Load_Store_Q_Intrinsic
+    : DefaultAttrsIntrinsic<[],
+        [llvm_nxv16i1_ty, LLVMPointerType<llvm_i128_ty>, llvm_i64_ty, llvm_i32_ty], []>;
+
+  // Loads
+  def int_aarch64_sme_ld1b_horiz : SME_Load_Store_B_Intrinsic;
+  def int_aarch64_sme_ld1h_horiz : SME_Load_Store_H_Intrinsic;
+  def int_aarch64_sme_ld1w_horiz : SME_Load_Store_S_Intrinsic;
+  def int_aarch64_sme_ld1d_horiz : SME_Load_Store_D_Intrinsic;
+  def int_aarch64_sme_ld1q_horiz : SME_Load_Store_Q_Intrinsic;
+  def int_aarch64_sme_ld1b_vert : SME_Load_Store_B_Intrinsic;
+  def int_aarch64_sme_ld1h_vert : SME_Load_Store_H_Intrinsic;
+  def int_aarch64_sme_ld1w_vert : SME_Load_Store_S_Intrinsic;
+  def int_aarch64_sme_ld1d_vert : SME_Load_Store_D_Intrinsic;
+  def int_aarch64_sme_ld1q_vert : SME_Load_Store_Q_Intrinsic;
+
+  // Stores
+  def int_aarch64_sme_st1b_horiz : SME_Load_Store_B_Intrinsic;
+  def int_aarch64_sme_st1h_horiz : SME_Load_Store_H_Intrinsic;
+  def int_aarch64_sme_st1w_horiz : SME_Load_Store_S_Intrinsic;
+  def int_aarch64_sme_st1d_horiz : SME_Load_Store_D_Intrinsic;
+  def int_aarch64_sme_st1q_horiz : SME_Load_Store_Q_Intrinsic;
+  def int_aarch64_sme_st1b_vert : SME_Load_Store_B_Intrinsic;
+  def int_aarch64_sme_st1h_vert : SME_Load_Store_H_Intrinsic;
+  def int_aarch64_sme_st1w_vert : SME_Load_Store_S_Intrinsic;
+  def int_aarch64_sme_st1d_vert : SME_Load_Store_D_Intrinsic;
+  def int_aarch64_sme_st1q_vert : SME_Load_Store_Q_Intrinsic;
+}
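Each of these intrinsics takes a governing predicate, a base pointer, an immediate tile index, and a 32-bit slice index. A minimal IR usage sketch (the function name @load_za0_row is illustrative only; the intrinsic signature matches the declarations in the tests added below):

    declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)

    ; Load one horizontal slice of the 32-bit tile ZA0 from %ptr.
    define void @load_za0_row(<vscale x 16 x i1> %pg, i32* %ptr, i32 %slice) {
      call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 %slice)
      ret void
    }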
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAG
ToDAG.cpp
@@ -278,6 +278,15 @@
     return false;
   }
 
+  template <unsigned BaseReg> bool ImmToTile(SDValue N, SDValue &Imm) {
+    if (auto *CI = dyn_cast<ConstantSDNode>(N)) {
+      uint64_t C = CI->getZExtValue();
+      Imm = CurDAG->getRegister(BaseReg + C, MVT::Other);
+      return true;
+    }
+    return false;
+  }
+
   /// Form sequences of consecutive 64/128-bit registers for use in NEON
   /// instructions making use of a vector-list (e.g. ldN, tbl). Vecs must have
   /// between 1 and 4 elements. If it contains a single element that is returned
@@ -321,6 +330,11 @@
     return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
   }
 
+  template <unsigned Scale>
+  bool SelectSMETileSlice(SDValue N, SDValue &Vector, SDValue &Offset) {
+    return SelectSMETileSlice(N, Scale, Vector, Offset);
+  }
+
   void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
   void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
@@ -389,6 +403,8 @@
   bool SelectSVEArithImm(SDValue N, MVT VT, SDValue &Imm);
   bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
                                SDValue &Offset);
+  bool SelectSMETileSlice(SDValue N, unsigned Scale, SDValue &Vector,
+                          SDValue &Offset);
 
   bool SelectAllActivePredicate(SDValue N);
 };
@@ -5224,3 +5240,27 @@
   return TLI->isAllActivePredicate(*CurDAG, N);
 }
+
+bool AArch64DAGToDAGISel::SelectSMETileSlice(SDValue N, unsigned Scale,
+                                             SDValue &Vector, SDValue &Offset) {
+  if (N.getOpcode() != ISD::ADD)
+    return false;
+
+  // Process an ADD node.
+  const SDValue LHS = N.getOperand(0);
+  const SDValue RHS = N.getOperand(1);
+
+  if (auto C = dyn_cast<ConstantSDNode>(RHS)) {
+    int64_t ImmOff = C->getSExtValue();
+    unsigned MaxSize = (1 << Scale) - 1;
+
+    if (ImmOff < 0 || ImmOff > MaxSize)
+      return false;
+
+    Vector = LHS;
+    Offset = CurDAG->getTargetConstant(ImmOff, SDLoc(N), MVT::i64);
+    return true;
+  }
+
+  return false;
+}
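SelectSMETileSlice<Scale> matches an ADD of the slice index and a constant and moves the constant into the instruction's immediate slice offset; per the code above, the constant must lie in [0, (1 << Scale) - 1], i.e. 0-15 for byte tiles down to only 0 for 128-bit tiles. A sketch of the fold, taken from the ld1b test added below (the function name @fold_slice is illustrative):

    declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)

    define void @fold_slice(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
      ; The +15 is absorbed into the immediate slice offset:
      ;   ld1b {za0h.b[w12, 15]}, p0/z, [x0]
      %tileslice = add i32 %sliceidx, 15
      call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
      ret void
    }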
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -556,6 +556,10 @@
   MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
                                          MachineBasicBlock *BB) const;
 
+  MachineBasicBlock *EmitTileLoad(unsigned Opc, unsigned BaseReg,
+                                  MachineInstr &MI,
+                                  MachineBasicBlock *BB) const;
+
   MachineBasicBlock *
   EmitInstrWithCustomInserter(MachineInstr &MI,
                               MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -2323,6 +2323,24 @@
   return BB;
 }
 
+MachineBasicBlock *
+AArch64TargetLowering::EmitTileLoad(unsigned Opc, unsigned BaseReg,
+                                    MachineInstr &MI,
+                                    MachineBasicBlock *BB) const {
+  const TargetInstrInfo *TII = Subtarget->getInstrInfo();
+  MachineInstrBuilder MIB = BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(Opc));
+
+  MIB.addReg(BaseReg + MI.getOperand(0).getImm(), RegState::Define);
+  MIB.add(MI.getOperand(1)); // slice index register
+  MIB.add(MI.getOperand(2)); // slice index offset
+  MIB.add(MI.getOperand(3)); // pg
+  MIB.add(MI.getOperand(4)); // base
+  MIB.add(MI.getOperand(5)); // offset
+
+  MI.eraseFromParent(); // The pseudo is gone now.
+  return BB;
+}
+
 MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
     MachineInstr &MI, MachineBasicBlock *BB) const {
   switch (MI.getOpcode()) {
@@ -2353,6 +2371,26 @@
   case AArch64::CATCHRET:
     return EmitLoweredCatchRet(MI, BB);
+  case AArch64::LD1_MXIPXX_H_PSEUDO_B:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_H_B, AArch64::ZAB0, MI, BB);
+  case AArch64::LD1_MXIPXX_H_PSEUDO_H:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_H_H, AArch64::ZAH0, MI, BB);
+  case AArch64::LD1_MXIPXX_H_PSEUDO_S:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_H_S, AArch64::ZAS0, MI, BB);
+  case AArch64::LD1_MXIPXX_H_PSEUDO_D:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_H_D, AArch64::ZAD0, MI, BB);
+  case AArch64::LD1_MXIPXX_H_PSEUDO_Q:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_H_Q, AArch64::ZAQ0, MI, BB);
+  case AArch64::LD1_MXIPXX_V_PSEUDO_B:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_V_B, AArch64::ZAB0, MI, BB);
+  case AArch64::LD1_MXIPXX_V_PSEUDO_H:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_V_H, AArch64::ZAH0, MI, BB);
+  case AArch64::LD1_MXIPXX_V_PSEUDO_S:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_V_S, AArch64::ZAS0, MI, BB);
+  case AArch64::LD1_MXIPXX_V_PSEUDO_D:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_V_D, AArch64::ZAD0, MI, BB);
+  case AArch64::LD1_MXIPXX_V_PSEUDO_Q:
+    return EmitTileLoad(AArch64::LD1_MXIPXX_V_Q, AArch64::ZAQ0, MI, BB);
   }
 }
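EmitTileLoad rewrites the pseudo's immediate tile operand into the concrete tile register BaseReg + imm, which is why the pseudos carry an i64imm tile operand rather than a register. A sketch of the effect, taken from the ld1h test added below (the function name @tile_index is illustrative):

    declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)

    define void @tile_index(<vscale x 16 x i1> %pg, i16* %ptr) {
      ; Tile operand i64 1 becomes ZAH0 + 1: ld1h {za1h.h[w12, 0]}, p0/z, [x0]
      call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
      ret void
    }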
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -338,6 +338,13 @@
   if (MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening))
     markSuperRegs(Reserved, AArch64::W16);
 
+  // SME tiles are not allocatable.
+  if (MF.getSubtarget<AArch64Subtarget>().hasSME()) {
+    for (MCSubRegIterator SubReg(AArch64::ZA, this, /*self=*/true);
+         SubReg.isValid(); ++SubReg)
+      Reserved.set(*SubReg);
+  }
+
   assert(checkAllSuperRegsMarked(Reserved));
   return Reserved;
 }
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -1212,26 +1212,28 @@
 
 // SME Register Classes
 
-// Accumulator array
-def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
-  let Size = 2048;
-}
+let isAllocatable = 0 in {
+  // Accumulator array
+  def MPR : RegisterClass<"AArch64", [untyped], 2048, (add ZA)> {
+    let Size = 2048;
+  }
 
-// Accumulator array as single tiles
-def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
-  let Size = 2048;
-}
-def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
-  let Size = 1024;
-}
-def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
-  let Size = 512;
-}
-def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
-  let Size = 256;
-}
-def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
-  let Size = 128;
+  // Accumulator array as single tiles
+  def MPR8 : RegisterClass<"AArch64", [untyped], 2048, (add (sequence "ZAB%u", 0, 0))> {
+    let Size = 2048;
+  }
+  def MPR16 : RegisterClass<"AArch64", [untyped], 1024, (add (sequence "ZAH%u", 0, 1))> {
+    let Size = 1024;
+  }
+  def MPR32 : RegisterClass<"AArch64", [untyped], 512, (add (sequence "ZAS%u", 0, 3))> {
+    let Size = 512;
+  }
+  def MPR64 : RegisterClass<"AArch64", [untyped], 256, (add (sequence "ZAD%u", 0, 7))> {
+    let Size = 256;
+  }
+  def MPR128 : RegisterClass<"AArch64", [untyped], 128, (add (sequence "ZAQ%u", 0, 15))> {
+    let Size = 128;
+  }
 }
 
 // SME Register Operands
diff --git a/llvm/lib/Target/AArch64/SMEInstrFormats.td b/llvm/lib/Target/AArch64/SMEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SMEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SMEInstrFormats.td
@@ -10,6 +10,18 @@
 //
 //===----------------------------------------------------------------------===//
 
+def imm_to_tile8   : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAB0>", []>;
+def imm_to_tile16  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAH0>", []>;
+def imm_to_tile32  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAS0>", []>;
+def imm_to_tile64  : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAD0>", []>;
+def imm_to_tile128 : ComplexPattern<i64, 1, "ImmToTile<AArch64::ZAQ0>", []>;
+
+def tileslice8   : ComplexPattern<i32, 2, "SelectSMETileSlice<4>", []>;
+def tileslice16  : ComplexPattern<i32, 2, "SelectSMETileSlice<3>", []>;
+def tileslice32  : ComplexPattern<i32, 2, "SelectSMETileSlice<2>", []>;
+def tileslice64  : ComplexPattern<i32, 2, "SelectSMETileSlice<1>", []>;
+def tileslice128 : ComplexPattern<i32, 2, "SelectSMETileSlice<0>", []>; // nop
+
 //===----------------------------------------------------------------------===//
 // SME Outer Products
 //===----------------------------------------------------------------------===//
@@ -233,6 +245,45 @@ defm NAME : sme_mem_ss_aliases<"ld1", inst, is_col, "/z">;
 }
 
+multiclass sme_mem_ld_ss_patterns<Instruction Inst, SDPatternOperator Load,
+                                  Operand tile_ty, Operand offset_ty,
+                                  ComplexPattern addr,
+                                  ComplexPattern tileslice> {
+  // base
+  def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
+             MatrixIndexGPR32Op12_15:$idx),
+            (Inst tile_ty:$tile, $idx, 0, $pg, $base, XZR)>;
+  // reg + reg
+  let AddedComplexity = 1 in {
+    def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+               tile_ty:$tile, MatrixIndexGPR32Op12_15:$idx),
+              (Inst tile_ty:$tile, $idx, 0, $pg, $base, $offset)>;
+  }
+
+  // base, tileslice
+  let AddedComplexity = 1 in {
+    def : Pat<(Load PPR3bAny:$pg, GPR64sp:$base, tile_ty:$tile,
+               (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+              (Inst tile_ty:$tile, $idx, $imm, $pg, $base, XZR)>;
+  }
+  // reg + reg, tileslice
+  let AddedComplexity = 2 in {
+    def : Pat<(Load PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+               tile_ty:$tile, (i32 (tileslice MatrixIndexGPR32Op12_15:$idx,
+               offset_ty:$imm))),
+              (Inst tile_ty:$tile, $idx, $imm, $pg, $base, $offset)>;
+  }
+}
+
+class sme_load_pseudo
+    : Pseudo<(outs), (ins i64imm:$tile, MatrixIndexGPR32Op12_15:$idx,
+                          i64imm:$imm, PPR3bAny:$pg, GPR64sp:$base, GPR64:$offset), []>,
+      Sched<[]> {
+  // Translated to the actual instructions in AArch64ISelLowering.cpp
+  let usesCustomInserter = 1;
+  let mayLoad = 1;
+}
+
 multiclass sme_mem_ld_v_ss<string mnemonic, bit is_col> {
   def _B : sme_mem_ld_ss_inst<0b0, 0b00, mnemonic # "b",
                               !if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -272,6 +323,40 @@
   }
 
   defm : sme_mem_ld_ss_aliases<NAME, is_col>;
+
+  // Pseudo instructions for lowering intrinsics, using immediates instead of
+  // tile registers.
+  def _PSEUDO_B : sme_load_pseudo;
+  def _PSEUDO_H : sme_load_pseudo;
+  def _PSEUDO_S : sme_load_pseudo;
+  def _PSEUDO_D : sme_load_pseudo;
+  def _PSEUDO_Q : sme_load_pseudo;
+
+  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_B),
+                                !if(is_col, int_aarch64_sme_ld1b_vert,
+                                            int_aarch64_sme_ld1b_horiz),
+                                sme_elm_idx0_0, imm0_15, am_sve_regreg_lsl0,
+                                tileslice8>;
+  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_H),
+                                !if(is_col, int_aarch64_sme_ld1h_vert,
+                                            int_aarch64_sme_ld1h_horiz),
+                                imm0_1, imm0_7, am_sve_regreg_lsl1,
+                                tileslice16>;
+  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_S),
+                                !if(is_col, int_aarch64_sme_ld1w_vert,
+                                            int_aarch64_sme_ld1w_horiz),
+                                imm0_3, imm0_3, am_sve_regreg_lsl2,
+                                tileslice32>;
+  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_D),
+                                !if(is_col, int_aarch64_sme_ld1d_vert,
+                                            int_aarch64_sme_ld1d_horiz),
+                                imm0_7, imm0_1, am_sve_regreg_lsl3,
+                                tileslice64>;
+  defm : sme_mem_ld_ss_patterns<!cast<Instruction>(NAME # _PSEUDO_Q),
+                                !if(is_col, int_aarch64_sme_ld1q_vert,
+                                            int_aarch64_sme_ld1q_horiz),
+                                imm0_15, sme_elm_idx0_0, am_sve_regreg_lsl4,
+                                tileslice128>;
 }
 
 multiclass sme_mem_ld_ss<string mnemonic> {
@@ -318,6 +403,36 @@ defm NAME : sme_mem_ss_aliases<"st1", inst, is_col>;
 }
 
+multiclass sme_mem_st_ss_patterns<Instruction Inst, SDPatternOperator Store,
+                                  Operand offset_ty,
+                                  ComplexPattern imm2tile,
+                                  ComplexPattern addr,
+                                  ComplexPattern tileslice> {
+  // base
+  def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
+             MatrixIndexGPR32Op12_15:$idx),
+            (Inst $tile, $idx, 0, $pg, $base, XZR)>;
+  // reg + reg
+  let AddedComplexity = 1 in {
+    def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+               (imm2tile untyped:$tile), MatrixIndexGPR32Op12_15:$idx),
+              (Inst $tile, $idx, 0, $pg, $base, $offset)>;
+  }
+  // base, tileslice
+  let AddedComplexity = 1 in {
+    def : Pat<(Store PPR3bAny:$pg, GPR64sp:$base, (imm2tile untyped:$tile),
+               (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+              (Inst $tile, $idx, $imm, $pg, $base, XZR)>;
+  }
+  // reg + reg, tileslice
+  let AddedComplexity = 2 in {
+    def : Pat<(Store PPR3bAny:$pg, (addr GPR64sp:$base, GPR64:$offset),
+               (imm2tile untyped:$tile),
+               (i32 (tileslice MatrixIndexGPR32Op12_15:$idx, offset_ty:$imm))),
+              (Inst $tile, $idx, $imm, $pg, $base, $offset)>;
+  }
+}
+
 multiclass sme_mem_st_v_ss<string mnemonic, bit is_col> {
   def _B : sme_mem_st_ss_inst<0b0, 0b00, mnemonic # "b",
                               !if(is_col, TileVectorOpV8, TileVectorOpH8),
@@ -357,6 +472,32 @@
   }
 
   defm : sme_mem_st_ss_aliases<NAME, is_col>;
+
+  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _B),
+                                !if(is_col, int_aarch64_sme_st1b_vert,
+                                            int_aarch64_sme_st1b_horiz),
+                                imm0_15, imm_to_tile8, am_sve_regreg_lsl0,
+                                tileslice8>;
+  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _H),
+                                !if(is_col, int_aarch64_sme_st1h_vert,
+                                            int_aarch64_sme_st1h_horiz),
+                                imm0_7, imm_to_tile16, am_sve_regreg_lsl1,
+                                tileslice16>;
+  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _S),
+                                !if(is_col, int_aarch64_sme_st1w_vert,
+                                            int_aarch64_sme_st1w_horiz),
+                                imm0_3, imm_to_tile32, am_sve_regreg_lsl2,
+                                tileslice32>;
+  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _D),
+                                !if(is_col, int_aarch64_sme_st1d_vert,
+                                            int_aarch64_sme_st1d_horiz),
+                                imm0_1, imm_to_tile64, am_sve_regreg_lsl3,
+                                tileslice64>;
+  defm : sme_mem_st_ss_patterns<!cast<Instruction>(NAME # _Q),
+                                !if(is_col, int_aarch64_sme_st1q_vert,
+                                            int_aarch64_sme_st1q_horiz),
+                                sme_elm_idx0_0, imm_to_tile128,
+                                am_sve_regreg_lsl4, tileslice128>;
 }
 
 multiclass sme_mem_st_ss<string mnemonic> {
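The am_sve_regreg_lsl* addressing-mode patterns (including the new lsl4 variant below) let a getelementptr on the base pointer fold into the reg+reg form, with the offset register shifted by the element size. A sketch, taken from the ld1h test added below (the function name @fold_gep is illustrative):

    declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)

    define void @fold_gep(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %slice) {
      ; The GEP folds into the address: ld1h {za0h.h[...]}, p0/z, [x0, x1, lsl #1]
      %base = getelementptr i16, i16* %ptr, i64 %index
      call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %slice)
      ret void
    }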
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -8489,6 +8489,7 @@
 def am_sve_regreg_lsl1 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<1>", []>;
 def am_sve_regreg_lsl2 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<2>", []>;
 def am_sve_regreg_lsl3 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<3>", []>;
+def am_sve_regreg_lsl4 : ComplexPattern<iPTR, 2, "SelectSVERegRegAddrMode<4>", []>;
 
 // Predicated pseudo floating point two operand instructions.
 multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-loads.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define void @ld1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1b {za0h.b[w12, 15]}, p0/z, [x0]
+; CHECK-NEXT:    ld1b {za0v.b[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 15
+  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
+  ret void;
+}
+
+define void @ld1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1b_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov w13, w2
+; CHECK-NEXT:    ld1b {za0h.b[w12, 0]}, p0/z, [x0, x1]
+; CHECK-NEXT:    ld1b {za0v.b[w13, 15]}, p0/z, [x0, x1]
+; CHECK-NEXT:    ret
+  %base = getelementptr i8, i8* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 15
+  call void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
+  ret void;
+}
+
+define void @ld1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1h {za0h.h[w12, 7]}, p0/z, [x0]
+; CHECK-NEXT:    ld1h {za1h.h[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1h {za0v.h[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1h {za1v.h[w12, 7]}, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 7
+  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
+  ret void;
+}
+
+define void @ld1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1h_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1h {za0h.h[w12, 7]}, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ld1h {za1v.h[w13, 0]}, p0/z, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %base = getelementptr i16, i16* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 7
+  call void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
+  ret void;
+}
+
+define void @ld1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov w13, w1
+; CHECK-NEXT:    ld1w {za0h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za1h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za2h.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za3h.s[w13, 3]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za0v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za1v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za2v.s[w13, 3]}, p0/z, [x0]
+; CHECK-NEXT:    ld1w {za3v.s[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 3
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
+  ret void;
+}
+
+define void @ld1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1w_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1w {za0h.s[w13, 0]}, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ld1w {za3v.s[w12, 3]}, p0/z, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %base = getelementptr i32, i32* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 3
+  call void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
+  ret void;
+}
+
+define void @ld1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: ld1d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    ld1d {za0h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za1h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za2h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za3h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za4h.d[w12, 1]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za5h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za6h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za7h.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za0v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za1v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za2v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za3v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za4v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za5v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za6v.d[w13, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1d {za7v.d[w12, 1]}, p0/z, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 1
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
+  ret void;
+}
+
+define void @ld1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: ld1d_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    ld1d {za0h.d[w12, 1]}, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ld1d {za7v.d[w13, 0]}, p0/z, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %base = getelementptr i64, i64* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 1
+  call void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
+  ret void;
+}
+
+define void @ld1q(<vscale x 16 x i1> %pg, i128* %ptr) {
+; CHECK-LABEL: ld1q:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    ld1q {za0h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za1h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za2h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za3h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za4h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za5h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za6h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za7h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za8h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za9h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za10h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za11h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za12h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za13h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za14h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za15h.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za0v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za1v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za2v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za3v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za4v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za5v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za6v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za7v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za8v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za9v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za10v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za11v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za12v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za13v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za14v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ld1q {za15v.q[w12, 0]}, p0/z, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+  ret void;
+}
+
+define void @ld1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
+; CHECK-LABEL: ld1q_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    ld1q {za0h.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT:    ld1q {za15v.q[w12, 0]}, p0/z, [x0, x1, lsl #4]
+; CHECK-NEXT:    ret
+  %base = getelementptr i128, i128* %ptr, i64 %index
+  call void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
+  ret void;
+}
+
+declare void @llvm.aarch64.sme.ld1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.ld1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.ld1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.ld1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.ld1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
+declare void @llvm.aarch64.sme.ld1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.ld1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.ld1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.ld1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.ld1q.vert(<vscale x 16 x i1>, i128*, i64, i32)
diff --git a/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/SME/sme-intrinsics-stores.ll
@@ -0,0 +1,258 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme -verify-machineinstrs < %s | FileCheck %s
+
+define void @st1b(<vscale x 16 x i1> %pg, i8* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    st1b {za0h.b[w12, 15]}, p0, [x0]
+; CHECK-NEXT:    st1b {za0v.b[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 15
+  call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %ptr, i64 0, i32 0)
+  ret void;
+}
+
+define void @st1b_with_addr_offset(<vscale x 16 x i1> %pg, i8* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1b_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov w13, w2
+; CHECK-NEXT:    st1b {za0h.b[w12, 0]}, p0, [x0, x1]
+; CHECK-NEXT:    st1b {za0v.b[w13, 15]}, p0, [x0, x1]
+; CHECK-NEXT:    ret
+  %base = getelementptr i8, i8* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 15
+  call void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1> %pg, i8* %base, i64 0, i32 %tileslice)
+  ret void;
+}
+
+define void @st1h(<vscale x 16 x i1> %pg, i16* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    st1h {za0h.h[w12, 7]}, p0, [x0]
+; CHECK-NEXT:    st1h {za1h.h[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1h {za0v.h[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1h {za1v.h[w12, 7]}, p0, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 7
+  call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %ptr, i64 1, i32 %tileslice)
+  ret void;
+}
+
+define void @st1h_with_addr_offset(<vscale x 16 x i1> %pg, i16* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1h_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    st1h {za0h.h[w12, 7]}, p0, [x0, x1, lsl #1]
+; CHECK-NEXT:    st1h {za1v.h[w13, 0]}, p0, [x0, x1, lsl #1]
+; CHECK-NEXT:    ret
+  %base = getelementptr i16, i16* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 7
+  call void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1> %pg, i16* %base, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1> %pg, i16* %base, i64 1, i32 0)
+  ret void;
+}
+
+define void @st1w(<vscale x 16 x i1> %pg, i32* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1w:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    st1w {za0h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1w {za1h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1w {za2h.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1w {za3h.s[w12, 3]}, p0, [x0]
+; CHECK-NEXT:    st1w {za0v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1w {za1v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1w {za2v.s[w12, 3]}, p0, [x0]
+; CHECK-NEXT:    st1w {za3v.s[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 3
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 2, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %ptr, i64 3, i32 0)
+  ret void;
+}
+
+define void @st1w_with_addr_offset(<vscale x 16 x i1> %pg, i32* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1w_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    mov w13, w2
+; CHECK-NEXT:    st1w {za0h.s[w12, 0]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT:    st1w {za3v.s[w13, 3]}, p0, [x0, x1, lsl #2]
+; CHECK-NEXT:    ret
+  %base = getelementptr i32, i32* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 3
+  call void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1> %pg, i32* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1> %pg, i32* %base, i64 3, i32 %tileslice)
+  ret void;
+}
+
+define void @st1d(<vscale x 16 x i1> %pg, i64* %ptr, i32 %sliceidx) {
+; CHECK-LABEL: st1d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    mov w12, w1
+; CHECK-NEXT:    st1d {za0h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za1h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za2h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za3h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za4h.d[w12, 1]}, p0, [x0]
+; CHECK-NEXT:    st1d {za5h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za6h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za7h.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za0v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za1v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za2v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za3v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za4v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za5v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za6v.d[w13, 0]}, p0, [x0]
+; CHECK-NEXT:    st1d {za7v.d[w12, 1]}, p0, [x0]
+; CHECK-NEXT:    ret
+  %tileslice = add i32 %sliceidx, 1
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %ptr, i64 7, i32 %tileslice)
+  ret void;
+}
+
+define void @st1d_with_addr_offset(<vscale x 16 x i1> %pg, i64* %ptr, i64 %index, i32 %sliceidx) {
+; CHECK-LABEL: st1d_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, w2
+; CHECK-NEXT:    mov w13, wzr
+; CHECK-NEXT:    st1d {za0h.d[w12, 1]}, p0, [x0, x1, lsl #3]
+; CHECK-NEXT:    st1d {za7v.d[w13, 0]}, p0, [x0, x1, lsl #3]
+; CHECK-NEXT:    ret
+  %base = getelementptr i64, i64* %ptr, i64 %index
+  %tileslice = add i32 %sliceidx, 1
+  call void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1> %pg, i64* %base, i64 0, i32 %tileslice)
+  call void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1> %pg, i64* %base, i64 7, i32 0)
+  ret void;
+}
+
+define void @st1q(<vscale x 16 x i1> %pg, i128* %ptr) {
+; CHECK-LABEL: st1q:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    st1q {za0h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za1h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za2h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za3h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za4h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za5h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za6h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za7h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za8h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za9h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za10h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za11h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za12h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za13h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za14h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za15h.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za0v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za1v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za2v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za3v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za4v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za5v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za6v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za7v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za8v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za9v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za10v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za11v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za12v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za13v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za14v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    st1q {za15v.q[w12, 0]}, p0, [x0]
+; CHECK-NEXT:    ret
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 1, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 2, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 3, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 4, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 5, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 6, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 7, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 8, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 9, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 10, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 11, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 12, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 13, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 14, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %ptr, i64 15, i32 0)
+  ret void;
+}
+
+define void @st1q_with_addr_offset(<vscale x 16 x i1> %pg, i128* %ptr, i64 %index) {
+; CHECK-LABEL: st1q_with_addr_offset:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    mov w12, wzr
+; CHECK-NEXT:    st1q {za0h.q[w12, 0]}, p0, [x0, x1, lsl #4]
+; CHECK-NEXT:    st1q {za15v.q[w12, 0]}, p0, [x0, x1, lsl #4]
+; CHECK-NEXT:    ret
+  %base = getelementptr i128, i128* %ptr, i64 %index
+  call void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1> %pg, i128* %base, i64 0, i32 0)
+  call void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1> %pg, i128* %base, i64 15, i32 0)
+  ret void;
+}
+
+declare void @llvm.aarch64.sme.st1b.horiz(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.st1h.horiz(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.st1w.horiz(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.st1d.horiz(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.st1q.horiz(<vscale x 16 x i1>, i128*, i64, i32)
+declare void @llvm.aarch64.sme.st1b.vert(<vscale x 16 x i1>, i8*, i64, i32)
+declare void @llvm.aarch64.sme.st1h.vert(<vscale x 16 x i1>, i16*, i64, i32)
+declare void @llvm.aarch64.sme.st1w.vert(<vscale x 16 x i1>, i32*, i64, i32)
+declare void @llvm.aarch64.sme.st1d.vert(<vscale x 16 x i1>, i64*, i64, i32)
+declare void @llvm.aarch64.sme.st1q.vert(<vscale x 16 x i1>, i128*, i64, i32)