diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2600,6 +2600,46 @@
 def int_aarch64_sve_bfdot_lane_v2   : SVE_4Vec_BF16_Indexed;
 def int_aarch64_sve_bfmlalb_lane_v2 : SVE_4Vec_BF16_Indexed;
 def int_aarch64_sve_bfmlalt_lane_v2 : SVE_4Vec_BF16_Indexed;
+
+//
+// SVE2.1 - Contiguous loads to multiple consecutive vectors
+//
+
+  class SVE2p1_Load_PN_X2_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                            [llvm_aarch64_svcount_ty, llvm_ptr_ty],
+                            [IntrReadMem, IntrArgMemOnly]>;
+
+  class SVE2p1_Load_PN_X4_Intrinsic
+    : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>,
+                             LLVMMatchType<0>, LLVMMatchType<0>],
+                            [llvm_aarch64_svcount_ty, llvm_ptr_ty],
+                            [IntrReadMem, IntrArgMemOnly]>;
+
+def int_aarch64_sve_ld1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
+def int_aarch64_sve_ld1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
+def int_aarch64_sve_ldnt1_pn_x2 : SVE2p1_Load_PN_X2_Intrinsic;
+def int_aarch64_sve_ldnt1_pn_x4 : SVE2p1_Load_PN_X4_Intrinsic;
+
+//
+// SVE2.1 - Contiguous stores to multiple consecutive vectors
+//
+
+  class SVE2p1_Store_PN_X2_Intrinsic
+    : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>,
+                                  llvm_aarch64_svcount_ty, llvm_ptr_ty ],
+                            [IntrWriteMem, IntrArgMemOnly]>;
+
+  class SVE2p1_Store_PN_X4_Intrinsic
+    : DefaultAttrsIntrinsic<[], [ llvm_anyvector_ty, LLVMMatchType<0>,
+                                  LLVMMatchType<0>, LLVMMatchType<0>,
+                                  llvm_aarch64_svcount_ty, llvm_ptr_ty ],
+                            [IntrWriteMem, IntrArgMemOnly]>;
+
+def int_aarch64_sve_st1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic;
+def int_aarch64_sve_st1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic;
+def int_aarch64_sve_stnt1_pn_x2 : SVE2p1_Store_PN_X2_Intrinsic;
+def int_aarch64_sve_stnt1_pn_x4 : SVE2p1_Store_PN_X4_Intrinsic;
 }

 //
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -370,6 +370,9 @@
   void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, unsigned Scale,
                             unsigned Opc_rr, unsigned Opc_ri,
                             bool IsIntr = false);
+  void SelectContiguousMultiVectorLoad(SDNode *N, unsigned NumVecs,
+                                       unsigned Scale, unsigned Opc_rr,
+                                       unsigned Opc_ri);
   void SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs,
                                        bool IsZmMulti, unsigned Opcode,
                                        bool HasPred = false);
@@ -1779,6 +1782,39 @@
   CurDAG->RemoveDeadNode(N);
 }

+void AArch64DAGToDAGISel::SelectContiguousMultiVectorLoad(SDNode *N,
+                                                          unsigned NumVecs,
+                                                          unsigned Scale,
+                                                          unsigned Opc_ri,
+                                                          unsigned Opc_rr) {
+  assert(Scale < 4 && "Invalid scaling value.");
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Chain = N->getOperand(0);
+
+  // Use simplest addressing mode for now - base + 0 offset
+  SDValue PNg = N->getOperand(2);
+  SDValue Base = N->getOperand(3);
+  SDValue Offset = CurDAG->getTargetConstant(0, DL, MVT::i64);
+
+  SDValue Ops[] = {PNg,            // Predicate-as-counter
+                   Base,           // Memory operand
+                   Offset, Chain};
+
+  const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+  SDNode *Load = CurDAG->getMachineNode(Opc_ri, DL, ResTys, Ops);
+  SDValue SuperReg = SDValue(Load, 0);
+  for (unsigned i = 0; i < NumVecs; ++i)
+    ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+                                   AArch64::zsub0 + i, DL, VT, SuperReg));
+
+  // Copy chain
+  unsigned ChainIdx = NumVecs;
+  ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+  CurDAG->RemoveDeadNode(N);
+}
+
 void
 AArch64DAGToDAGISel::SelectFrintFromVT(SDNode *N, unsigned NumVecs,
                                        unsigned Opcode) {
   if (N->getValueType(0) != MVT::nxv4f32)
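As a reference for the intrinsic shape above, here is a minimal IR-level sketch of an x2 load; the wrapper name @ld1_x2_example is purely illustrative, and the full set of element-type and x4 variants is exercised by the tests added further down.

declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr)

define <vscale x 16 x i8> @ld1_x2_example(target("aarch64.svcount") %pn, ptr %ptr) {
  ; A single call returns both consecutive vectors as a two-element struct;
  ; SelectContiguousMultiVectorLoad above maps it onto one LD1B (two-register
  ; form) and extracts each result from the Untyped node via zsub0/zsub1.
  %pair = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr)
  %lo = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %pair, 0
  ret <vscale x 16 x i8> %lo
}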
@@ -4655,6 +4691,74 @@
       }
       break;
     }
+    case Intrinsic::aarch64_sve_ld1_pn_x2: {
+      if (VT == MVT::nxv16i8) {
+        SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sve_ld1_pn_x4: {
+      if (VT == MVT::nxv16i8) {
+        SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sve_ldnt1_pn_x2: {
+      if (VT == MVT::nxv16i8) {
+        SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z);
+        return;
+      }
+      break;
+    }
+    case Intrinsic::aarch64_sve_ldnt1_pn_x4: {
+      if (VT == MVT::nxv16i8) {
+        SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z);
+        return;
+      } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+                 VT == MVT::nxv8bf16) {
+        SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z);
+        return;
+      } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+        SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z);
+        return;
+      } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+        SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z);
+        return;
+      }
+      break;
+    }
     case Intrinsic::aarch64_sve_ld3_sret: {
       if (VT == MVT::nxv16i8) {
         SelectPredicatedLoad(Node, 3, 0, AArch64::LD3B_IMM, AArch64::LD3B,
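The stores need no ISelDAGToDAG hook; they are matched entirely by the TableGen patterns in the next file. A minimal sketch of the IR those patterns are written against (the wrapper name @st1_x2_example is illustrative only):

declare void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)

define void @st1_x2_example(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr) {
  ; The two data operands are gathered into a ZPR2Mul2 REG_SEQUENCE and the
  ; call is selected to ST1W (two-register form) with a base register and a
  ; #0 immediate offset, mirroring the load path above.
  call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32> %z0, <vscale x 4 x i32> %z1, target("aarch64.svcount") %pn, ptr %ptr)
  ret void
}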
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -3837,6 +3837,59 @@
 defm STNT1W_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1w", 0b10, 0b1, ZZZZ_s_mul_r>;
 defm STNT1D_4Z_IMM : sve2p1_mem_cst_si_4z<"stnt1d", 0b11, 0b1, ZZZZ_d_mul_r>;

+multiclass store_pn_x2<ValueType Ty, SDPatternOperator Store,
+                       Instruction RegImmInst> {
+  def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1),
+                   (aarch64svcount PPR:$PNg), GPR64:$base),
+            (RegImmInst (REG_SEQUENCE ZPR2Mul2, Ty:$vec0, zsub0, Ty:$vec1, zsub1),
+                        PPR:$PNg, GPR64:$base, (i64 0))>;
+}
+
+// Stores of 2 consecutive vectors
+defm : store_pn_x2<nxv16i8, int_aarch64_sve_st1_pn_x2, ST1B_2Z_IMM>;
+defm : store_pn_x2<nxv8i16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv4i32, int_aarch64_sve_st1_pn_x2, ST1W_2Z_IMM>;
+defm : store_pn_x2<nxv2i64, int_aarch64_sve_st1_pn_x2, ST1D_2Z_IMM>;
+defm : store_pn_x2<nxv8f16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv8bf16, int_aarch64_sve_st1_pn_x2, ST1H_2Z_IMM>;
+defm : store_pn_x2<nxv4f32, int_aarch64_sve_st1_pn_x2, ST1W_2Z_IMM>;
+defm : store_pn_x2<nxv2f64, int_aarch64_sve_st1_pn_x2, ST1D_2Z_IMM>;
+defm : store_pn_x2<nxv16i8, int_aarch64_sve_stnt1_pn_x2, STNT1B_2Z_IMM>;
+defm : store_pn_x2<nxv8i16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv4i32, int_aarch64_sve_stnt1_pn_x2, STNT1W_2Z_IMM>;
+defm : store_pn_x2<nxv2i64, int_aarch64_sve_stnt1_pn_x2, STNT1D_2Z_IMM>;
+defm : store_pn_x2<nxv8f16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv8bf16, int_aarch64_sve_stnt1_pn_x2, STNT1H_2Z_IMM>;
+defm : store_pn_x2<nxv4f32, int_aarch64_sve_stnt1_pn_x2, STNT1W_2Z_IMM>;
+defm : store_pn_x2<nxv2f64, int_aarch64_sve_stnt1_pn_x2, STNT1D_2Z_IMM>;
+
+multiclass store_pn_x4<ValueType Ty, SDPatternOperator Store,
+                       Instruction RegImmInst> {
+  def : Pat<(Store (Ty ZPR:$vec0), (Ty ZPR:$vec1), (Ty ZPR:$vec2), (Ty ZPR:$vec3),
+                   (aarch64svcount PPR:$PNg), GPR64:$base),
+            (RegImmInst (REG_SEQUENCE ZPR4Mul4, Ty:$vec0, zsub0, Ty:$vec1, zsub1,
+                                      Ty:$vec2, zsub2, Ty:$vec3, zsub3),
+                        PPR:$PNg, GPR64:$base, (i64 0))>;
+}
+
+// Stores of 4 consecutive vectors
+defm : store_pn_x4<nxv16i8, int_aarch64_sve_st1_pn_x4, ST1B_4Z_IMM>;
+defm : store_pn_x4<nxv8i16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv4i32, int_aarch64_sve_st1_pn_x4, ST1W_4Z_IMM>;
+defm : store_pn_x4<nxv2i64, int_aarch64_sve_st1_pn_x4, ST1D_4Z_IMM>;
+defm : store_pn_x4<nxv8f16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv8bf16, int_aarch64_sve_st1_pn_x4, ST1H_4Z_IMM>;
+defm : store_pn_x4<nxv4f32, int_aarch64_sve_st1_pn_x4, ST1W_4Z_IMM>;
+defm : store_pn_x4<nxv2f64, int_aarch64_sve_st1_pn_x4, ST1D_4Z_IMM>;
+defm : store_pn_x4<nxv16i8, int_aarch64_sve_stnt1_pn_x4, STNT1B_4Z_IMM>;
+defm : store_pn_x4<nxv8i16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv4i32, int_aarch64_sve_stnt1_pn_x4, STNT1W_4Z_IMM>;
+defm : store_pn_x4<nxv2i64, int_aarch64_sve_stnt1_pn_x4, STNT1D_4Z_IMM>;
+defm : store_pn_x4<nxv8f16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv8bf16, int_aarch64_sve_stnt1_pn_x4, STNT1H_4Z_IMM>;
+defm : store_pn_x4<nxv4f32, int_aarch64_sve_stnt1_pn_x4, STNT1W_4Z_IMM>;
+defm : store_pn_x4<nxv2f64, int_aarch64_sve_stnt1_pn_x4, STNT1D_4Z_IMM>;
+
 defm WHILEGE_2PXX : sve2p1_int_while_rr_pair<"whilege", 0b000>;
 defm WHILEGT_2PXX : sve2p1_int_while_rr_pair<"whilegt", 0b001>;
 defm WHILELT_2PXX : sve2p1_int_while_rr_pair<"whilelt", 0b010>;
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll
new file
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-loads.ll
@@ -0,0 +1,648 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s
+; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s
+
+; == Normal Multi-Vector Consecutive Loads ==
+
+define { <vscale x 16 x i8>, <vscale x 16 x i8> } @ld1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i8:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    ld1b { z0.b, z1.b }, pn8/z, [x0]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr);
+  ret { <vscale x 16 x i8>, <vscale x 16 x i8> } %res
+}
+
+define { <vscale x 8 x i16>, <vscale x 8 x i16> } @ld1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    ld1h { z0.h, z1.h }, pn8/z, [x0]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr);
+  ret { <vscale x 8 x i16>, <vscale x 8 x i16> } %res
+}
+
+define { <vscale x 4 x i32>, <vscale x 4 x i32> } @ld1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: ld1_x2_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]!
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ld1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_x2_i8_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_x2_i8_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z2.b, z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.b, z0.b, z2.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ld1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ld1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ld1_x4_i16_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ld1_x4_i16_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.h, z0.h, z4.h +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + + +; == Non-temporal Multi-Vector Consecutive Loads == + +define { , } @ldnt1_x2_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +define { , } @ldnt1_x2_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_x2_i32_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_x2_i32_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z2.s, z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.s, z0.s, z2.s +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +define { , , , } @ldnt1_x4_i8(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_i64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_bf16(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f32(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +define { , , , } @ldnt1_x4_f64(target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ldnt1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount") %pn, ptr %ptr); + ret { , , , } %res +} + +; Test to ensure we load into the correct registers for the instruction +define @ldnt1_x4_i64_z0_taken(target("aarch64.svcount") %pn, ptr %ptr, %val) { +; CHECK-LABEL: ldnt1_x4_i64_z0_taken: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ldnt1d { z4.d - z7.d }, pn8/z, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: add z0.d, z0.d, z4.d +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ld1 = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr); + %ld1_0 = extractvalue { , , , } %ld1, 0 + %res = add %val, %ld1_0 + ret %res +} + +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2f64(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4f32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8f16(target("aarch64.svcount"), ptr) +declare { , } 
@llvm.aarch64.sve.ldnt1.pn.x2.nxv8bf16(target("aarch64.svcount"), ptr) + +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2f64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4f32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8f16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8bf16(target("aarch64.svcount"), ptr) diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-stores.ll @@ -0,0 +1,650 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1,+bf16 < %s | FileCheck %s +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2,+bf16 < %s | FileCheck %s + +; == Normal Multi-Vector Consecutive Stores == + +define void @st1_x2_i8( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1b { z2.b, z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv4i32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_i64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv2i64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8f16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_bf16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1w { z2.s, z3.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv4f32( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x2_f64( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x2_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: st1d { z2.d, z3.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x2.nxv2f64( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i8( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1b { z4.b - z7.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv16i8( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8i16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv4i32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_i64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv2i64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8f16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_bf16( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1h { z4.h - z7.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f32( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1w { z4.s - z7.s }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv4f32( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @st1_x4_f64( %unused, %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: st1_x4_f64: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z7.d, z4.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov z6.d, z3.d +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z5.d, z2.d +; CHECK-NEXT: mov z4.d, z1.d +; CHECK-NEXT: st1d { z4.d - z7.d }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.st1.pn.x4.nxv2f64( %zn0, %zn1, %zn2, %zn3, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +; == Non-temporal Multi-Vector Consecutive Stores == + +define void @stnt1_x2_i8( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1b { z2.b, z3.b }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_i16( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: mov z3.d, z2.d +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: mov z2.d, z1.d +; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0] +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16( %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr); + ret void +} + +define void @stnt1_x2_i32( %unused, %zn0, %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: stnt1_x2_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x2_i64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x2_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x2_bf16(<vscale x 8 x bfloat> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1h { z2.h, z3.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x2_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1w { z2.s, z3.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x2_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x2_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z3.d, z2.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z2.d, z1.d
+; CHECK-NEXT: stnt1d { z2.d, z3.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_i8(<vscale x 16 x i8> %unused, <vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1b { z4.b - z7.b }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8> %zn0, <vscale x 16 x i8> %zn1, <vscale x 16 x i8> %zn2, <vscale x 16 x i8> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_i16(<vscale x 8 x i16> %unused, <vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16> %zn0, <vscale x 8 x i16> %zn1, <vscale x 8 x i16> %zn2, <vscale x 8 x i16> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_i32(<vscale x 4 x i32> %unused, <vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32> %zn0, <vscale x 4 x i32> %zn1, <vscale x 4 x i32> %zn2, <vscale x 4 x i32> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_i64(<vscale x 2 x i64> %unused, <vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64> %zn0, <vscale x 2 x i64> %zn1, <vscale x 2 x i64> %zn2, <vscale x 2 x i64> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_f16(<vscale x 8 x half> %unused, <vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half> %zn0, <vscale x 8 x half> %zn1, <vscale x 8 x half> %zn2, <vscale x 8 x half> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_bf16(<vscale x 8 x bfloat> %unused, <vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1h { z4.h - z7.h }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat> %zn0, <vscale x 8 x bfloat> %zn1, <vscale x 8 x bfloat> %zn2, <vscale x 8 x bfloat> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_f32(<vscale x 4 x float> %unused, <vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1w { z4.s - z7.s }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float> %zn0, <vscale x 4 x float> %zn1, <vscale x 4 x float> %zn2, <vscale x 4 x float> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+define void @stnt1_x4_f64(<vscale x 2 x double> %unused, <vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr) nounwind {
+; CHECK-LABEL: stnt1_x4_f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: mov z7.d, z4.d
+; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov z6.d, z3.d
+; CHECK-NEXT: mov p8.b, p0.b
+; CHECK-NEXT: mov z5.d, z2.d
+; CHECK-NEXT: mov z4.d, z1.d
+; CHECK-NEXT: stnt1d { z4.d - z7.d }, pn8, [x0]
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+  call void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double> %zn0, <vscale x 2 x double> %zn1, <vscale x 2 x double> %zn2, <vscale x 2 x double> %zn3, target("aarch64.svcount") %pn, ptr %ptr);
+  ret void
+}
+
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x2.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+
+
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.st1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, <vscale x 8 x i16>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4i32(<vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, <vscale x 4 x i32>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2i64(<vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, <vscale x 2 x i64>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8f16(<vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, <vscale x 8 x half>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv8bf16(<vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, <vscale x 8 x bfloat>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv4f32(<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, target("aarch64.svcount"), ptr)
+declare void @llvm.aarch64.sve.stnt1.pn.x4.nxv2f64(<vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, <vscale x 2 x double>, target("aarch64.svcount"), ptr)