diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -66,6 +66,11 @@
   bool expandMBB(MachineBasicBlock &MBB);
   bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                 MachineBasicBlock::iterator &NextMBBI);
+  bool expandMultiVecPseudo(MachineBasicBlock &MBB,
+                            MachineBasicBlock::iterator MBBI,
+                            const TargetRegisterClass &ContiguousClass,
+                            const TargetRegisterClass &StridedClass,
+                            unsigned ContiguousOpc, unsigned StridedOpc);
   bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
                     unsigned BitSize);
 
@@ -1038,6 +1043,35 @@
   return EndBB;
 }
 
+/// Expand a multi-vector pseudo to its contiguous or strided opcode, chosen
+/// by which register class contains the tuple register in operand 0.
+bool AArch64ExpandPseudo::expandMultiVecPseudo(
+    MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+    const TargetRegisterClass &ContiguousClass,
+    const TargetRegisterClass &StridedClass, unsigned ContiguousOpc,
+    unsigned StridedOpc) {
+  MachineInstr &MI = *MBBI;
+  const Register Tuple = MI.getOperand(0).getReg();
+
+  // Ask the class directly instead of scanning its register list.
+  unsigned Opc;
+  if (ContiguousClass.contains(Tuple))
+    Opc = ContiguousOpc;
+  else if (StridedClass.contains(Tuple))
+    Opc = StridedOpc;
+  else
+    llvm_unreachable("Cannot expand Multi-Vector pseudo");
+
+  MachineInstrBuilder MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+                                .add(MI.getOperand(0))
+                                .add(MI.getOperand(1))
+                                .add(MI.getOperand(2))
+                                .add(MI.getOperand(3));
+  transferImpOps(MI, MIB, MIB);
+  MI.eraseFromParent();
+  return true;
+}
+
 /// If MBBI references a pseudo instruction that should be expanded here,
 /// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, @@ -1492,6 +1526,134 @@ MI.eraseFromParent(); return true; } + case AArch64::LD1B_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z_STRIDED_IMM); + case AArch64::LD1H_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z_STRIDED_IMM); + case AArch64::LD1W_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z_STRIDED_IMM); + case AArch64::LD1D_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z_STRIDED_IMM); + case AArch64::LDNT1B_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z_STRIDED_IMM); + case AArch64::LDNT1H_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z_STRIDED_IMM); + case AArch64::LDNT1W_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z_STRIDED_IMM); + case AArch64::LDNT1D_2Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z_STRIDED_IMM); + case AArch64::LD1B_2Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass, + AArch64::ZPR2StridedRegClass, AArch64::LD1B_2Z, + AArch64::LD1B_2Z_STRIDED); + case AArch64::LD1H_2Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass, + AArch64::ZPR2StridedRegClass, AArch64::LD1H_2Z, + AArch64::LD1H_2Z_STRIDED); + 
case AArch64::LD1W_2Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass, + AArch64::ZPR2StridedRegClass, AArch64::LD1W_2Z, + AArch64::LD1W_2Z_STRIDED); + case AArch64::LD1D_2Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR2RegClass, + AArch64::ZPR2StridedRegClass, AArch64::LD1D_2Z, + AArch64::LD1D_2Z_STRIDED); + case AArch64::LDNT1B_2Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1B_2Z, AArch64::LDNT1B_2Z_STRIDED); + case AArch64::LDNT1H_2Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1H_2Z, AArch64::LDNT1H_2Z_STRIDED); + case AArch64::LDNT1W_2Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1W_2Z, AArch64::LDNT1W_2Z_STRIDED); + case AArch64::LDNT1D_2Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR2RegClass, AArch64::ZPR2StridedRegClass, + AArch64::LDNT1D_2Z, AArch64::LDNT1D_2Z_STRIDED); + case AArch64::LD1B_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z_STRIDED_IMM); + case AArch64::LD1H_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z_STRIDED_IMM); + case AArch64::LD1W_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z_STRIDED_IMM); + case AArch64::LD1D_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z_STRIDED_IMM); + case AArch64::LDNT1B_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1B_4Z_IMM, 
AArch64::LDNT1B_4Z_STRIDED_IMM); + case AArch64::LDNT1H_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z_STRIDED_IMM); + case AArch64::LDNT1W_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z_STRIDED_IMM); + case AArch64::LDNT1D_4Z_IMM_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z_STRIDED_IMM); + case AArch64::LD1B_4Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass, + AArch64::ZPR4StridedRegClass, AArch64::LD1B_4Z, + AArch64::LD1B_4Z_STRIDED); + case AArch64::LD1H_4Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass, + AArch64::ZPR4StridedRegClass, AArch64::LD1H_4Z, + AArch64::LD1H_4Z_STRIDED); + case AArch64::LD1W_4Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass, + AArch64::ZPR4StridedRegClass, AArch64::LD1W_4Z, + AArch64::LD1W_4Z_STRIDED); + case AArch64::LD1D_4Z_PSEUDO: + return expandMultiVecPseudo(MBB, MBBI, AArch64::ZPR4RegClass, + AArch64::ZPR4StridedRegClass, AArch64::LD1D_4Z, + AArch64::LD1D_4Z_STRIDED); + case AArch64::LDNT1B_4Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1B_4Z, AArch64::LDNT1B_4Z_STRIDED); + case AArch64::LDNT1H_4Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1H_4Z, AArch64::LDNT1H_4Z_STRIDED); + case AArch64::LDNT1W_4Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + AArch64::LDNT1W_4Z, AArch64::LDNT1W_4Z_STRIDED); + case AArch64::LDNT1D_4Z_PSEUDO: + return expandMultiVecPseudo( + MBB, MBBI, AArch64::ZPR4RegClass, AArch64::ZPR4StridedRegClass, + 
AArch64::LDNT1D_4Z, AArch64::LDNT1D_4Z_STRIDED); } return false; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp @@ -4660,68 +4660,188 @@ } case Intrinsic::aarch64_sve_ld1_pn_x2: { if (VT == MVT::nxv16i8) { - SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM, AArch64::LD1B_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 2, 0, AArch64::LD1B_2Z_IMM_PSEUDO, AArch64::LD1B_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LD1B_2Z_IMM, + AArch64::LD1B_2Z); + else + break; return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM, AArch64::LD1H_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 2, 1, AArch64::LD1H_2Z_IMM_PSEUDO, AArch64::LD1H_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LD1H_2Z_IMM, + AArch64::LD1H_2Z); + else + break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM, AArch64::LD1W_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 2, 2, AArch64::LD1W_2Z_IMM_PSEUDO, AArch64::LD1W_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LD1W_2Z_IMM, + AArch64::LD1W_2Z); + else + break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM, AArch64::LD1D_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 2, 3, AArch64::LD1D_2Z_IMM_PSEUDO, AArch64::LD1D_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LD1D_2Z_IMM, + 
AArch64::LD1D_2Z); + else + break; return; } break; } case Intrinsic::aarch64_sve_ld1_pn_x4: { if (VT == MVT::nxv16i8) { - SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM, AArch64::LD1B_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 4, 0, AArch64::LD1B_4Z_IMM_PSEUDO, AArch64::LD1B_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LD1B_4Z_IMM, + AArch64::LD1B_4Z); + else + break; return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM, AArch64::LD1H_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 4, 1, AArch64::LD1H_4Z_IMM_PSEUDO, AArch64::LD1H_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LD1H_4Z_IMM, + AArch64::LD1H_4Z); + else + break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM, AArch64::LD1W_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 4, 2, AArch64::LD1W_4Z_IMM_PSEUDO, AArch64::LD1W_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LD1W_4Z_IMM, + AArch64::LD1W_4Z); + else + break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM, AArch64::LD1D_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad( + Node, 4, 3, AArch64::LD1D_4Z_IMM_PSEUDO, AArch64::LD1D_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LD1D_4Z_IMM, + AArch64::LD1D_4Z); + else + break; return; } break; } case Intrinsic::aarch64_sve_ldnt1_pn_x2: { if (VT == MVT::nxv16i8) { - SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM, AArch64::LDNT1B_2Z); + if (Subtarget->hasSME2()) + 
SelectContiguousMultiVectorLoad(Node, 2, 0, + AArch64::LDNT1B_2Z_IMM_PSEUDO, + AArch64::LDNT1B_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 0, AArch64::LDNT1B_2Z_IMM, + AArch64::LDNT1B_2Z); + else + break; return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM, AArch64::LDNT1H_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 2, 1, + AArch64::LDNT1H_2Z_IMM_PSEUDO, + AArch64::LDNT1H_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 1, AArch64::LDNT1H_2Z_IMM, + AArch64::LDNT1H_2Z); + else + break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM, AArch64::LDNT1W_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 2, 2, + AArch64::LDNT1W_2Z_IMM_PSEUDO, + AArch64::LDNT1W_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 2, AArch64::LDNT1W_2Z_IMM, + AArch64::LDNT1W_2Z); + else + break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM, AArch64::LDNT1D_2Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 2, 3, + AArch64::LDNT1D_2Z_IMM_PSEUDO, + AArch64::LDNT1D_2Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 2, 3, AArch64::LDNT1D_2Z_IMM, + AArch64::LDNT1D_2Z); + else + break; return; } break; } case Intrinsic::aarch64_sve_ldnt1_pn_x4: { if (VT == MVT::nxv16i8) { - SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM, AArch64::LDNT1B_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 4, 0, + AArch64::LDNT1B_4Z_IMM_PSEUDO, + AArch64::LDNT1B_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 0, AArch64::LDNT1B_4Z_IMM, 
+ AArch64::LDNT1B_4Z); + else + break; return; } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 || VT == MVT::nxv8bf16) { - SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM, AArch64::LDNT1H_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 4, 1, + AArch64::LDNT1H_4Z_IMM_PSEUDO, + AArch64::LDNT1H_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 1, AArch64::LDNT1H_4Z_IMM, + AArch64::LDNT1H_4Z); + else + break; return; } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) { - SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM, AArch64::LDNT1W_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 4, 2, + AArch64::LDNT1W_4Z_IMM_PSEUDO, + AArch64::LDNT1W_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 2, AArch64::LDNT1W_4Z_IMM, + AArch64::LDNT1W_4Z); + else + break; return; } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) { - SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM, AArch64::LDNT1D_4Z); + if (Subtarget->hasSME2()) + SelectContiguousMultiVectorLoad(Node, 4, 3, + AArch64::LDNT1D_4Z_IMM_PSEUDO, + AArch64::LDNT1D_4Z_PSEUDO); + else if (Subtarget->hasSVE2p1()) + SelectContiguousMultiVectorLoad(Node, 4, 3, AArch64::LDNT1D_4Z_IMM, + AArch64::LDNT1D_4Z); + else + break; return; } break; diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td --- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td +++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td @@ -1340,6 +1340,11 @@ let Size = 512; } +def ZPR2StridedOrContiguous : RegisterClass<"AArch64", [untyped], 256, + (add ZStridedPairsLo, ZStridedPairsHi, + (decimate ZSeqPairs, 2))> { + let Size = 256; +} class ZPRVectorListStrided : ZPRVectorList { @@ -1371,6 +1376,21 @@ : RegisterOperand"> { let ParserMatchClass = ZPRVectorListStrided<64, 2, 8>; } + + def ZZ_b_strided_and_contiguous + : 
RegisterOperand">; + def ZZ_h_strided_and_contiguous + : RegisterOperand">; + def ZZ_s_strided_and_contiguous + : RegisterOperand">; + def ZZ_d_strided_and_contiguous + : RegisterOperand">; +} + +def ZPR4StridedOrContiguous : RegisterClass<"AArch64", [untyped], 512, + (add ZStridedQuadsLo, ZStridedQuadsHi, + (decimate ZSeqQuads, 4))> { + let Size = 512; } let EncoderMethod = "EncodeZPR4StridedRegisterClass", @@ -1394,6 +1414,15 @@ : RegisterOperand"> { let ParserMatchClass = ZPRVectorListStrided<64, 4, 4>; } + + def ZZZZ_b_strided_and_contiguous + : RegisterOperand">; + def ZZZZ_h_strided_and_contiguous + : RegisterOperand">; + def ZZZZ_s_strided_and_contiguous + : RegisterOperand">; + def ZZZZ_d_strided_and_contiguous + : RegisterOperand">; } class ZPRExtendAsmOperand; // Load to two registers -def LD1B_2Z : sve2p1_mem_cld_ss_2z<"ld1b", 0b00, 0b0, ZZ_b_mul_r, GPR64shifted8>; -def LD1H_2Z : sve2p1_mem_cld_ss_2z<"ld1h", 0b01, 0b0, ZZ_h_mul_r, GPR64shifted16>; -def LD1W_2Z : sve2p1_mem_cld_ss_2z<"ld1w", 0b10, 0b0, ZZ_s_mul_r, GPR64shifted32>; -def LD1D_2Z : sve2p1_mem_cld_ss_2z<"ld1d", 0b11, 0b0, ZZ_d_mul_r, GPR64shifted64>; -defm LD1B_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1b", 0b00, 0b0, ZZ_b_mul_r>; -defm LD1H_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1h", 0b01, 0b0, ZZ_h_mul_r>; -defm LD1W_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1w", 0b10, 0b0, ZZ_s_mul_r>; -defm LD1D_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1d", 0b11, 0b0, ZZ_d_mul_r>; -def LDNT1B_2Z : sve2p1_mem_cld_ss_2z<"ldnt1b", 0b00, 0b1, ZZ_b_mul_r, GPR64shifted8>; -def LDNT1H_2Z : sve2p1_mem_cld_ss_2z<"ldnt1h", 0b01, 0b1, ZZ_h_mul_r, GPR64shifted16>; -def LDNT1W_2Z : sve2p1_mem_cld_ss_2z<"ldnt1w", 0b10, 0b1, ZZ_s_mul_r, GPR64shifted32>; -def LDNT1D_2Z : sve2p1_mem_cld_ss_2z<"ldnt1d", 0b11, 0b1, ZZ_d_mul_r, GPR64shifted64>; -defm LDNT1B_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1b", 0b00, 0b1, ZZ_b_mul_r>; -defm LDNT1H_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1h", 0b01, 0b1, ZZ_h_mul_r>; -defm LDNT1W_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1w", 0b10, 
0b1, ZZ_s_mul_r>; -defm LDNT1D_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1d", 0b11, 0b1, ZZ_d_mul_r>; +defm LD1B_2Z : sve2p1_mem_cld_ss_2z<"ld1b", 0b00, 0b0, ZZ_b_mul_r, GPR64shifted8, ZZ_b_strided_and_contiguous>; +defm LD1H_2Z : sve2p1_mem_cld_ss_2z<"ld1h", 0b01, 0b0, ZZ_h_mul_r, GPR64shifted16, ZZ_h_strided_and_contiguous>; +defm LD1W_2Z : sve2p1_mem_cld_ss_2z<"ld1w", 0b10, 0b0, ZZ_s_mul_r, GPR64shifted32, ZZ_s_strided_and_contiguous>; +defm LD1D_2Z : sve2p1_mem_cld_ss_2z<"ld1d", 0b11, 0b0, ZZ_d_mul_r, GPR64shifted64, ZZ_d_strided_and_contiguous>; +defm LD1B_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1b", 0b00, 0b0, ZZ_b_mul_r, ZZ_b_strided_and_contiguous>; +defm LD1H_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1h", 0b01, 0b0, ZZ_h_mul_r, ZZ_h_strided_and_contiguous>; +defm LD1W_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1w", 0b10, 0b0, ZZ_s_mul_r, ZZ_s_strided_and_contiguous>; +defm LD1D_2Z_IMM : sve2p1_mem_cld_si_2z<"ld1d", 0b11, 0b0, ZZ_d_mul_r, ZZ_d_strided_and_contiguous>; +defm LDNT1B_2Z : sve2p1_mem_cld_ss_2z<"ldnt1b", 0b00, 0b1, ZZ_b_mul_r, GPR64shifted8, ZZ_b_strided_and_contiguous>; +defm LDNT1H_2Z : sve2p1_mem_cld_ss_2z<"ldnt1h", 0b01, 0b1, ZZ_h_mul_r, GPR64shifted16, ZZ_h_strided_and_contiguous>; +defm LDNT1W_2Z : sve2p1_mem_cld_ss_2z<"ldnt1w", 0b10, 0b1, ZZ_s_mul_r, GPR64shifted32, ZZ_s_strided_and_contiguous>; +defm LDNT1D_2Z : sve2p1_mem_cld_ss_2z<"ldnt1d", 0b11, 0b1, ZZ_d_mul_r, GPR64shifted64, ZZ_d_strided_and_contiguous>; +defm LDNT1B_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1b", 0b00, 0b1, ZZ_b_mul_r, ZZ_b_strided_and_contiguous>; +defm LDNT1H_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1h", 0b01, 0b1, ZZ_h_mul_r, ZZ_h_strided_and_contiguous>; +defm LDNT1W_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1w", 0b10, 0b1, ZZ_s_mul_r, ZZ_s_strided_and_contiguous>; +defm LDNT1D_2Z_IMM : sve2p1_mem_cld_si_2z<"ldnt1d", 0b11, 0b1, ZZ_d_mul_r, ZZ_d_strided_and_contiguous>; // Load to four registers -def LD1B_4Z : sve2p1_mem_cld_ss_4z<"ld1b", 0b00, 0b0, ZZZZ_b_mul_r, GPR64shifted8>; -def LD1H_4Z : 
sve2p1_mem_cld_ss_4z<"ld1h", 0b01, 0b0, ZZZZ_h_mul_r, GPR64shifted16>; -def LD1W_4Z : sve2p1_mem_cld_ss_4z<"ld1w", 0b10, 0b0, ZZZZ_s_mul_r, GPR64shifted32>; -def LD1D_4Z : sve2p1_mem_cld_ss_4z<"ld1d", 0b11, 0b0, ZZZZ_d_mul_r, GPR64shifted64>; -defm LD1B_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1b", 0b00, 0b0, ZZZZ_b_mul_r>; -defm LD1H_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1h", 0b01, 0b0, ZZZZ_h_mul_r>; -defm LD1W_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1w", 0b10, 0b0, ZZZZ_s_mul_r>; -defm LD1D_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1d", 0b11, 0b0, ZZZZ_d_mul_r>; -def LDNT1B_4Z : sve2p1_mem_cld_ss_4z<"ldnt1b", 0b00, 0b1, ZZZZ_b_mul_r, GPR64shifted8>; -def LDNT1H_4Z : sve2p1_mem_cld_ss_4z<"ldnt1h", 0b01, 0b1, ZZZZ_h_mul_r, GPR64shifted16>; -def LDNT1W_4Z : sve2p1_mem_cld_ss_4z<"ldnt1w", 0b10, 0b1, ZZZZ_s_mul_r, GPR64shifted32>; -def LDNT1D_4Z : sve2p1_mem_cld_ss_4z<"ldnt1d", 0b11, 0b1, ZZZZ_d_mul_r, GPR64shifted64>; -defm LDNT1B_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1b", 0b00, 0b1, ZZZZ_b_mul_r>; -defm LDNT1H_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1h", 0b01, 0b1, ZZZZ_h_mul_r>; -defm LDNT1W_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1w", 0b10, 0b1, ZZZZ_s_mul_r>; -defm LDNT1D_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1d", 0b11, 0b1, ZZZZ_d_mul_r>; +defm LD1B_4Z : sve2p1_mem_cld_ss_4z<"ld1b", 0b00, 0b0, ZZZZ_b_mul_r, GPR64shifted8, ZZZZ_b_strided_and_contiguous>; +defm LD1H_4Z : sve2p1_mem_cld_ss_4z<"ld1h", 0b01, 0b0, ZZZZ_h_mul_r, GPR64shifted16, ZZZZ_h_strided_and_contiguous>; +defm LD1W_4Z : sve2p1_mem_cld_ss_4z<"ld1w", 0b10, 0b0, ZZZZ_s_mul_r, GPR64shifted32, ZZZZ_s_strided_and_contiguous>; +defm LD1D_4Z : sve2p1_mem_cld_ss_4z<"ld1d", 0b11, 0b0, ZZZZ_d_mul_r, GPR64shifted64, ZZZZ_d_strided_and_contiguous>; +defm LD1B_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1b", 0b00, 0b0, ZZZZ_b_mul_r, ZZZZ_b_strided_and_contiguous>; +defm LD1H_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1h", 0b01, 0b0, ZZZZ_h_mul_r, ZZZZ_h_strided_and_contiguous>; +defm LD1W_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1w", 0b10, 0b0, ZZZZ_s_mul_r, 
ZZZZ_s_strided_and_contiguous>; +defm LD1D_4Z_IMM : sve2p1_mem_cld_si_4z<"ld1d", 0b11, 0b0, ZZZZ_d_mul_r, ZZZZ_d_strided_and_contiguous>; +defm LDNT1B_4Z : sve2p1_mem_cld_ss_4z<"ldnt1b", 0b00, 0b1, ZZZZ_b_mul_r, GPR64shifted8, ZZZZ_b_strided_and_contiguous>; +defm LDNT1H_4Z : sve2p1_mem_cld_ss_4z<"ldnt1h", 0b01, 0b1, ZZZZ_h_mul_r, GPR64shifted16, ZZZZ_h_strided_and_contiguous>; +defm LDNT1W_4Z : sve2p1_mem_cld_ss_4z<"ldnt1w", 0b10, 0b1, ZZZZ_s_mul_r, GPR64shifted32, ZZZZ_s_strided_and_contiguous>; +defm LDNT1D_4Z : sve2p1_mem_cld_ss_4z<"ldnt1d", 0b11, 0b1, ZZZZ_d_mul_r, GPR64shifted64, ZZZZ_d_strided_and_contiguous>; +defm LDNT1B_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1b", 0b00, 0b1, ZZZZ_b_mul_r, ZZZZ_b_strided_and_contiguous>; +defm LDNT1H_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1h", 0b01, 0b1, ZZZZ_h_mul_r, ZZZZ_h_strided_and_contiguous>; +defm LDNT1W_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1w", 0b10, 0b1, ZZZZ_s_mul_r, ZZZZ_s_strided_and_contiguous>; +defm LDNT1D_4Z_IMM : sve2p1_mem_cld_si_4z<"ldnt1d", 0b11, 0b1, ZZZZ_d_mul_r, ZZZZ_d_strided_and_contiguous>; // Stores of two registers def ST1B_2Z : sve2p1_mem_cst_ss_2z<"st1b", 0b00, 0b0, ZZ_b_mul_r, GPR64shifted8>; diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td --- a/llvm/lib/Target/AArch64/SVEInstrFormats.td +++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td @@ -9379,6 +9379,12 @@ let mayLoad = 1; } +multiclass sve2p1_mem_cld_ss_2z msz, bit n, + RegisterOperand vector_ty, RegisterOperand gpr_ty, RegisterOperand vector_pseudo_ty> { + def NAME # _PSEUDO : Pseudo<(outs vector_pseudo_ty:$Zt), (ins PNRAny_p8to15:$PNg, GPR64sp:$Rn, gpr_ty:$Rm), []>; + def NAME : sve2p1_mem_cld_ss_2z; +} + // SME2 multi-vec contiguous load (scalar plus immediate, two registers) class sve2p1_mem_cld_si_2z msz, bit n, RegisterOperand vector_ty> @@ -9404,11 +9410,11 @@ } multiclass sve2p1_mem_cld_si_2z msz, bit n, - RegisterOperand vector_ty> { + RegisterOperand vector_ty, RegisterOperand 
vector_pseudo_ty> { def NAME : sve2p1_mem_cld_si_2z; - def : InstAlias(NAME) vector_ty:$Zt, PNRAny_p8to15:$PNg, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO : Pseudo<(outs vector_pseudo_ty:$Zt), (ins PNRAny_p8to15:$PNg, GPR64sp:$Rn, simm4s2:$imm4), []>; } // SME2 multi-vec contiguous load (scalar plus scalar, four registers) @@ -9436,6 +9442,12 @@ let mayLoad = 1; } +multiclass sve2p1_mem_cld_ss_4z msz, bit n, + RegisterOperand vector_ty, RegisterOperand gpr_ty, RegisterOperand vector_pseudo_ty> { + def NAME # _PSEUDO : Pseudo<(outs vector_pseudo_ty:$Zt), (ins PNRAny_p8to15:$PNg, GPR64sp:$Rn, gpr_ty:$Rm), []>; + def NAME : sve2p1_mem_cld_ss_4z; +} + // SME2 multi-vec contiguous load (scalar plus immediate, four registers) class sve2p1_mem_cld_si_4z msz, bit n, RegisterOperand vector_ty> @@ -9462,14 +9474,13 @@ } multiclass sve2p1_mem_cld_si_4z msz, bit n, - RegisterOperand vector_ty> { + RegisterOperand vector_ty, RegisterOperand vector_pseudo_ty> { def NAME : sve2p1_mem_cld_si_4z; - def : InstAlias(NAME) vector_ty:$Zt, PNRAny_p8to15:$PNg, GPR64sp:$Rn, 0), 1>; + def NAME # _PSEUDO : Pseudo<(outs vector_pseudo_ty:$Zt), (ins PNRAny_p8to15:$PNg, GPR64sp:$Rn, simm4s4:$imm4), []>; } - // SME2 multi-vec contiguous store (scalar plus scalar, two registers) class sve2p1_mem_cst_ss_2z msz, bit n, RegisterOperand vector_ty, RegisterOperand gpr_ty> diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll @@ -0,0 +1,2591 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS + +define @ld1_x2_i8_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; 
CHECK-LABEL: ld1_x2_i8_z0_z8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i8_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i8_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded 
Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @ld1_x2_i8_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x2_i8_z0_z8_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload 
+; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i8_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1b { z0.b, z8.b }, pn8/z, [x0, x1] +; STRIDED-NEXT: //APP +; 
STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i8_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1b { z0.b, z1.b }, pn8/z, [x0, x1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte 
Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @ld1_x2_i16_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i16_z0_z8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; 
CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i16_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; 
STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i16_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded 
Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @ld1_x2_i16_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x2_i16_z0_z8_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i16_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] +; STRIDED-NEXT: 
//APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i16_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i16, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @ld1_x2_i32_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i32_z0_z8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; 
CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i32_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; 
STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i32_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded 
Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i32.nxv4i32( %v0, %res.v1, i64 4) + ret %v1 +} + +define @ld1_x2_i32_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x2_i32_z0_z8_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i32_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] +; STRIDED-NEXT: 
//APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i32_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i32, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i32.nxv4i32( %v0, %res.v1, i64 4) + ret %v1 +} + +define @ld1_x2_i64_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x2_i64_z0_z8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; 
CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i64_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; 
STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i64_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded 
Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv4i64.nxv2i64( %v0, %res.v1, i64 2) + ret %v1 +} + +define @ld1_x2_i64_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x2_i64_z0_z8_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z8.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded 
Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x2_i64_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] +; STRIDED-NEXT: 
//APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x2_i64_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i64, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv4i64.nxv2i64( %v0, %res.v1, i64 2) + ret %v1 +} + +define @ld1_x4_i8_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i8_z0_z4_z8_z12: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i8_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i8_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 16) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v1, %res.v2, i64 32) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v2, %res.v3, i64 48) + ret %v3 +} + +define @ld1_x4_i8_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x4_i8_z0_z4_z8_z12_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr 
z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i8_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i8_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1b { z0.b - z3.b }, pn8/z, [x0, x1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: 
ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 16) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v1, %res.v2, i64 32) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v2, %res.v3, i64 48) + ret %v3 +} + +define @ld1_x4_i16_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i16_z0_z4_z8_z12: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i16_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i16_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v1, %res.v2, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v2, %res.v3, i64 24) + ret %v3 +} + +define @ld1_x4_i16_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x4_i16_z0_z4_z8_z12_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: lsl x8, x1, #1 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x8, lsl #1] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i16_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded 
Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i16_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i16, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v1, %res.v2, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v2, %res.v3, i64 24) + ret %v3 +} + +define @ld1_x4_i32_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i32_z0_z4_z8_z12: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i32_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i32_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v0, %res.v1, i64 4) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %res.v2, i64 8) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v2, %res.v3, i64 12) + ret %v3 +} + +define @ld1_x4_i32_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x4_i32_z0_z4_z8_z12_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: lsl x8, x1, #2 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x8, lsl #2] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i32_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded 
Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i32_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i32, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v0, %res.v1, i64 4) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %res.v2, i64 8) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v2, %res.v3, i64 12) + ret %v3 +} + +define @ld1_x4_i64_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; CHECK-LABEL: ld1_x4_i64_z0_z4_z8_z12: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, 
[sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i64_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; 
STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i64_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr 
z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v0, %res.v1, i64 2) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v1, %res.v2, i64 4) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v2, %res.v3, i64 6) + ret %v3 +} + +define @ld1_x4_i64_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; CHECK-LABEL: ld1_x4_i64_z0_z4_z8_z12_scalar: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: lsl x8, x1, #3 +; CHECK-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: mov p8.b, p0.b +; CHECK-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x8, lsl #3] +; CHECK-NEXT: //APP +; CHECK-NEXT: nop +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z2.d, z8.d +; CHECK-NEXT: mov z3.d, z12.d +; CHECK-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #8, mul vl] // 
16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: mov z1.d, z4.d +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret +; STRIDED-LABEL: ld1_x4_i64_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded 
Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ld1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ld1_x4_i64_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ld1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i64, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v0, %res.v1, i64 2) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v1, %res.v2, i64 4) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v2, %res.v3, i64 6) + ret %v3 +} + +declare @llvm.vector.insert.nxv32i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv16i16.nxv8i16(, , i64) +declare @llvm.vector.insert.nxv8i32.nxv4i32(, , i64) +declare @llvm.vector.insert.nxv4i64.nxv2i64(, , i64) +declare 
@llvm.vector.insert.nxv64i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv8i16(, , i64) +declare @llvm.vector.insert.nxv16i32.nxv4i32(, , i64) +declare @llvm.vector.insert.nxv8i64.nxv2i64(, , i64) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ld1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ld1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr) diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll @@ -0,0 +1,1820 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sme2 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=STRIDED +; RUN: llc -mtriple=aarch64-linux-gnu -mattr=+sve2p1 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CONTIGUOUS + +define @ldnt1_x2_i8_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x2_i8_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i8_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; 
CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , 
} %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @ldnt1_x2_i8_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x2_i8_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1b { z0.b, z8.b }, pn8/z, [x0, x1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i8_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1b { z0.b, z1.b }, pn8/z, [x0, x1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte 
Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i8.nxv16i8( %v0, %res.v1, i64 16) + ret %v1 +} + +define @ldnt1_x2_i16_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x2_i16_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i16_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; 
CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , 
} %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @ldnt1_x2_i16_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x2_i16_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1h { z0.h, z8.h }, pn8/z, [x0, x1, lsl #1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload 
+; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i16_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1h { z0.h, z1.h }, pn8/z, [x0, x1, lsl #1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i16, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i16.nxv8i16( %v0, %res.v1, i64 8) + ret %v1 +} + +define @ldnt1_x2_i32_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x2_i32_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i32_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; 
CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , 
} %res, 0 + %v0 = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i32.nxv4i32( %v0, %res.v1, i64 4) + ret %v1 +} + +define @ldnt1_x2_i32_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x2_i32_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1w { z0.s, z8.s }, pn8/z, [x0, x1, lsl #2] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i32_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1w { z0.s, z1.s }, pn8/z, [x0, x1, lsl #2] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i32, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i32.nxv4i32( %v0, %res.v1, i64 4) + ret %v1 +} + +define @ldnt1_x2_i64_z0_z8( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x2_i64_z0_z8: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i64_z0_z8: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; 
CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , 
} %res, 0 + %v0 = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv4i64.nxv2i64( %v0, %res.v1, i64 2) + ret %v1 +} + +define @ldnt1_x2_i64_z0_z8_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x2_i64_z0_z8_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1d { z0.d, z8.d }, pn8/z, [x0, x1, lsl #3] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; 
STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z8.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x2_i64_z0_z8_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-16 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-2 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1d { z0.d, z1.d }, pn8/z, [x0, x1, lsl #3] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #2 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 
16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #16 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i64, ptr %ptr, i64 %index + %res = call { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z4},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z12},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , } %res, 0 + %v0 = call @llvm.vector.insert.nxv4i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , } %res, 1 + %v1 = call @llvm.vector.insert.nxv4i64.nxv2i64( %v0, %res.v1, i64 2) + ret %v1 +} + +define @ldnt1_x4_i8_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x4_i8_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr 
z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i8_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl 
sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", 
"~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 16) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v1, %res.v2, i64 32) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v2, %res.v3, i64 48) + ret %v3 +} + +define @ldnt1_x4_i8_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x4_i8_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str 
z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1b { z0.b, z4.b, z8.b, z12.b }, pn8/z, [x0, x1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i8_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1b { z0.b - z3.b }, pn8/z, [x0, x1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i8, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv64i8.nxv16i8( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v0, %res.v1, i64 16) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v1, %res.v2, i64 32) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv64i8.nxv16i8( %v2, %res.v3, i64 48) + ret %v3 +} + +define @ldnt1_x4_i16_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x4_i16_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr 
z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i16_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: 
addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", 
"~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v1, %res.v2, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v2, %res.v3, i64 24) + ret %v3 +} + +define @ldnt1_x4_i16_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x4_i16_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: 
str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1h { z0.h, z4.h, z8.h, z12.h }, pn8/z, [x0, x1, lsl #1] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i16_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1h { z0.h - z3.h }, pn8/z, [x0, x1, lsl #1] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i16, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv32i16.nxv8i16( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v0, %res.v1, i64 8) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v1, %res.v2, i64 16) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv32i16.nxv8i16( %v2, %res.v3, i64 24) + ret %v3 +} + +define @ldnt1_x4_i32_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x4_i32_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr 
z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i32_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: 
addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", 
"~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v0, %res.v1, i64 4) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %res.v2, i64 8) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v2, %res.v3, i64 12) + ret %v3 +} + +define @ldnt1_x4_i32_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x4_i32_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: 
str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1w { z0.s, z4.s, z8.s, z12.s }, pn8/z, [x0, x1, lsl #2] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i32_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1w { z0.s - z3.s }, pn8/z, [x0, x1, lsl #2] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i32, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v0, %res.v1, i64 4) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v1, %res.v2, i64 8) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv16i32.nxv4i32( %v2, %res.v3, i64 12) + ret %v3 +} + +define @ldnt1_x4_i64_z0_z4_z8_z12( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr) nounwind { +; STRIDED-LABEL: ldnt1_x4_i64_z0_z4_z8_z12: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr 
z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i64_z0_z4_z8_z12: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: 
addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %ptr) + call void asm sideeffect "nop", 
"~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v0, %res.v1, i64 2) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v1, %res.v2, i64 4) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v2, %res.v3, i64 6) + ret %v3 +} + +define @ldnt1_x4_i64_z0_z4_z8_z12_scalar( %unused, %z1, target("aarch64.svcount") %pn, ptr %ptr, i64 %index) nounwind { +; STRIDED-LABEL: ldnt1_x4_i64_z0_z4_z8_z12_scalar: +; STRIDED: // %bb.0: +; STRIDED-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; STRIDED-NEXT: addvl sp, sp, #-17 +; STRIDED-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; STRIDED-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z12, [sp, #12, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z11, [sp, #13, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z10, [sp, #14, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str 
z9, [sp, #15, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: str z8, [sp, #16, mul vl] // 16-byte Folded Spill +; STRIDED-NEXT: mov p8.b, p0.b +; STRIDED-NEXT: ldnt1d { z0.d, z4.d, z8.d, z12.d }, pn8/z, [x0, x1, lsl #3] +; STRIDED-NEXT: //APP +; STRIDED-NEXT: nop +; STRIDED-NEXT: //NO_APP +; STRIDED-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; STRIDED-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z2.d, z8.d +; STRIDED-NEXT: mov z3.d, z12.d +; STRIDED-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z12, [sp, #12, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload +; STRIDED-NEXT: mov z1.d, z4.d +; STRIDED-NEXT: addvl sp, sp, #17 +; STRIDED-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; STRIDED-NEXT: ret +; +; CONTIGUOUS-LABEL: ldnt1_x4_i64_z0_z4_z8_z12_scalar: +; CONTIGUOUS: // %bb.0: +; CONTIGUOUS-NEXT: str x29, [sp, #-16]! 
// 8-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-15 +; CONTIGUOUS-NEXT: str p8, [sp, #7, mul vl] // 2-byte Folded Spill +; CONTIGUOUS-NEXT: str z23, [sp, #1, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z22, [sp, #2, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z21, [sp, #3, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z20, [sp, #4, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z19, [sp, #5, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z18, [sp, #6, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z17, [sp, #7, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z16, [sp, #8, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z15, [sp, #9, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z14, [sp, #10, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z13, [sp, #11, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CONTIGUOUS-NEXT: addvl sp, sp, #-4 +; CONTIGUOUS-NEXT: mov p8.b, p0.b +; CONTIGUOUS-NEXT: ldnt1d { z0.d - z3.d }, pn8/z, [x0, x1, lsl #3] +; CONTIGUOUS-NEXT: str z0, [sp] +; CONTIGUOUS-NEXT: str z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: str z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: str z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: //APP +; CONTIGUOUS-NEXT: nop +; CONTIGUOUS-NEXT: //NO_APP +; CONTIGUOUS-NEXT: ldr z0, [sp] +; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl] +; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl] +; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl] +; CONTIGUOUS-NEXT: addvl sp, sp, #4 +; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload +; 
CONTIGUOUS-NEXT: ldr z20, [sp, #4, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z19, [sp, #5, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z18, [sp, #6, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z17, [sp, #7, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z16, [sp, #8, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z15, [sp, #9, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z14, [sp, #10, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z13, [sp, #11, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CONTIGUOUS-NEXT: addvl sp, sp, #15 +; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CONTIGUOUS-NEXT: ret + %base = getelementptr i64, ptr %ptr, i64 %index + %res = call { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount") %pn, ptr %base) + call void asm sideeffect "nop", "~{z1},~{z2},~{z3},~{z5},~{z6},~{z7},~{z9},~{z10},~{z11},~{z13},~{z14},~{z15},~{z16},~{z17},~{z18},~{z19},~{z20},~{z21},~{z22},~{z23},~{z24},~{z25},~{z26},~{z27},~{z28},~{z29},~{z30},~{z31}"() nounwind + %res.v0 = extractvalue { , , , } %res, 0 + %v0 = call @llvm.vector.insert.nxv8i64.nxv2i64( poison, %res.v0, i64 0) + %res.v1 = extractvalue { , , , } %res, 1 + %v1 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v0, %res.v1, i64 2) + %res.v2 = extractvalue { , , , } %res, 2 + %v2 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v1, %res.v2, i64 4) + %res.v3 = extractvalue { , , , } %res, 3 + %v3 = call @llvm.vector.insert.nxv8i64.nxv2i64( %v2, %res.v3, i64 6) + ret %v3 +} + +declare @llvm.vector.insert.nxv32i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv16i16.nxv8i16(, , i64) +declare @llvm.vector.insert.nxv8i32.nxv4i32(, , i64) +declare @llvm.vector.insert.nxv4i64.nxv2i64(, , i64) +declare 
@llvm.vector.insert.nxv64i8.nxv16i8(, , i64) +declare @llvm.vector.insert.nxv32i16.nxv8i16(, , i64) +declare @llvm.vector.insert.nxv16i32.nxv4i32(, , i64) +declare @llvm.vector.insert.nxv8i64.nxv2i64(, , i64) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv16i8(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv8i16(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv4i32(target("aarch64.svcount"), ptr) +declare { , } @llvm.aarch64.sve.ldnt1.pn.x2.nxv2i64(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv16i8(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv8i16(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv4i32(target("aarch64.svcount"), ptr) +declare { , , , } @llvm.aarch64.sve.ldnt1.pn.x4.nxv2i64(target("aarch64.svcount"), ptr)