diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -799,13 +799,13 @@
   /// Returns true if \p VecTy is a legal interleaved access type. This
   /// function checks the vector element type and the overall width of the
   /// vector.
-  bool isLegalInterleavedAccessType(VectorType *VecTy,
-                                    const DataLayout &DL) const;
+  bool isLegalInterleavedAccessType(VectorType *VecTy, const DataLayout &DL,
+                                    bool &UseScalable) const;
 
   /// Returns the number of interleaved accesses that will be generated when
   /// lowering accesses of the given type.
-  unsigned getNumInterleavedAccesses(VectorType *VecTy,
-                                     const DataLayout &DL) const;
+  unsigned getNumInterleavedAccesses(VectorType *VecTy, const DataLayout &DL,
+                                     bool UseScalable) const;
 
   MachineMemOperand::Flags getTargetMMOFlags(
       const Instruction &I) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12011,10 +12011,10 @@
 
 /// A helper function for determining the number of interleaved accesses we
 /// will generate when lowering accesses of the given type.
-unsigned
-AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
-                                                 const DataLayout &DL) const {
-  return (DL.getTypeSizeInBits(VecTy) + 127) / 128;
+unsigned AArch64TargetLowering::getNumInterleavedAccesses(
+    VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
+  unsigned VecSize = UseScalable ? Subtarget->getMinSVEVectorSizeInBits() : 128;
+  return std::max(1UL, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
 }
 
 MachineMemOperand::Flags
@@ -12026,24 +12026,51 @@
 }
 
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
-    VectorType *VecTy, const DataLayout &DL) const {
+    VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
 
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+
+  UseScalable = false;
 
   // Ensure the number of vector elements is greater than 1.
-  if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
+  if (NumElements < 2)
    return false;
 
   // Ensure the element type is legal.
   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
     return false;
 
+  if (Subtarget->useSVEForFixedLengthVectors() &&
+      (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
+       (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
+        isPowerOf2_32(NumElements) && VecSize > 128))) {
+    UseScalable = true;
+    return true;
+  }
+
   // Ensure the total vector size is 64 or a multiple of 128. Types larger than
   // 128 will be split into multiple interleaved accesses.
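+  // The fixed-length SVE cases have already returned above, so what is left
+  // here is the NEON-only legality rule.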
   return VecSize == 64 || VecSize % 128 == 0;
 }
 
+static ScalableVectorType *getSVEContainerIRType(FixedVectorType *VTy) {
+  if (VTy->getElementType() == Type::getInt64Ty(VTy->getContext()))
+    return ScalableVectorType::get(VTy->getElementType(), 2);
+
+  if (VTy->getElementType() == Type::getInt32Ty(VTy->getContext()))
+    return ScalableVectorType::get(VTy->getElementType(), 4);
+
+  if (VTy->getElementType() == Type::getInt16Ty(VTy->getContext()))
+    return ScalableVectorType::get(VTy->getElementType(), 8);
+
+  if (VTy->getElementType() == Type::getInt8Ty(VTy->getContext()))
+    return ScalableVectorType::get(VTy->getElementType(), 16);
+
+  llvm_unreachable("Cannot handle input vector type");
+}
+
 /// Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -12071,10 +12098,12 @@
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
-  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
+  bool UseScalable;
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(VTy, DL, UseScalable))
     return false;
 
-  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
+  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
 
   auto *FVTy = cast<FixedVectorType>(VTy);
 
@@ -12085,48 +12114,84 @@
     FVTy =
         FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
 
+  // If we're going to generate more than one load, reset the sub-vector type
+  // to something legal.
+  FVTy = FixedVectorType::get(FVTy->getElementType(),
+                              FVTy->getNumElements() / NumLoads);
+
+  auto *LDVTy =
+      UseScalable ? cast<VectorType>(getSVEContainerIRType(FVTy)) : FVTy;
+
   IRBuilder<> Builder(LI);
 
   // The base address of the load.
   Value *BaseAddr = LI->getPointerOperand();
 
   if (NumLoads > 1) {
-    // If we're going to generate more than one load, reset the sub-vector type
-    // to something legal.
-    FVTy = FixedVectorType::get(FVTy->getElementType(),
-                                FVTy->getNumElements() / NumLoads);
-
     // We will compute the pointer operand of each load from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
     BaseAddr = Builder.CreateBitCast(
         BaseAddr,
-        FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
-  }
-
-  Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
-  Type *Tys[2] = {FVTy, PtrTy};
-  static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
-                                            Intrinsic::aarch64_neon_ld3,
-                                            Intrinsic::aarch64_neon_ld4};
-  Function *LdNFunc =
-      Intrinsic::getDeclaration(LI->getModule(), LoadInts[Factor - 2], Tys);
+        LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
+  }
+
+  Type *PtrTy =
+      UseScalable
+          ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
+          : LDVTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
+                                 LDVTy->getElementCount());
+
+  static const Intrinsic::ID SVELoadIntrs[3] = {
+      Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
+      Intrinsic::aarch64_sve_ld4_sret};
+  static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
+                                                 Intrinsic::aarch64_neon_ld3,
+                                                 Intrinsic::aarch64_neon_ld4};
+  Function *LdNFunc;
+  if (UseScalable)
+    LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
+                                        SVELoadIntrs[Factor - 2], {LDVTy});
+  else
+    LdNFunc = Intrinsic::getDeclaration(
+        LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
 
   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
   // replace.
   DenseMap<ShuffleVectorInst *, SmallVector<Value *, 4>> SubVecs;
 
+  Value *PTrue = nullptr;
+  if (UseScalable) {
+    unsigned PgPattern =
+        getSVEPredPatternFromNumElements(FVTy->getNumElements());
+    if (Subtarget->getMinSVEVectorSizeInBits() ==
+            Subtarget->getMaxSVEVectorSizeInBits() &&
+        Subtarget->getMinSVEVectorSizeInBits() == DL.getTypeSizeInBits(FVTy))
+      PgPattern = AArch64SVEPredPattern::all;
+
+    auto *PTruePat =
+        ConstantInt::get(Type::getInt32Ty(LDVTy->getContext()), PgPattern);
+    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
+                                    {PTruePat});
+  }
+
   for (unsigned LoadCount = 0; LoadCount < NumLoads; ++LoadCount) {
 
     // If we're generating more than one load, compute the base address of
     // subsequent loads as an offset from the previous.
     if (LoadCount > 0)
-      BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
+      BaseAddr = Builder.CreateConstGEP1_32(LDVTy->getElementType(), BaseAddr,
                                             FVTy->getNumElements() * Factor);
 
-    CallInst *LdN = Builder.CreateCall(
-        LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
+    CallInst *LdN;
+    if (UseScalable)
+      LdN = Builder.CreateCall(
+          LdNFunc, {PTrue, Builder.CreateBitCast(BaseAddr, PtrTy)}, "ldN");
+    else
+      LdN = Builder.CreateCall(LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy),
+                               "ldN");
 
     // Extract and store the sub-vectors returned by the load intrinsic.
     for (unsigned i = 0; i < Shuffles.size(); i++) {
@@ -12135,11 +12200,17 @@
 
       Value *SubVec = Builder.CreateExtractValue(LdN, Index);
 
+      if (UseScalable)
+        SubVec = Builder.CreateExtractVector(
+            FVTy, SubVec,
+            ConstantInt::get(Type::getInt64Ty(VTy->getContext()), 0));
+
       // Convert the integer vector to pointer vector if the element is pointer.
       if (EltTy->isPointerTy())
         SubVec = Builder.CreateIntToPtr(
             SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
                                          FVTy->getNumElements()));
+
       SubVecs[SVI].push_back(SubVec);
     }
   }
@@ -12198,14 +12269,16 @@
   auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
 
   const DataLayout &DL = SI->getModule()->getDataLayout();
+  bool UseScalable;
 
   // Skip if we do not have NEON and skip illegal vector types. We can
   // "legalize" wide vector types into multiple interleaved accesses as long as
   // the vector types are divisible by 128.
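+  // As for loads, the legality check now also reports whether the generated
+  // stN intrinsics should use SVE container types.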
-  if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(SubVecTy, DL))
+  if (!Subtarget->hasNEON() ||
+      !isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
     return false;
 
-  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL);
+  unsigned NumStores = getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
 
   Value *Op0 = SVI->getOperand(0);
   Value *Op1 = SVI->getOperand(1);
@@ -12226,15 +12299,18 @@
     SubVecTy = FixedVectorType::get(IntTy, LaneLen);
   }
 
+  // If we're going to generate more than one store, reset the lane length
+  // and sub-vector type to something legal.
+  LaneLen /= NumStores;
+  SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
+
+  auto *STVTy = UseScalable ? cast<VectorType>(getSVEContainerIRType(SubVecTy))
+                            : SubVecTy;
+
   // The base address of the store.
   Value *BaseAddr = SI->getPointerOperand();
 
   if (NumStores > 1) {
-    // If we're going to generate more than one store, reset the lane length
-    // and sub-vector type to something legal.
-    LaneLen /= NumStores;
-    SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
-
     // We will compute the pointer operand of each store from the original base
     // address using GEPs. Cast the base address to a pointer to the scalar
     // element type.
@@ -12245,13 +12321,42 @@
 
   auto Mask = SVI->getShuffleMask();
 
-  Type *PtrTy = SubVecTy->getPointerTo(SI->getPointerAddressSpace());
-  Type *Tys[2] = {SubVecTy, PtrTy};
-  static const Intrinsic::ID StoreInts[3] = {Intrinsic::aarch64_neon_st2,
-                                             Intrinsic::aarch64_neon_st3,
-                                             Intrinsic::aarch64_neon_st4};
-  Function *StNFunc =
-      Intrinsic::getDeclaration(SI->getModule(), StoreInts[Factor - 2], Tys);
+  Type *PtrTy =
+      UseScalable
+          ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
+          : STVTy->getPointerTo(SI->getPointerAddressSpace());
+  Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
+                                 STVTy->getElementCount());
+
+  static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
+                                                 Intrinsic::aarch64_sve_st3,
+                                                 Intrinsic::aarch64_sve_st4};
+  static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
+                                                  Intrinsic::aarch64_neon_st3,
+                                                  Intrinsic::aarch64_neon_st4};
+  Function *StNFunc;
+  if (UseScalable)
+    StNFunc = Intrinsic::getDeclaration(SI->getModule(),
+                                        SVEStoreIntrs[Factor - 2], {STVTy});
+  else
+    StNFunc = Intrinsic::getDeclaration(
+        SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
+
+  Value *PTrue = nullptr;
+  if (UseScalable) {
+    unsigned PgPattern =
+        getSVEPredPatternFromNumElements(SubVecTy->getNumElements());
+    if (Subtarget->getMinSVEVectorSizeInBits() ==
+            Subtarget->getMaxSVEVectorSizeInBits() &&
+        Subtarget->getMinSVEVectorSizeInBits() ==
+            DL.getTypeSizeInBits(SubVecTy))
+      PgPattern = AArch64SVEPredPattern::all;
+
+    auto *PTruePat =
+        ConstantInt::get(Type::getInt32Ty(STVTy->getContext()), PgPattern);
+    PTrue = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy},
+                                    {PTruePat});
+  }
 
   for (unsigned StoreCount = 0; StoreCount < NumStores; ++StoreCount) {
 
@@ -12259,10 +12364,11 @@
     // Split the shufflevector operands into sub vectors for the new stN call.
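+    // When SVE container types are in use, each fixed-length sub-vector built
+    // below is widened into an undef scalable vector (CreateInsertVector) and
+    // the ptrue predicate created above is passed as an extra operand to the
+    // stN intrinsic.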
     SmallVector<Value *, 4> Ops;
     for (unsigned i = 0; i < Factor; i++) {
+      Value *Shuffle;
       unsigned IdxI = StoreCount * LaneLen * Factor + i;
       if (Mask[IdxI] >= 0) {
-        Ops.push_back(Builder.CreateShuffleVector(
-            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
+        Shuffle = Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0));
       } else {
         unsigned StartMask = 0;
         for (unsigned j = 1; j < LaneLen; j++) {
@@ -12277,11 +12383,21 @@
         // In the case of all undefs we're defaulting to using elems from 0
         // Note: StartMask cannot be negative, it's checked in
         // isReInterleaveMask
-        Ops.push_back(Builder.CreateShuffleVector(
-            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
+        Shuffle = Builder.CreateShuffleVector(
+            Op0, Op1, createSequentialMask(StartMask, LaneLen, 0));
       }
+
+      if (UseScalable)
+        Shuffle = Builder.CreateInsertVector(
+            STVTy, UndefValue::get(STVTy), Shuffle,
+            ConstantInt::get(Type::getInt64Ty(STVTy->getContext()), 0));
+
+      Ops.push_back(Shuffle);
     }
 
+    if (UseScalable)
+      Ops.push_back(PTrue);
+
     // If we generating more than one store, we compute the base address of
     // subsequent stores as an offset from the previous.
     if (StoreCount > 0)
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -1758,9 +1758,10 @@
     // ldN/stN only support legal vector types of size 64 or 128 in bits.
     // Accesses having vector types that are a multiple of 128 bits can be
     // matched to more than one ldN/stN instruction.
+    bool UseScalable;
     if (NumElts % Factor == 0 &&
-        TLI->isLegalInterleavedAccessType(SubVecTy, DL))
-      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL);
+        TLI->isLegalInterleavedAccessType(SubVecTy, DL, UseScalable))
+      return Factor * TLI->getNumInterleavedAccesses(SubVecTy, DL, UseScalable);
   }
 
   return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/sve-interleaved-accesses.ll
@@ -0,0 +1,356 @@
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+define void @load_factor2(<32 x i16>* %ptr) #0 {
+; CHECK-LABEL: @load_factor2(
+; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 31)
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <32 x i16>* %ptr to i16*
+; CHECK-NEXT: [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> [[PTRUE]], i16* [[TMP1]])
+; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT: [[EXT1:%.*]] = call <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP2]], i64 0)
+; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT: [[EXT2:%.*]] = call <16 x i16> @llvm.experimental.vector.extract.v16i16.nxv8i16(<vscale x 8 x i16> [[TMP3]], i64 0)
+; CHECK-NEXT: ret void
+  %interleaved.vec = load <32 x i16>, <32 x i16>* %ptr, align 4
+  %v0 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14, i32 16, i32 18, i32 20, i32 22, i32 24, i32 26, i32 28, i32 30>
+  %v1 = shufflevector <32 x i16> %interleaved.vec, <32 x i16> poison, <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15, i32 17, i32 19, i32 21, i32 23, i32 25, i32 27, i32 29, i32 31>
+  ret void
+}
+
+define void @load_factor3(<24 x i32>* %ptr) #0 {
+; CHECK-LABEL: @load_factor3(
+; CHECK-NEXT: [[PTRUE:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+; CHECK-NEXT: [[TMP1:%.*]] = bitcast <24 x i32>* %ptr to
i32* +; CHECK-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv4i32( [[PTRUE]], i32* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[LDN]], 2 +; CHECK-NEXT: [[EXT1:%.*]] = call <8 x i32> @llvm.experimental.vector.extract.v8i32.nxv4i32( [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[LDN]], 1 +; CHECK-NEXT: [[EXT2:%.*]] = call <8 x i32> @llvm.experimental.vector.extract.v8i32.nxv4i32( [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[LDN]], 0 +; CHECK-NEXT: [[EXT3:%.*]] = call <8 x i32> @llvm.experimental.vector.extract.v8i32.nxv4i32( [[TMP4]], i64 0) +; CHECK-NEXT: ret void + %interleaved.vec = load <24 x i32>, <24 x i32>* %ptr, align 4 + %v0 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> + %v1 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> + %v2 = shufflevector <24 x i32> %interleaved.vec, <24 x i32> poison, <8 x i32> + ret void +} + +define void @load_factor4(<16 x i64>* %ptr) #0 { +; CHECK-LABEL: @load_factor4( +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i64>* %ptr to i64* +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv2i64( [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP5]], i64 0) +; CHECK-NEXT: ret void + %interleaved.vec = load <16 x i64>, <16 x i64>* %ptr, align 4 + %v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> + %v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> + %v2 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> + %v3 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <4 x i32> + ret void +} + +define void @store_factor2(<32 x i16>* %ptr, <16 x i16> %v0, <16 x i16> %v1) #0 { +; CHECK-LABEL: @store_factor2( +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv8i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv8i16.v16i16( undef, <16 x i16> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> %v0, <16 x i16> %v1, <16 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv8i16.v16i16( undef, <16 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <32 x i16>* %ptr to i16* +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16( [[INS1]], [[INS2]], [[PTRUE]], i16* [[PTR]]) +; CHECK-NEXT: ret void + %interleaved.vec = shufflevector <16 x i16> %v0, <16 x i16> %v1, <32 x i32> + store <32 x i16> %interleaved.vec, <32 x i16>* %ptr, align 4 + ret void +} + +define void @store_factor3(<24 x i32>* %ptr, <8 x i32> %v0, <8 x i32> %v1, <8 x i32> %v2) #0 { +; CHECK-LABEL: @store_factor3( +; CHECK: [[PTRUE:%.*]] = call 
@llvm.aarch64.sve.ptrue.nxv4i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v8i32( undef, <8 x i32> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v8i32( undef, <8 x i32> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <16 x i32> %s0, <16 x i32> %s1, <8 x i32> +; CHECK-NEXT: [[INS3:%.*]] = call @llvm.experimental.vector.insert.nxv4i32.v8i32( undef, <8 x i32> [[TMP3]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <24 x i32>* %ptr to i32* +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv4i32( [[INS1]], [[INS2]], [[INS3]], [[PTRUE]], i32* [[PTR]]) +; CHECK-NEXT: ret void + %s0 = shufflevector <8 x i32> %v0, <8 x i32> %v1, <16 x i32> + %s1 = shufflevector <8 x i32> %v2, <8 x i32> poison, <16 x i32> + %interleaved.vec = shufflevector <16 x i32> %s0, <16 x i32> %s1, <24 x i32> + store <24 x i32> %interleaved.vec, <24 x i32>* %ptr, align 4 + ret void +} + +define void @store_factor4(<16 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1, <4 x i64> %v2, <4 x i64> %v3) #0 { +; CHECK-LABEL: @store_factor4( +; CHECK: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> +; CHECK-NEXT: [[INS3:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %s0, <8 x i64> %s1, <4 x i32> +; CHECK-NEXT: [[INS4:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <16 x i64>* %ptr to i64* +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64( [[INS1]], [[INS2]], [[INS3]], [[INS4]], [[PTRUE]], i64* [[PTR]]) +; CHECK-NEXT: ret void + %s0 = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> + %s1 = shufflevector <4 x i64> %v2, <4 x i64> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i64> %s0, <8 x i64> %s1, <16 x i32> + store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 4 + ret void +} + +define void @load_ptrvec_factor2(<8 x i32*>* %ptr) #0 { +; CHECK-LABEL: @load_ptrvec_factor2( +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <8 x i32*>* %ptr to i64* +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP2]], i64 0) +; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x i32*> +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP3]], i64 0) +; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x i32*> +; CHECK-NEXT: ret void + %interleaved.vec = load <8 x i32*>, <8 x i32*>* %ptr, align 4 + %v0 = 
shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <4 x i32> + %v1 = shufflevector <8 x i32*> %interleaved.vec, <8 x i32*> poison, <4 x i32> + ret void +} + +define void @load_ptrvec_factor3(<12 x i32*>* %ptr) #0 { +; CHECK-LABEL: @load_ptrvec_factor3( +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <12 x i32*>* %ptr to i64* +; CHECK-NEXT: [[LDN:%.*]] = call { , , } @llvm.aarch64.sve.ld3.sret.nxv2i64( [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , } [[LDN]], 2 +; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP2]], i64 0) +; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x i32*> +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , } [[LDN]], 1 +; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP3]], i64 0) +; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x i32*> +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , } [[LDN]], 0 +; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP4]], i64 0) +; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x i32*> +; CHECK-NEXT: ret void + %interleaved.vec = load <12 x i32*>, <12 x i32*>* %ptr, align 4 + %v0 = shufflevector <12 x i32*> %interleaved.vec, <12 x i32*> poison, <4 x i32> + %v1 = shufflevector <12 x i32*> %interleaved.vec, <12 x i32*> poison, <4 x i32> + %v2 = shufflevector <12 x i32*> %interleaved.vec, <12 x i32*> poison, <4 x i32> + ret void +} + +define void @load_ptrvec_factor4(<16 x i32*>* %ptr) #0 { +; CHECK-LABEL: @load_ptrvec_factor4( +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i32*>* %ptr to i64* +; CHECK-NEXT: [[LDN:%.*]] = call { , , , } @llvm.aarch64.sve.ld4.sret.nxv2i64( [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , , , } [[LDN]], 3 +; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP2]], i64 0) +; CHECK-NEXT: [[TOP1:%.*]] = inttoptr <4 x i64> [[EXT1]] to <4 x i32*> +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , , , } [[LDN]], 2 +; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP3]], i64 0) +; CHECK-NEXT: [[TOP2:%.*]] = inttoptr <4 x i64> [[EXT2]] to <4 x i32*> +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , , , } [[LDN]], 1 +; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP4]], i64 0) +; CHECK-NEXT: [[TOP3:%.*]] = inttoptr <4 x i64> [[EXT3]] to <4 x i32*> +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , , , } [[LDN]], 0 +; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP5]], i64 0) +; CHECK-NEXT: [[TOP4:%.*]] = inttoptr <4 x i64> [[EXT4]] to <4 x i32*> +; CHECK-NEXT: ret void + %interleaved.vec = load <16 x i32*>, <16 x i32*>* %ptr, align 4 + %v0 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <4 x i32> + %v1 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <4 x i32> + %v2 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <4 x i32> + %v3 = shufflevector <16 x i32*> %interleaved.vec, <16 x i32*> poison, <4 x i32> + ret void +} + +define void @store_ptrvec_factor2(<8 x i32*>* %ptr, <4 x i32*> %v0, <4 x i32*> %v1) #0 { +; CHECK-LABEL: @store_ptrvec_factor2( +; CHECK-NEXT: [[TOI1:%.*]] = ptrtoint <4 x i32*> %v0 to <4 x i64> 
+; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <4 x i32*> %v1 to <4 x i64> +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[TOI1]], <4 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <8 x i32*>* %ptr to i64* +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[INS1]], [[INS2]], [[PTRUE]], i64* [[PTR]]) +; CHECK-NEXT: ret void + %interleaved.vec = shufflevector <4 x i32*> %v0, <4 x i32*> %v1, <8 x i32> + store <8 x i32*> %interleaved.vec, <8 x i32*>* %ptr, align 4 + ret void +} + +define void @store_ptrvec_factor3(<12 x i32*>* %ptr, <4 x i32*> %v0, <4 x i32*> %v1, <4 x i32*> %v2) #0 { +; CHECK-LABEL: @store_ptrvec_factor3( +; CHECK: [[TOI1:%.*]] = ptrtoint <8 x i32*> %s0 to <8 x i64> +; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x i32*> %s1 to <8 x i64> +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS3:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <12 x i32*>* %ptr to i64* +; CHECK-NEXT: call void @llvm.aarch64.sve.st3.nxv2i64( [[INS1]], [[INS2]], [[INS3]], [[PTRUE]], i64* [[PTR]]) +; CHECK-NEXT: ret void + %s0 = shufflevector <4 x i32*> %v0, <4 x i32*> %v1, <8 x i32> + %s1 = shufflevector <4 x i32*> %v2, <4 x i32*> poison, <8 x i32> + %interleaved.vec = shufflevector <8 x i32*> %s0, <8 x i32*> %s1, <12 x i32> + store <12 x i32*> %interleaved.vec, <12 x i32*>* %ptr, align 4 + ret void +} + +define void @store_ptrvec_factor4(<16 x i32*>* %ptr, <4 x i32*> %v0, <4 x i32*> %v1, <4 x i32*> %v2, <4 x i32*> %v3) #0 { +; CHECK-LABEL: @store_ptrvec_factor4( +; CHECK: [[TOI1:%.*]] = ptrtoint <8 x i32*> %s0 to <8 x i64> +; CHECK-NEXT: [[TOI2:%.*]] = ptrtoint <8 x i32*> %s1 to <8 x i64> +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP1]], i64 0) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS3:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[TOI1]], <8 x i64> [[TOI2]], <4 x i32> +; CHECK-NEXT: [[INS4:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( 
undef, <4 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[PTR:%.*]] = bitcast <16 x i32*>* %ptr to i64* +; CHECK-NEXT: call void @llvm.aarch64.sve.st4.nxv2i64( [[INS1]], [[INS2]], [[INS3]], [[INS4]], [[PTRUE]], i64* [[PTR]]) +; CHECK-NEXT: ret void + %s0 = shufflevector <4 x i32*> %v0, <4 x i32*> %v1, <8 x i32> + %s1 = shufflevector <4 x i32*> %v2, <4 x i32*> %v3, <8 x i32> + %interleaved.vec = shufflevector <8 x i32*> %s0, <8 x i32*> %s1, <16 x i32> + store <16 x i32*> %interleaved.vec, <16 x i32*>* %ptr, align 4 + ret void +} + +define void @load_factor2_wide(<16 x i64>* %ptr) #0 { +; CHECK-LABEL: @load_factor2_wide( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i64>* %ptr to i64* +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXT1:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXT2:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP3]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr i64, i64* [[TMP1]], i32 8 +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( [[PTRUE]], i64* [[TMP4]]) +; CHECK-NEXT: [[TMP5:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[EXT3:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[EXT4:%.*]] = call <4 x i64> @llvm.experimental.vector.extract.v4i64.nxv2i64( [[TMP6]], i64 0) +; CHECK-NEXT: [[TMP7:%.*]] = shufflevector <4 x i64> [[EXT1]], <4 x i64> [[EXT3]], <8 x i32> +; CHECK-NEXT: [[TMP8:%.*]] = shufflevector <4 x i64> [[EXT2]], <4 x i64> [[EXT4]], <8 x i32> +; CHECK-NEXT: ret void + %interleaved.vec = load <16 x i64>, <16 x i64>* %ptr, align 4 + %v0 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> + %v1 = shufflevector <16 x i64> %interleaved.vec, <16 x i64> poison, <8 x i32> + ret void +} + +define void @store_factor2_wide(<16 x i64>* %ptr, <8 x i64> %v0, <8 x i64> %v1) #0 { +; CHECK-LABEL: @store_factor2_wide( +; CHECK-NEXT: [[TMP1:%.*]] = bitcast <16 x i64>* %ptr to i64* +; CHECK-NEXT: [[PTRUE:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 31) +; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> +; CHECK-NEXT: [[INS1:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> +; CHECK-NEXT: [[INS2:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP3]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[INS1]], [[INS2]], [[PTRUE]], i64* [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> +; CHECK-NEXT: [[INS3:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> %v0, <8 x i64> %v1, <4 x i32> +; CHECK-NEXT: [[INS4:%.*]] = call @llvm.experimental.vector.insert.nxv2i64.v4i64( undef, <4 x i64> [[TMP5]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i64, i64* [[TMP1]], i32 8 +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[INS3]], [[INS4]], [[PTRUE]], i64* [[TMP6]]) +; CHECK-NEXT: ret void + %interleaved.vec = 
shufflevector <8 x i64> %v0, <8 x i64> %v1, <16 x i32> + store <16 x i64> %interleaved.vec, <16 x i64>* %ptr, align 4 + ret void +} + +; Check that neon is used for illegal multiples of 128-bit types +define void @load_384bit(<12 x i64>* %ptr) #0 { +; CHECK-LABEL: @load_384bit( +; CHECK: llvm.aarch64.neon.ld2 +; CHECK-NOT: llvm.aarch64.sve.ld2 + %interleaved.vec = load <12 x i64>, <12 x i64>* %ptr, align 4 + %v0 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> + %v1 = shufflevector <12 x i64> %interleaved.vec, <12 x i64> poison, <6 x i32> + ret void +} + +; Check that neon is used for 128-bit vectors +define void @load_128bit(<4 x i64>* %ptr) #0 { +; CHECK-LABEL: @load_128bit( +; CHECK: llvm.aarch64.neon.ld2 +; CHECK-NOT: llvm.aarch64.sve.ld2 + %interleaved.vec = load <4 x i64>, <4 x i64>* %ptr, align 4 + %v0 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> + %v1 = shufflevector <4 x i64> %interleaved.vec, <4 x i64> poison, <2 x i32> + ret void +} + +; Check that correct ptrues are generated for min != max case +define void @load_min_not_max(<8 x i64>* %ptr) #1 { +; CHECK-LABEL: @load_min_not_max( +; CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 4) + %interleaved.vec = load <8 x i64>, <8 x i64>* %ptr, align 4 + %v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> + %v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> + ret void +} + +define void @store_min_not_max(<8 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1) #1 { +; CHECK-LABEL: @store_min_not_max( +; CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 4) + %interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> + store <8 x i64> %interleaved.vec, <8 x i64>* %ptr, align 4 + ret void +} + +; Check that correct ptrues are generated for min > type case +define void @load_min_ge_type(<8 x i64>* %ptr) #2 { +; CHECK-LABEL: @load_min_ge_type( +; CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 4) + %interleaved.vec = load <8 x i64>, <8 x i64>* %ptr, align 4 + %v0 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> + %v1 = shufflevector <8 x i64> %interleaved.vec, <8 x i64> poison, <4 x i32> + ret void +} + +define void @store_min_ge_type(<8 x i64>* %ptr, <4 x i64> %v0, <4 x i64> %v1) #2 { +; CHECK-LABEL: @store_min_ge_type( +; CHECK: call @llvm.aarch64.sve.ptrue.nxv2i1(i32 4) + %interleaved.vec = shufflevector <4 x i64> %v0, <4 x i64> %v1, <8 x i32> + store <8 x i64> %interleaved.vec, <8 x i64>* %ptr, align 4 + ret void +} + +attributes #0 = { vscale_range(2,2) "target-features"="+sve" } +attributes #1 = { vscale_range(2,4) "target-features"="+sve" } +attributes #2 = { vscale_range(4,4) "target-features"="+sve" }
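+
+; Note on the ptrue patterns checked above: the pass asks for a predicate
+; pattern matching the number of fixed elements in each sub-vector (e.g.
+; pattern 4, i.e. vl4, for a <4 x i64> sub-vector) and only upgrades it to
+; pattern 31 (all) when the minimum and maximum SVE vector lengths are equal
+; and exactly match the fixed-length sub-vector, as in the vscale_range(2,2)
+; functions.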