Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2939,6 +2939,28 @@
     return false;
   }

+  /// Lower a deinterleave intrinsic to a target specific load intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.deinterleave2
+  ///
+  /// \p DI is the deinterleave intrinsic.
+  /// \p Address is the pointer operand from the accompanying load.
+  virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                Value *Address) const {
+    return false;
+  }
+
+  /// Lower an interleave intrinsic to a target specific store intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.interleave2
+  ///
+  /// \p II is the interleave intrinsic.
+  /// \p Address is the pointer operand from the accompanying store.
+  virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                               Value *Address) const {
+    return false;
+  }
+
   /// Return true if an fpext operation is free (for instance, because
   /// single-precision floating-point numbers are implicitly extended to
   /// double-precision).
Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -113,6 +113,12 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);

+  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
+                                  SmallVector<Instruction *, 32> &DeadInsts);
+
+  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
+                                SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -446,6 +452,47 @@
   return true;
 }

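+// A minimal sketch of the pattern the two matchers below handle (illustrative
+// only; the IR types are one example taken from the accompanying tests):
+//
+//   %wide = load <vscale x 32 x i8>, ptr %ptr
+//   %deint = call { <vscale x 16 x i8>, <vscale x 16 x i8> }
+//       @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %wide)
+//
+// When the load/store is simple and the intrinsic is its only user, the pair
+// is handed to the new TLI hooks and, on success, both instructions are
+// queued for deletion.
+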
+bool InterleavedAccess::lowerDeinterleaveIntrinsic(
+    IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) {
+  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+
+  if (!LI || !LI->hasOneUse() || !LI->isSimple())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+  // Try and match this with target specific intrinsics.
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI->getPointerOperand()))
+    return false;
+
+  // We now have a target-specific load, so delete the old one.
+  DeadInsts.push_back(DI);
+  DeadInsts.push_back(LI);
+  return true;
+}
+
+bool InterleavedAccess::lowerInterleaveIntrinsic(
+    IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) {
+  if (!II->hasOneUse())
+    return false;
+
+  StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+
+  if (!SI || !SI->isSimple())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+  // Try and match this with target specific intrinsics.
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI->getPointerOperand()))
+    return false;
+
+  // We now have a target-specific store, so delete the old one.
+  DeadInsts.push_back(SI);
+  DeadInsts.push_back(II);
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -468,6 +515,13 @@

     if (auto *SI = dyn_cast<StoreInst>(&I))
       Changed |= lowerInterleavedStore(SI, DeadInsts);
+
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+    }
   }

   for (auto *I : DeadInsts)
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -646,6 +646,12 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;

+  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        Value *Address) const override;
+
+  bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       Value *Address) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;

Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14343,9 +14343,11 @@
 unsigned AArch64TargetLowering::getNumInterleavedAccesses(
     VectorType *VecTy, const DataLayout &DL, bool UseScalable) const {
   unsigned VecSize = 128;
+  unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
+  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();
   if (UseScalable)
     VecSize = std::max(Subtarget->getMinSVEVectorSizeInBits(), 128u);
-  return std::max<unsigned>(1, (DL.getTypeSizeInBits(VecTy) + 127) / VecSize);
+  return std::max<unsigned>(1, (ElSize * MinElts + 127) / VecSize);
 }

 MachineMemOperand::Flags
@@ -14358,30 +14360,34 @@

 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
-
-  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
-  unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
+  unsigned MinElts = VecTy->getElementCount().getKnownMinValue();

   UseScalable = false;

   // Ensure that the predicate for this number of elements is available.
-  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(NumElements))
+  if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
     return false;

   // Ensure the number of vector elements is greater than 1.
-  if (NumElements < 2)
+  if (MinElts < 2)
     return false;

   // Ensure the element type is legal.
   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
     return false;

+  if (VecTy->isScalableTy()) {
+    UseScalable = true;
+    return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
+  }
+
+  unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   if (Subtarget->forceStreamingCompatibleSVE() ||
       (Subtarget->useSVEForFixedLengthVectors() &&
       (VecSize % Subtarget->getMinSVEVectorSizeInBits() == 0 ||
        (VecSize < Subtarget->getMinSVEVectorSizeInBits() &&
-        isPowerOf2_32(NumElements) && VecSize > 128)))) {
+        isPowerOf2_32(MinElts) && VecSize > 128)))) {
     UseScalable = true;
     return true;
   }
@@ -14419,6 +14425,36 @@
   llvm_unreachable("Cannot handle input vector type");
 }

+static Function *GetStructuredLoadFunction(Module *M, unsigned Factor,
+                                           bool Scalable, Type *LDVTy,
+                                           Type *PtrTy) {
+  static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
+                                            Intrinsic::aarch64_sve_ld3_sret,
+                                            Intrinsic::aarch64_sve_ld4_sret};
+  static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
+                                             Intrinsic::aarch64_neon_ld3,
+                                             Intrinsic::aarch64_neon_ld4};
+  if (Scalable)
+    return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
+
+  return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
+}
+
+static Function *GetStructuredStoreFunction(Module *M, unsigned Factor,
+                                            bool Scalable, Type *STVTy,
+                                            Type *PtrTy) {
+  static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
+                                             Intrinsic::aarch64_sve_st3,
+                                             Intrinsic::aarch64_sve_st4};
+  static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
+                                              Intrinsic::aarch64_neon_st3,
+                                              Intrinsic::aarch64_neon_st4};
+  if (Scalable)
+    return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
+
+  return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
+}
+
 /// Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -14491,19 +14527,8 @@
   Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
                                  LDVTy->getElementCount());

-  static const Intrinsic::ID SVELoadIntrs[3] = {
-      Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
-      Intrinsic::aarch64_sve_ld4_sret};
-  static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
-                                                 Intrinsic::aarch64_neon_ld3,
-                                                 Intrinsic::aarch64_neon_ld4};
-  Function *LdNFunc;
-  if (UseScalable)
-    LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
-                                        SVELoadIntrs[Factor - 2], {LDVTy});
-  else
-    LdNFunc = Intrinsic::getDeclaration(
-        LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
+  Function *LdNFunc = GetStructuredLoadFunction(LI->getModule(), Factor,
+                                                UseScalable, LDVTy, PtrTy);

   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
@@ -14688,19 +14713,8 @@
   Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
                                  STVTy->getElementCount());

-  static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
-                                                 Intrinsic::aarch64_sve_st3,
-                                                 Intrinsic::aarch64_sve_st4};
-  static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
-                                                  Intrinsic::aarch64_neon_st3,
-                                                  Intrinsic::aarch64_neon_st4};
-  Function *StNFunc;
-  if (UseScalable)
-    StNFunc = Intrinsic::getDeclaration(SI->getModule(),
-                                        SVEStoreIntrs[Factor - 2], {STVTy});
-  else
-    StNFunc = Intrinsic::getDeclaration(
-        SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
+  Function *StNFunc = GetStructuredStoreFunction(SI->getModule(), Factor,
+                                                 UseScalable, STVTy, PtrTy);

   Value *PTrue = nullptr;
   if (UseScalable) {
@@ -14770,6 +14784,163 @@
   return true;
 }

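+// Illustrative mapping performed by lowerDeinterleaveIntrinsicToLoad (a
+// sketch; the types are one case from the accompanying tests):
+//
+//   %wide = load <vscale x 8 x i32>, ptr %ptr
+//   %deint = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+//       @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %wide)
+// becomes
+//   %ldN = call { <vscale x 4 x i32>, <vscale x 4 x i32> }
+//       @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> <all-true splat>, ptr %ptr)
+//
+// Types wider than a legal register group are split into NumLoads parts and
+// reassembled with llvm.vector.insert.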
+bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
+    IntrinsicInst *DI, Value *Address) const {
+  // Only deinterleave2 supported at present
+  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+    return false;
+
+  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  if (!Subtarget->hasNEON() || (VTy->isScalableTy() && !Subtarget->hasSVE()))
+    return false;
+
+  const DataLayout &DL = DI->getModule()->getDataLayout();
+  bool UseScalable;
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+    return false;
+
+  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+  // Grab the original load in order to get the correct insertion point to
+  // preserve load-store ordering.
+  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+  if (!LI)
+    return false;
+
+  VectorType *LdTy =
+      VectorType::get(VTy->getElementType(),
+                      VTy->getElementCount().divideCoefficientBy(NumLoads));
+
+  Type *PtrTy =
+      UseScalable
+          ? LdTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
+          : LdTy->getPointerTo(LI->getPointerAddressSpace());
+
+  Function *LdNFunc = GetStructuredLoadFunction(DI->getModule(), /*Factor=*/2,
+                                                UseScalable, LdTy, PtrTy);
+
+  IRBuilder<> Builder(LI);
+
+  Value *Pred = nullptr;
+  if (UseScalable)
+    Pred = Builder.CreateVectorSplat(
+        LdTy->getElementCount(),
+        ConstantInt::getTrue(IntegerType::getInt1Ty(VTy->getContext())));
+
+  Value *BaseAddr = Builder.CreateBitCast(Address, PtrTy);
+  Value *Result;
+  if (NumLoads > 1) {
+    Value *Left = PoisonValue::get(VTy);
+    Value *Right = PoisonValue::get(VTy);
+
+    for (unsigned I = 0; I < NumLoads; ++I) {
+      Value *Idx =
+          ConstantInt::get(Type::getInt64Ty(VTy->getContext()),
+                           I * LdTy->getElementCount().getKnownMinValue());
+      Value *Offset = Idx;
+      if (LdTy->isScalableTy())
+        Offset = Builder.CreateVScale(cast<ConstantInt>(Offset));
+      BaseAddr = Builder.CreateGEP(LdTy->getElementType(), BaseAddr, {Offset});
+      Value *LdN = nullptr;
+      if (UseScalable)
+        LdN = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+      else
+        LdN = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+
+      Left = Builder.CreateInsertVector(
+          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
+      Right = Builder.CreateInsertVector(
+          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+    }
+
+    Result = PoisonValue::get(DI->getType());
+    Result = Builder.CreateInsertValue(Result, Left, 0);
+    Result = Builder.CreateInsertValue(Result, Right, 1);
+  } else {
+    if (UseScalable)
+      Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+    else
+      Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+  }
+
+  DI->replaceAllUsesWith(Result);
+  return true;
+}
+
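+// Illustrative mapping performed by lowerInterleaveIntrinsicToStore (again a
+// sketch, mirroring one case from the accompanying tests):
+//
+//   %ileave = call <vscale x 8 x i32>
+//       @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l,
+//                                                     <vscale x 4 x i32> %r)
+//   store <vscale x 8 x i32> %ileave, ptr %ptr
+// becomes
+//   call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> %l,
+//       <vscale x 4 x i32> %r, <vscale x 4 x i1> <all-true splat>, ptr %ptr)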
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+    IntrinsicInst *II, Value *Address) const {
+  // Only interleave2 supported at present
+  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+    return false;
+
+  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+  if (!Subtarget->hasNEON() || (VTy->isScalableTy() && !Subtarget->hasSVE()))
+    return false;
+
+  const DataLayout &DL = II->getModule()->getDataLayout();
+  bool UseScalable;
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+    return false;
+
+  // Need to find the store to obtain the correct insertion point to preserve
+  // load-store ordering.
+  if (!II->hasOneUse())
+    return false;
+
+  StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+  if (!SI)
+    return false;
+
+  unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+  VectorType *StTy =
+      VectorType::get(VTy->getElementType(),
+                      VTy->getElementCount().divideCoefficientBy(NumStores));
+
+  Type *PtrTy =
+      UseScalable
+          ? StTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
+          : StTy->getPointerTo(SI->getPointerAddressSpace());
+
+  Function *StNFunc = GetStructuredStoreFunction(SI->getModule(), /*Factor=*/2,
+                                                 UseScalable, StTy, PtrTy);
+
+  IRBuilder<> Builder(SI);
+
+  Value *BaseAddr = Builder.CreateBitCast(Address, PtrTy);
+  Value *Pred = nullptr;
+
+  if (UseScalable)
+    Pred = Builder.CreateVectorSplat(
+        StTy->getElementCount(),
+        ConstantInt::getTrue(IntegerType::getInt1Ty(StTy->getContext())));
+
+  Value *L = II->getOperand(0);
+  Value *R = II->getOperand(1);
+
+  for (unsigned I = 0; I < NumStores; ++I) {
+    if (NumStores > 1) {
+      Value *Idx =
+          ConstantInt::get(Type::getInt64Ty(VTy->getContext()),
+                           I * StTy->getElementCount().getKnownMinValue());
+      Value *Offset = Idx;
+      if (StTy->isScalableTy())
+        Offset = Builder.CreateVScale(cast<ConstantInt>(Offset));
+      BaseAddr = Builder.CreateGEP(StTy->getElementType(), BaseAddr, {Offset});
+
+      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
+      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+    }
+
+    if (UseScalable)
+      Builder.CreateCall(StNFunc, {L, R, Pred, BaseAddr});
+    else
+      Builder.CreateCall(StNFunc, {L, R, BaseAddr});
+  }
+
+  return true;
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
Index: llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave-intrinsics.ll
@@ -0,0 +1,567 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+;;; Scalable types
+
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 32 x i8>, ptr %ptr, align 1
+  %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+  %l = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+  %r = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 16 x i16>, ptr %ptr, align 2
+  %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+  %l = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 0
+  %r = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nx8xi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nx8xi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (
insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_nxi64_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_nxi64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_nxfloat_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4f32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_nxptr_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_nxptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2p0( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4p0( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @interleave_nxi8_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi8_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 
0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv32i8( %l, %r) + store %interleave, ptr %ptr, align 1 + ret void +} + +define void @interleave_nxi16_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv16i16( %l, %r) + store %interleave, ptr %ptr, align 2 + ret void +} + +define void @interleave_nxi32_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8i32( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxi64_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4i64( %l, %r) + store %interleave, ptr %ptr, align 8 + ret void +} + +define void @interleave_nxfloat_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxfloat_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8f32( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxdouble_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4f64( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxptr_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2p0( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4p0( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +;;; Fixed types + +define void @deinterleave_i8_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_i8_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } 
@llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <16 x i8>, <16 x i8> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <32 x i8>, ptr %ptr, align 1 + %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load) + %l = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 0 + %r = extractvalue { <16 x i8>, <16 x i8> } %deinterleave, 1 + ret void +} + +define void @deinterleave_i16_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_i16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <16 x i16>, ptr %ptr, align 2 + %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load) + %l = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 0 + %r = extractvalue { <8 x i16>, <8 x i16> } %deinterleave, 1 + ret void +} + +define void @deinterleave_8xi32_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_8xi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <4 x i32>, <4 x i32> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <8 x i32>, ptr %ptr, align 4 + %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load) + %l = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 0 + %r = extractvalue { <4 x i32>, <4 x i32> } %deinterleave, 1 + ret void +} + +define void @deinterleave_i64_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_i64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <2 x i64>, <2 x i64> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <4 x i64>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load) + %l = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 0 + %r = extractvalue { <2 x i64>, <2 x i64> } %deinterleave, 1 + ret void +} + +define void @deinterleave_float_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_float_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <4 x float>, <4 x float> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <8 x float>, ptr %ptr, align 4 + %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load) + %l = extractvalue { <4 x float>, <4 x float> } %deinterleave, 0 + %r = extractvalue { <4 x float>, <4 x float> } %deinterleave, 1 + ret void +} + +define void 
@deinterleave_double_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_double_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <2 x double>, <2 x double> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <4 x double>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load) + %l = extractvalue { <2 x double>, <2 x double> } %deinterleave, 0 + %r = extractvalue { <2 x double>, <2 x double> } %deinterleave, 1 + ret void +} + +define void @deinterleave_ptr_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_ptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]]) +; CHECK-NEXT: [[L:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <2 x ptr>, <2 x ptr> } [[LDN]], 1 +; CHECK-NEXT: ret void +; + %load = load <4 x ptr>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load) + %l = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 0 + %r = extractvalue { <2 x ptr>, <2 x ptr> } %deinterleave, 1 + ret void +} + +define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) #0 { +; CHECK-LABEL: define void @interleave_i8_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[L]], <16 x i8> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r) + store <32 x i8> %interleave, ptr %ptr, align 1 + ret void +} + +define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) #0 { +; CHECK-LABEL: define void @interleave_i16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[L]], <8 x i16> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r) + store <16 x i16> %interleave, ptr %ptr, align 2 + ret void +} + +define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) #0 { +; CHECK-LABEL: define void @interleave_i32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[L]], <4 x i32> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r) + store <8 x i32> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) #0 { +; CHECK-LABEL: define void @interleave_i64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[L]], <2 x i64> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r) + store <4 x 
i64> %interleave, ptr %ptr, align 8 + ret void +} + +define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r) #0 { +; CHECK-LABEL: define void @interleave_float_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[L]], <4 x float> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r) + store <8 x float> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %r) #0 { +; CHECK-LABEL: define void @interleave_double_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[L]], <2 x double> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r) + store <4 x double> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) #0 { +; CHECK-LABEL: define void @interleave_ptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[L]], <2 x ptr> [[R]], ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r) + store <4 x ptr> %interleave, ptr %ptr, align 4 + ret void +} + +;;; Check that we 'legalize' operations that are wider than the target supports. + +define void @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_wide_nxi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i32, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i32, ptr [[TMP1]], i64 [[TMP7]] +; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP9]], i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP11]], i64 4) +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP14:%.*]] = mul i64 [[TMP13]], 8 +; CHECK-NEXT: [[TMP15:%.*]] = getelementptr i32, ptr [[TMP8]], i64 [[TMP14]] +; CHECK-NEXT: [[LDN2:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP15]]) +; CHECK-NEXT: 
[[TMP16:%.*]] = extractvalue { , } [[LDN2]], 0 +; CHECK-NEXT: [[TMP17:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP16]], i64 8) +; CHECK-NEXT: [[TMP18:%.*]] = extractvalue { , } [[LDN2]], 1 +; CHECK-NEXT: [[TMP19:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP12]], [[TMP18]], i64 8) +; CHECK-NEXT: [[TMP20:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP21:%.*]] = mul i64 [[TMP20]], 12 +; CHECK-NEXT: [[TMP22:%.*]] = getelementptr i32, ptr [[TMP15]], i64 [[TMP21]] +; CHECK-NEXT: [[LDN3:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP22]]) +; CHECK-NEXT: [[TMP23:%.*]] = extractvalue { , } [[LDN3]], 0 +; CHECK-NEXT: [[TMP24:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP17]], [[TMP23]], i64 12) +; CHECK-NEXT: [[TMP25:%.*]] = extractvalue { , } [[LDN3]], 1 +; CHECK-NEXT: [[TMP26:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP19]], [[TMP25]], i64 12) +; CHECK-NEXT: [[TMP27:%.*]] = insertvalue { , } poison, [[TMP24]], 0 +; CHECK-NEXT: [[TMP28:%.*]] = insertvalue { , } [[TMP27]], [[TMP26]], 1 +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[TMP28]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[TMP28]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i32( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void @deinterleave_wide_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr double, ptr [[TMP1]], i64 [[TMP7]] +; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP8]]) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP11]], i64 2) +; CHECK-NEXT: [[TMP13:%.*]] = insertvalue { , } poison, [[TMP10]], 0 +; CHECK-NEXT: [[TMP14:%.*]] = insertvalue { , } [[TMP13]], [[TMP12]], 1 +; CHECK-NEXT: [[L:%.*]] = extractvalue { , } [[TMP14]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { , } [[TMP14]], 1 +; CHECK-NEXT: ret void +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %load) + %l = extractvalue { , } %deinterleave, 0 + %r = extractvalue { , } %deinterleave, 1 + ret void +} + +define void @deinterleave_wide_i16_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define void 
@deinterleave_wide_i16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr i16, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i16, ptr [[TMP1]], i64 8 +; CHECK-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1 +; CHECK-NEXT: [[L:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 0 +; CHECK-NEXT: [[R:%.*]] = extractvalue { <16 x i16>, <16 x i16> } [[TMP12]], 1 +; CHECK-NEXT: ret void +; + %load = load <32 x i16>, ptr %ptr, align 2 + %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load) + %l = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 0 + %r = extractvalue { <16 x i16>, <16 x i16> } %deinterleave, 1 + ret void +} + +define void @interleave_wide_nxdouble_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_wide_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr double, ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[L]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[R]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr double, ptr [[TMP1]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[L]], i64 2) +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[R]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP7]], [[TMP8]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP6]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8f64( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) #0 { +; CHECK-LABEL: define void @interleave_wide_ptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr ptr, ptr 
[[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP2]], <2 x ptr> [[TMP3]], ptr [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr ptr, ptr [[TMP1]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP5]], <2 x ptr> [[TMP6]], ptr [[TMP4]]) +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr ptr, ptr [[TMP4]], i64 4 +; CHECK-NEXT: [[TMP8:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 4) +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP8]], <2 x ptr> [[TMP9]], ptr [[TMP7]]) +; CHECK-NEXT: [[TMP10:%.*]] = getelementptr ptr, ptr [[TMP7]], i64 6 +; CHECK-NEXT: [[TMP11:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 6) +; CHECK-NEXT: [[TMP12:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 6) +; CHECK-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP11]], <2 x ptr> [[TMP12]], ptr [[TMP10]]) +; CHECK-NEXT: ret void +; + %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r) + store <16 x ptr> %interleave, ptr %ptr, align 4 + ret void +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare { , } @llvm.experimental.vector.deinterleave2.nxv16i16() +declare { , } @llvm.experimental.vector.deinterleave2.nxv8i32() +declare { , } @llvm.experimental.vector.deinterleave2.nxv4i64() +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f32() +declare { , } @llvm.experimental.vector.deinterleave2.nxv4f64() +declare { , } @llvm.experimental.vector.deinterleave2.nxv4p0() + +declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) +declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) +declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>) + +; Larger deinterleaves to test 'legalization' +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i32() +declare { , } @llvm.experimental.vector.deinterleave2.nxv8f64() +declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>) + +declare @llvm.experimental.vector.interleave2.nxv32i8(, ) +declare @llvm.experimental.vector.interleave2.nxv16i16(, ) +declare @llvm.experimental.vector.interleave2.nxv8i32(, ) +declare @llvm.experimental.vector.interleave2.nxv4i64(, ) +declare @llvm.experimental.vector.interleave2.nxv8f32(, ) +declare @llvm.experimental.vector.interleave2.nxv4f64(, ) +declare @llvm.experimental.vector.interleave2.nxv4p0(, ) + +declare <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x 
i8>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) +declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>) + +; Larger interleaves to test 'legalization' +declare @llvm.experimental.vector.interleave2.nxv8f64(, ) +declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>) + +attributes #0 = { vscale_range(1,16) "target-features"="+sve" }