diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2972,6 +2972,28 @@
     return false;
   }
 
+  /// Lower a deinterleave intrinsic to a target-specific load intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.deinterleave2
+  ///
+  /// \p DI is the deinterleave intrinsic.
+  /// \p LI is the accompanying load instruction.
+  virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                LoadInst *LI) const {
+    return false;
+  }
+
+  /// Lower an interleave intrinsic to a target-specific store intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.interleave2
+  ///
+  /// \p II is the interleave intrinsic.
+  /// \p SI is the accompanying store instruction.
+  virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                               StoreInst *SI) const {
+    return false;
+  }
+
   /// Return true if an fpext operation is free (for instance, because
   /// single-precision floating-point numbers are implicitly extended to
   /// double-precision).
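The two hooks above are handed the (de)interleave intrinsic together with the single load or store it is paired with. For illustration only (value names chosen here, not taken from the patch), the scalable IR patterns a target is asked to match look roughly like:

  ; deinterleaving load
  %load = load <vscale x 32 x i8>, ptr %ptr, align 1
  %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)

  ; interleaving store
  %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
  store <vscale x 32 x i8> %interleave, ptr %ptr, align 1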
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -58,6 +58,7 @@
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
 #include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
@@ -113,6 +114,16 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);
 
+  /// Transform a load and a deinterleave intrinsic into target-specific
+  /// instructions.
+  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
+                                  SmallVector<Instruction *, 32> &DeadInsts);
+
+  /// Transform an interleave intrinsic and a store into target-specific
+  /// instructions.
+  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
+                                SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -446,6 +457,47 @@
   return true;
 }
 
+bool InterleavedAccess::lowerDeinterleaveIntrinsic(
+    IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) {
+  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+
+  if (!LI || !LI->hasOneUse() || !LI->isSimple())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+  // Try to match this with target-specific intrinsics.
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI))
+    return false;
+
+  // We now have a target-specific load, so delete the old one.
+  DeadInsts.push_back(DI);
+  DeadInsts.push_back(LI);
+  return true;
+}
+
+bool InterleavedAccess::lowerInterleaveIntrinsic(
+    IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) {
+  if (!II->hasOneUse())
+    return false;
+
+  StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+
+  if (!SI || !SI->isSimple())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+  // Try to match this with target-specific intrinsics.
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI))
+    return false;
+
+  // We now have a target-specific store, so delete the old one.
+  DeadInsts.push_back(SI);
+  DeadInsts.push_back(II);
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -468,6 +520,15 @@
 
     if (auto *SI = dyn_cast<StoreInst>(&I))
       Changed |= lowerInterleavedStore(SI, DeadInsts);
+
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      // At present, we only have intrinsics to represent (de)interleaving
+      // with a factor of 2.
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+    }
   }
 
   for (auto *I : DeadInsts)
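Together with the AArch64 hooks below, the pass now rewrites a fixed-length deinterleaving load such as

  %load = load <32 x i8>, ptr %ptr, align 1
  %deinterleave = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load)

into a single structured NEON load, roughly (result name illustrative):

  %ldN = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr %ptr)

after which the original load and intrinsic are pushed onto DeadInsts and erased. The fixed-length tests added at the end of this patch check exactly this rewrite.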
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -651,6 +651,12 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        LoadInst *LI) const override;
+
+  bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       StoreInst *SI) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14754,12 +14754,18 @@
 bool AArch64TargetLowering::isLegalInterleavedAccessType(
     VectorType *VecTy, const DataLayout &DL, bool &UseScalable) const {
-
   unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
   auto EC = VecTy->getElementCount();
   unsigned MinElts = EC.getKnownMinValue();
 
   UseScalable = false;
+
+  if (!VecTy->isScalableTy() && !Subtarget->hasNEON())
+    return false;
+
+  if (VecTy->isScalableTy() && !Subtarget->hasSVEorSME())
+    return false;
+
+  // Ensure that the predicate for this number of elements is available.
   if (Subtarget->hasSVE() && !getSVEPredPatternFromNumElements(MinElts))
     return false;
 
@@ -14772,8 +14778,10 @@
   if (ElSize != 8 && ElSize != 16 && ElSize != 32 && ElSize != 64)
     return false;
 
-  if (EC.isScalable())
-    return MinElts * ElSize == 128;
+  if (EC.isScalable()) {
+    UseScalable = true;
+    return isPowerOf2_32(MinElts) && (MinElts * ElSize) % 128 == 0;
+  }
 
   unsigned VecSize = DL.getTypeSizeInBits(VecTy);
   if (Subtarget->forceStreamingCompatibleSVE() ||
@@ -14818,6 +14826,38 @@
   llvm_unreachable("Cannot handle input vector type");
 }
 
+static Function *getStructuredLoadFunction(Module *M, unsigned Factor,
+                                           bool Scalable, Type *LDVTy,
+                                           Type *PtrTy) {
+  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
+  static const Intrinsic::ID SVELoads[3] = {Intrinsic::aarch64_sve_ld2_sret,
+                                            Intrinsic::aarch64_sve_ld3_sret,
+                                            Intrinsic::aarch64_sve_ld4_sret};
+  static const Intrinsic::ID NEONLoads[3] = {Intrinsic::aarch64_neon_ld2,
+                                             Intrinsic::aarch64_neon_ld3,
+                                             Intrinsic::aarch64_neon_ld4};
+  if (Scalable)
+    return Intrinsic::getDeclaration(M, SVELoads[Factor - 2], {LDVTy});
+
+  return Intrinsic::getDeclaration(M, NEONLoads[Factor - 2], {LDVTy, PtrTy});
+}
+
+static Function *getStructuredStoreFunction(Module *M, unsigned Factor,
+                                            bool Scalable, Type *STVTy,
+                                            Type *PtrTy) {
+  assert(Factor >= 2 && Factor <= 4 && "Invalid interleave factor");
+  static const Intrinsic::ID SVEStores[3] = {Intrinsic::aarch64_sve_st2,
+                                             Intrinsic::aarch64_sve_st3,
+                                             Intrinsic::aarch64_sve_st4};
+  static const Intrinsic::ID NEONStores[3] = {Intrinsic::aarch64_neon_st2,
+                                              Intrinsic::aarch64_neon_st3,
+                                              Intrinsic::aarch64_neon_st4};
+  if (Scalable)
+    return Intrinsic::getDeclaration(M, SVEStores[Factor - 2], {STVTy});
+
+  return Intrinsic::getDeclaration(M, NEONStores[Factor - 2], {STVTy, PtrTy});
+}
+
 /// Lower an interleaved load into a ldN intrinsic.
 ///
 /// E.g. Lower an interleaved load (Factor = 2):
@@ -14883,26 +14923,12 @@
         LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
   }
 
-  Type *PtrTy =
-      UseScalable
-          ? LDVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace())
-          : LDVTy->getPointerTo(LI->getPointerAddressSpace());
+  Type *PtrTy = LI->getPointerOperandType();
   Type *PredTy = VectorType::get(Type::getInt1Ty(LDVTy->getContext()),
                                  LDVTy->getElementCount());
 
-  static const Intrinsic::ID SVELoadIntrs[3] = {
-      Intrinsic::aarch64_sve_ld2_sret, Intrinsic::aarch64_sve_ld3_sret,
-      Intrinsic::aarch64_sve_ld4_sret};
-  static const Intrinsic::ID NEONLoadIntrs[3] = {Intrinsic::aarch64_neon_ld2,
-                                                 Intrinsic::aarch64_neon_ld3,
-                                                 Intrinsic::aarch64_neon_ld4};
-  Function *LdNFunc;
-  if (UseScalable)
-    LdNFunc = Intrinsic::getDeclaration(LI->getModule(),
-                                        SVELoadIntrs[Factor - 2], {LDVTy});
-  else
-    LdNFunc = Intrinsic::getDeclaration(
-        LI->getModule(), NEONLoadIntrs[Factor - 2], {LDVTy, PtrTy});
+  Function *LdNFunc = getStructuredLoadFunction(LI->getModule(), Factor,
+                                                UseScalable, LDVTy, PtrTy);
 
   // Holds sub-vectors extracted from the load intrinsic return values. The
   // sub-vectors are associated with the shufflevector instructions they will
@@ -15080,26 +15106,12 @@
   if (Factor == 2 && SubVecTy->getPrimitiveSizeInBits() == 64 && Mask[0] != 0)
     return false;
 
-  Type *PtrTy =
-      UseScalable
-          ? STVTy->getElementType()->getPointerTo(SI->getPointerAddressSpace())
-          : STVTy->getPointerTo(SI->getPointerAddressSpace());
+  Type *PtrTy = SI->getPointerOperandType();
   Type *PredTy = VectorType::get(Type::getInt1Ty(STVTy->getContext()),
                                  STVTy->getElementCount());
 
-  static const Intrinsic::ID SVEStoreIntrs[3] = {Intrinsic::aarch64_sve_st2,
-                                                 Intrinsic::aarch64_sve_st3,
-                                                 Intrinsic::aarch64_sve_st4};
-  static const Intrinsic::ID NEONStoreIntrs[3] = {Intrinsic::aarch64_neon_st2,
-                                                  Intrinsic::aarch64_neon_st3,
-                                                  Intrinsic::aarch64_neon_st4};
-  Function *StNFunc;
-  if (UseScalable)
-    StNFunc = Intrinsic::getDeclaration(SI->getModule(),
-                                        SVEStoreIntrs[Factor - 2], {STVTy});
-  else
-    StNFunc = Intrinsic::getDeclaration(
-        SI->getModule(), NEONStoreIntrs[Factor - 2], {STVTy, PtrTy});
+  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
+                                                 UseScalable, STVTy, PtrTy);
 
   Value *PTrue = nullptr;
   if (UseScalable) {
@@ -15169,6 +15181,144 @@
   return true;
 }
 
+bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
+    IntrinsicInst *DI, LoadInst *LI) const {
+  // Only deinterleave2 supported at present.
+  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+    return false;
+
+  // Only a factor of 2 supported at present.
+  const unsigned Factor = 2;
+
+  VectorType *VTy = cast<VectorType>(DI->getType()->getContainedType(0));
+  const DataLayout &DL = DI->getModule()->getDataLayout();
+  bool UseScalable;
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+    return false;
+
+  // TODO: Add support for using SVE instructions with fixed types later, using
+  // the code from lowerInterleavedLoad to obtain the correct container type.
+  if (UseScalable && !VTy->isScalableTy())
+    return false;
+
+  unsigned NumLoads = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+  VectorType *LdTy =
+      VectorType::get(VTy->getElementType(),
+                      VTy->getElementCount().divideCoefficientBy(NumLoads));
+
+  Type *PtrTy = LI->getPointerOperandType();
+  Function *LdNFunc = getStructuredLoadFunction(DI->getModule(), Factor,
+                                                UseScalable, LdTy, PtrTy);
+
+  IRBuilder<> Builder(LI);
+
+  Value *Pred = nullptr;
+  if (UseScalable)
+    Pred =
+        Builder.CreateVectorSplat(LdTy->getElementCount(), Builder.getTrue());
+
+  Value *BaseAddr = LI->getPointerOperand();
+  Value *Result;
+  if (NumLoads > 1) {
+    Value *Left = PoisonValue::get(VTy);
+    Value *Right = PoisonValue::get(VTy);
+
+    for (unsigned I = 0; I < NumLoads; ++I) {
+      Value *Offset = Builder.getInt64(I * Factor);
+
+      Value *Address = Builder.CreateGEP(LdTy, BaseAddr, {Offset});
+      Value *LdN = nullptr;
+      if (UseScalable)
+        LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
+      else
+        LdN = Builder.CreateCall(LdNFunc, Address, "ldN");
+
+      Value *Idx =
+          Builder.getInt64(I * LdTy->getElementCount().getKnownMinValue());
+      Left = Builder.CreateInsertVector(
+          VTy, Left, Builder.CreateExtractValue(LdN, 0), Idx);
+      Right = Builder.CreateInsertVector(
+          VTy, Right, Builder.CreateExtractValue(LdN, 1), Idx);
+    }
+
+    Result = PoisonValue::get(DI->getType());
+    Result = Builder.CreateInsertValue(Result, Left, 0);
+    Result = Builder.CreateInsertValue(Result, Right, 1);
+  } else {
+    if (UseScalable)
+      Result = Builder.CreateCall(LdNFunc, {Pred, BaseAddr}, "ldN");
+    else
+      Result = Builder.CreateCall(LdNFunc, BaseAddr, "ldN");
+  }
+
+  DI->replaceAllUsesWith(Result);
+  return true;
+}
+
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+    IntrinsicInst *II, StoreInst *SI) const {
+  // Only interleave2 supported at present.
+  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+    return false;
+
+  // Only a factor of 2 supported at present.
+  const unsigned Factor = 2;
+
+  VectorType *VTy = cast<VectorType>(II->getOperand(0)->getType());
+  const DataLayout &DL = II->getModule()->getDataLayout();
+  bool UseScalable;
+  if (!isLegalInterleavedAccessType(VTy, DL, UseScalable))
+    return false;
+
+  // TODO: Add support for using SVE instructions with fixed types later, using
+  // the code from lowerInterleavedStore to obtain the correct container type.
+  if (UseScalable && !VTy->isScalableTy())
+    return false;
+
+  unsigned NumStores = getNumInterleavedAccesses(VTy, DL, UseScalable);
+
+  VectorType *StTy =
+      VectorType::get(VTy->getElementType(),
+                      VTy->getElementCount().divideCoefficientBy(NumStores));
+
+  Type *PtrTy = SI->getPointerOperandType();
+  Function *StNFunc = getStructuredStoreFunction(SI->getModule(), Factor,
+                                                 UseScalable, StTy, PtrTy);
+
+  IRBuilder<> Builder(SI);
+
+  Value *BaseAddr = SI->getPointerOperand();
+  Value *Pred = nullptr;
+
+  if (UseScalable)
+    Pred =
+        Builder.CreateVectorSplat(StTy->getElementCount(), Builder.getTrue());
+
+  Value *L = II->getOperand(0);
+  Value *R = II->getOperand(1);
+
+  for (unsigned I = 0; I < NumStores; ++I) {
+    Value *Address = BaseAddr;
+    if (NumStores > 1) {
+      Value *Offset = Builder.getInt64(I * Factor);
+      Address = Builder.CreateGEP(StTy, BaseAddr, {Offset});
+
+      Value *Idx =
+          Builder.getInt64(I * StTy->getElementCount().getKnownMinValue());
+      L = Builder.CreateExtractVector(StTy, II->getOperand(0), Idx);
+      R = Builder.CreateExtractVector(StTy, II->getOperand(1), Idx);
+    }
+
+    if (UseScalable)
+      Builder.CreateCall(StNFunc, {L, R, Pred, Address});
+    else
+      Builder.CreateCall(StNFunc, {L, R, Address});
+  }
+
+  return true;
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
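When the vector type is wider than one legal register group, isLegalInterleavedAccessType still succeeds and NumLoads/NumStores above becomes greater than one; the lowering then emits one ldN/stN per part and recombines the pieces with llvm.vector.insert / llvm.vector.extract. A rough sketch of the wide scalable load case (%ptrue stands in for the splatted all-true predicate the code builds; value names are illustrative):

  %ldN0 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> %ptrue, ptr %ptr)
  %addr1 = getelementptr <vscale x 2 x double>, ptr %ptr, i64 2
  %ldN1 = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> %ptrue, ptr %addr1)
  ; the halves are then combined with @llvm.vector.insert.nxv4f64.nxv2f64 into the
  ; { <vscale x 4 x double>, <vscale x 4 x double> } result of the original intrinsic

The deinterleave_wide_* and interleave_wide_* tests below cover this splitting.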
diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/InterleavedAccess/AArch64/fixed-deinterleave-intrinsics.ll
@@ -0,0 +1,323 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -interleaved-access -S | FileCheck %s --check-prefix=NEON
+; RUN: opt < %s -interleaved-access -mtriple=aarch64-linux-gnu -mattr=+sve -force-streaming-compatible-sve -S | FileCheck %s --check-prefix=SVE-FIXED
+
+target triple = "aarch64-linux-gnu"
+
+define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2(ptr %ptr) {
+; NEON-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; NEON-SAME: (ptr [[PTR:%.*]]) {
+; NEON-NEXT:    [[LDN:%.*]] = call { <16 x i8>, <16 x i8> } @llvm.aarch64.neon.ld2.v16i8.p0(ptr [[PTR]])
+; NEON-NEXT:    ret { <16 x i8>, <16 x i8> } [[LDN]]
+;
+; SVE-FIXED-LABEL: define { <16 x i8>, <16 x i8> } @deinterleave_i8_factor2
+; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; SVE-FIXED-NEXT:    [[LOAD:%.*]] = load <32 x i8>, ptr [[PTR]], align 1
+; SVE-FIXED-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> [[LOAD]])
+; SVE-FIXED-NEXT:    ret { <16 x i8>, <16 x i8> } [[DEINTERLEAVE]]
+;
+  %load = load <32 x i8>, ptr %ptr, align 1
+  %deinterleave = tail call { <16 x i8>, <16 x i8> }
@llvm.experimental.vector.deinterleave2.v32i8(<32 x i8> %load) + ret { <16 x i8>, <16 x i8> } %deinterleave +} + +define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2(ptr %ptr) { +; NEON-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <8 x i16>, <8 x i16> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <8 x i16>, <8 x i16> } @deinterleave_i16_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <16 x i16>, ptr [[PTR]], align 2 +; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <8 x i16>, <8 x i16> } [[DEINTERLEAVE]] +; + %load = load <16 x i16>, ptr %ptr, align 2 + %deinterleave = tail call { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16> %load) + ret { <8 x i16>, <8 x i16> } %deinterleave +} + +define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2(ptr %ptr) { +; NEON-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <4 x i32>, <4 x i32> } @llvm.aarch64.neon.ld2.v4i32.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <4 x i32>, <4 x i32> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <4 x i32>, <4 x i32> } @deinterleave_8xi32_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x i32>, ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <4 x i32>, <4 x i32> } [[DEINTERLEAVE]] +; + %load = load <8 x i32>, ptr %ptr, align 4 + %deinterleave = tail call { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32> %load) + ret { <4 x i32>, <4 x i32> } %deinterleave +} + +define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2(ptr %ptr) { +; NEON-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <2 x i64>, <2 x i64> } @llvm.aarch64.neon.ld2.v2i64.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <2 x i64>, <2 x i64> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <2 x i64>, <2 x i64> } @deinterleave_i64_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x i64>, ptr [[PTR]], align 8 +; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <2 x i64>, <2 x i64> } [[DEINTERLEAVE]] +; + %load = load <4 x i64>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64> %load) + ret { <2 x i64>, <2 x i64> } %deinterleave +} + +define { <4 x float>, <4 x float> } @deinterleave_float_factor2(ptr %ptr) { +; NEON-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <4 x float>, <4 x float> } @llvm.aarch64.neon.ld2.v4f32.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <4 x float>, <4 x float> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <4 x float>, <4 x float> } @deinterleave_float_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <8 x float>, ptr [[PTR]], align 4 +; 
SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <4 x float>, <4 x float> } [[DEINTERLEAVE]] +; + %load = load <8 x float>, ptr %ptr, align 4 + %deinterleave = tail call { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float> %load) + ret { <4 x float>, <4 x float> } %deinterleave +} + +define { <2 x double>, <2 x double> } @deinterleave_double_factor2(ptr %ptr) { +; NEON-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <2 x double>, <2 x double> } @llvm.aarch64.neon.ld2.v2f64.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <2 x double>, <2 x double> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <2 x double>, <2 x double> } @deinterleave_double_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x double>, ptr [[PTR]], align 8 +; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <2 x double>, <2 x double> } [[DEINTERLEAVE]] +; + %load = load <4 x double>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double> %load) + ret { <2 x double>, <2 x double> } %deinterleave +} + +define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2(ptr %ptr) { +; NEON-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[LDN:%.*]] = call { <2 x ptr>, <2 x ptr> } @llvm.aarch64.neon.ld2.v2p0.p0(ptr [[PTR]]) +; NEON-NEXT: ret { <2 x ptr>, <2 x ptr> } [[LDN]] +; +; SVE-FIXED-LABEL: define { <2 x ptr>, <2 x ptr> } @deinterleave_ptr_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <4 x ptr>, ptr [[PTR]], align 8 +; SVE-FIXED-NEXT: [[DEINTERLEAVE:%.*]] = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <2 x ptr>, <2 x ptr> } [[DEINTERLEAVE]] +; + %load = load <4 x ptr>, ptr %ptr, align 8 + %deinterleave = tail call { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr> %load) + ret { <2 x ptr>, <2 x ptr> } %deinterleave +} + +define void @interleave_i8_factor2(ptr %ptr, <16 x i8> %l, <16 x i8> %r) { +; NEON-LABEL: define void @interleave_i8_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v16i8.p0(<16 x i8> [[L]], <16 x i8> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_i8_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <16 x i8> [[L:%.*]], <16 x i8> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> [[L]], <16 x i8> [[R]]) +; SVE-FIXED-NEXT: store <32 x i8> [[INTERLEAVE]], ptr [[PTR]], align 1 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <32 x i8> @llvm.experimental.vector.interleave2.v32i8(<16 x i8> %l, <16 x i8> %r) + store <32 x i8> %interleave, ptr %ptr, align 1 + ret void +} + +define void @interleave_i16_factor2(ptr %ptr, <8 x i16> %l, <8 x i16> %r) { +; NEON-LABEL: define void @interleave_i16_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) { +; NEON-NEXT: call void 
@llvm.aarch64.neon.st2.v8i16.p0(<8 x i16> [[L]], <8 x i16> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_i16_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x i16> [[L:%.*]], <8 x i16> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> [[L]], <8 x i16> [[R]]) +; SVE-FIXED-NEXT: store <16 x i16> [[INTERLEAVE]], ptr [[PTR]], align 2 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16> %l, <8 x i16> %r) + store <16 x i16> %interleave, ptr %ptr, align 2 + ret void +} + +define void @interleave_i32_factor2(ptr %ptr, <4 x i32> %l, <4 x i32> %r) { +; NEON-LABEL: define void @interleave_i32_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4i32.p0(<4 x i32> [[L]], <4 x i32> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_i32_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x i32> [[L:%.*]], <4 x i32> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> [[L]], <4 x i32> [[R]]) +; SVE-FIXED-NEXT: store <8 x i32> [[INTERLEAVE]], ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32> %l, <4 x i32> %r) + store <8 x i32> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_i64_factor2(ptr %ptr, <2 x i64> %l, <2 x i64> %r) { +; NEON-LABEL: define void @interleave_i64_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2i64.p0(<2 x i64> [[L]], <2 x i64> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_i64_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x i64> [[L:%.*]], <2 x i64> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> [[L]], <2 x i64> [[R]]) +; SVE-FIXED-NEXT: store <4 x i64> [[INTERLEAVE]], ptr [[PTR]], align 8 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64> %l, <2 x i64> %r) + store <4 x i64> %interleave, ptr %ptr, align 8 + ret void +} + +define void @interleave_float_factor2(ptr %ptr, <4 x float> %l, <4 x float> %r) { +; NEON-LABEL: define void @interleave_float_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v4f32.p0(<4 x float> [[L]], <4 x float> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_float_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <4 x float> [[L:%.*]], <4 x float> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> [[L]], <4 x float> [[R]]) +; SVE-FIXED-NEXT: store <8 x float> [[INTERLEAVE]], ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float> %l, <4 x float> %r) + store <8 x float> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_double_factor2(ptr %ptr, <2 x double> %l, <2 x double> %r) { +; NEON-LABEL: define void @interleave_double_factor2 +; NEON-SAME: (ptr 
[[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2f64.p0(<2 x double> [[L]], <2 x double> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_double_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x double> [[L:%.*]], <2 x double> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> [[L]], <2 x double> [[R]]) +; SVE-FIXED-NEXT: store <4 x double> [[INTERLEAVE]], ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double> %l, <2 x double> %r) + store <4 x double> %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_ptr_factor2(ptr %ptr, <2 x ptr> %l, <2 x ptr> %r) { +; NEON-LABEL: define void @interleave_ptr_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) { +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[L]], <2 x ptr> [[R]], ptr [[PTR]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_ptr_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <2 x ptr> [[L:%.*]], <2 x ptr> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> [[L]], <2 x ptr> [[R]]) +; SVE-FIXED-NEXT: store <4 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr> %l, <2 x ptr> %r) + store <4 x ptr> %interleave, ptr %ptr, align 4 + ret void +} + +define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2(ptr %ptr) #0 { +; NEON-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2 +; NEON-SAME: (ptr [[PTR:%.*]]) { +; NEON-NEXT: [[TMP1:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 0 +; NEON-NEXT: [[LDN:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP1]]) +; NEON-NEXT: [[TMP2:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 0 +; NEON-NEXT: [[TMP3:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP2]], i64 0) +; NEON-NEXT: [[TMP4:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN]], 1 +; NEON-NEXT: [[TMP5:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> poison, <8 x i16> [[TMP4]], i64 0) +; NEON-NEXT: [[TMP6:%.*]] = getelementptr <8 x i16>, ptr [[PTR]], i64 2 +; NEON-NEXT: [[LDN1:%.*]] = call { <8 x i16>, <8 x i16> } @llvm.aarch64.neon.ld2.v8i16.p0(ptr [[TMP6]]) +; NEON-NEXT: [[TMP7:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 0 +; NEON-NEXT: [[TMP8:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP3]], <8 x i16> [[TMP7]], i64 8) +; NEON-NEXT: [[TMP9:%.*]] = extractvalue { <8 x i16>, <8 x i16> } [[LDN1]], 1 +; NEON-NEXT: [[TMP10:%.*]] = call <16 x i16> @llvm.vector.insert.v16i16.v8i16(<16 x i16> [[TMP5]], <8 x i16> [[TMP9]], i64 8) +; NEON-NEXT: [[TMP11:%.*]] = insertvalue { <16 x i16>, <16 x i16> } poison, <16 x i16> [[TMP8]], 0 +; NEON-NEXT: [[TMP12:%.*]] = insertvalue { <16 x i16>, <16 x i16> } [[TMP11]], <16 x i16> [[TMP10]], 1 +; NEON-NEXT: ret { <16 x i16>, <16 x i16> } [[TMP12]] +; +; SVE-FIXED-LABEL: define { <16 x i16>, <16 x i16> } @deinterleave_wide_i16_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[LOAD:%.*]] = load <32 x i16>, ptr [[PTR]], align 2 +; SVE-FIXED-NEXT: 
[[DEINTERLEAVE:%.*]] = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> [[LOAD]]) +; SVE-FIXED-NEXT: ret { <16 x i16>, <16 x i16> } [[DEINTERLEAVE]] +; + %load = load <32 x i16>, ptr %ptr, align 2 + %deinterleave = tail call { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16> %load) + ret { <16 x i16>, <16 x i16> } %deinterleave +} + +define void @interleave_wide_ptr_factor2(ptr %ptr, <8 x ptr> %l, <8 x ptr> %r) { +; NEON-LABEL: define void @interleave_wide_ptr_factor2 +; NEON-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) { +; NEON-NEXT: [[TMP1:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 0 +; NEON-NEXT: [[TMP2:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 0) +; NEON-NEXT: [[TMP3:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 0) +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP2]], <2 x ptr> [[TMP3]], ptr [[TMP1]]) +; NEON-NEXT: [[TMP4:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 2 +; NEON-NEXT: [[TMP5:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 2) +; NEON-NEXT: [[TMP6:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 2) +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP5]], <2 x ptr> [[TMP6]], ptr [[TMP4]]) +; NEON-NEXT: [[TMP7:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 4 +; NEON-NEXT: [[TMP8:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 4) +; NEON-NEXT: [[TMP9:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 4) +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP8]], <2 x ptr> [[TMP9]], ptr [[TMP7]]) +; NEON-NEXT: [[TMP10:%.*]] = getelementptr <2 x ptr>, ptr [[PTR]], i64 6 +; NEON-NEXT: [[TMP11:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[L]], i64 6) +; NEON-NEXT: [[TMP12:%.*]] = call <2 x ptr> @llvm.vector.extract.v2p0.v8p0(<8 x ptr> [[R]], i64 6) +; NEON-NEXT: call void @llvm.aarch64.neon.st2.v2p0.p0(<2 x ptr> [[TMP11]], <2 x ptr> [[TMP12]], ptr [[TMP10]]) +; NEON-NEXT: ret void +; +; SVE-FIXED-LABEL: define void @interleave_wide_ptr_factor2 +; SVE-FIXED-SAME: (ptr [[PTR:%.*]], <8 x ptr> [[L:%.*]], <8 x ptr> [[R:%.*]]) #[[ATTR0]] { +; SVE-FIXED-NEXT: [[INTERLEAVE:%.*]] = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> [[L]], <8 x ptr> [[R]]) +; SVE-FIXED-NEXT: store <16 x ptr> [[INTERLEAVE]], ptr [[PTR]], align 4 +; SVE-FIXED-NEXT: ret void +; + %interleave = tail call <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr> %l, <8 x ptr> %r) + store <16 x ptr> %interleave, ptr %ptr, align 4 + ret void +} + +declare { <16 x i8>, <16 x i8> } @llvm.experimental.vector.deinterleave2.v32i8(<32 x i8>) +declare { <8 x i16>, <8 x i16> } @llvm.experimental.vector.deinterleave2.v16i16(<16 x i16>) +declare { <4 x i32>, <4 x i32> } @llvm.experimental.vector.deinterleave2.v8i32(<8 x i32>) +declare { <2 x i64>, <2 x i64> } @llvm.experimental.vector.deinterleave2.v4i64(<4 x i64>) +declare { <4 x float>, <4 x float> } @llvm.experimental.vector.deinterleave2.v8f32(<8 x float>) +declare { <2 x double>, <2 x double> } @llvm.experimental.vector.deinterleave2.v4f64(<4 x double>) +declare { <2 x ptr>, <2 x ptr> } @llvm.experimental.vector.deinterleave2.v4p0(<4 x ptr>) +declare { <16 x i16>, <16 x i16> } @llvm.experimental.vector.deinterleave2.v32i16(<32 x i16>) + +declare <32 x i8> 
@llvm.experimental.vector.interleave2.v32i8(<16 x i8>, <16 x i8>) +declare <16 x i16> @llvm.experimental.vector.interleave2.v16i16(<8 x i16>, <8 x i16>) +declare <8 x i32> @llvm.experimental.vector.interleave2.v8i32(<4 x i32>, <4 x i32>) +declare <4 x i64> @llvm.experimental.vector.interleave2.v4i64(<2 x i64>, <2 x i64>) +declare <8 x float> @llvm.experimental.vector.interleave2.v8f32(<4 x float>, <4 x float>) +declare <4 x double> @llvm.experimental.vector.interleave2.v4f64(<2 x double>, <2 x double>) +declare <4 x ptr> @llvm.experimental.vector.interleave2.v4p0(<2 x ptr>, <2 x ptr>) +declare <16 x ptr> @llvm.experimental.vector.interleave2.v16p0(<8 x ptr>, <8 x ptr>) diff --git a/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Transforms/InterleavedAccess/AArch64/scalable-deinterleave-intrinsics.ll @@ -0,0 +1,263 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2 +; RUN: opt < %s -interleaved-access -S | FileCheck %s + +target triple = "aarch64-linux-gnu" + +define { , } @deinterleave_nxi8_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxi8_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv16i8( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 1 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i8( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nxi16_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxi16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv8i16( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 2 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv16i16( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nx8xi32_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nx8xi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8i32( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nxi64_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxi64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2i64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4i64( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nxfloat_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxfloat_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4f32( shufflevector 
( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f32( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nxdouble_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4f64( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_nxptr_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_nxptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2p0( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret { , } [[LDN]] +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv4p0( %load) + ret { , } %deinterleave +} + +define void @interleave_nxi8_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi8_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv16i8( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv32i8( %l, %r) + store %interleave, ptr %ptr, align 1 + ret void +} + +define void @interleave_nxi16_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi16_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv8i16( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv16i16( %l, %r) + store %interleave, ptr %ptr, align 2 + ret void +} + +define void @interleave_nxi32_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4i32( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8i32( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxi64_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxi64_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2i64( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4i64( %l, %r) + store %interleave, ptr %ptr, align 8 + ret void +} + +define void @interleave_nxfloat_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxfloat_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) 
#[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv4f32( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8f32( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxdouble_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4f64( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +define void @interleave_nxptr_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_nxptr_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2p0( [[L]], [[R]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[PTR]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv4p0( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +;;; Check that we 'legalize' operations that are wider than the target supports. + +define { , } @deinterleave_wide_nxi32_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_wide_nxi32_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( poison, [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[PTR]], i64 2 +; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP3]], [[TMP7]], i64 4) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP5]], [[TMP9]], i64 4) +; CHECK-NEXT: [[TMP11:%.*]] = getelementptr , ptr [[PTR]], i64 4 +; CHECK-NEXT: [[LDN2:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP11]]) +; CHECK-NEXT: [[TMP12:%.*]] = extractvalue { , } [[LDN2]], 0 +; CHECK-NEXT: [[TMP13:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP8]], [[TMP12]], i64 8) +; CHECK-NEXT: [[TMP14:%.*]] = extractvalue { , } [[LDN2]], 1 +; CHECK-NEXT: [[TMP15:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP10]], [[TMP14]], i64 8) +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr , ptr [[PTR]], i64 6 +; CHECK-NEXT: [[LDN3:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv4i32( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP16]]) 
+; CHECK-NEXT: [[TMP17:%.*]] = extractvalue { , } [[LDN3]], 0 +; CHECK-NEXT: [[TMP18:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP13]], [[TMP17]], i64 12) +; CHECK-NEXT: [[TMP19:%.*]] = extractvalue { , } [[LDN3]], 1 +; CHECK-NEXT: [[TMP20:%.*]] = call @llvm.vector.insert.nxv16i32.nxv4i32( [[TMP15]], [[TMP19]], i64 12) +; CHECK-NEXT: [[TMP21:%.*]] = insertvalue { , } poison, [[TMP18]], 0 +; CHECK-NEXT: [[TMP22:%.*]] = insertvalue { , } [[TMP21]], [[TMP20]], 1 +; CHECK-NEXT: ret { , } [[TMP22]] +; + %load = load , ptr %ptr, align 4 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv32i32( %load) + ret { , } %deinterleave +} + +define { , } @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 { +; CHECK-LABEL: define { , } @deinterleave_wide_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[PTR]], i64 0 +; CHECK-NEXT: [[LDN:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP2:%.*]] = extractvalue { , } [[LDN]], 0 +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP2]], i64 0) +; CHECK-NEXT: [[TMP4:%.*]] = extractvalue { , } [[LDN]], 1 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( poison, [[TMP4]], i64 0) +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr , ptr [[PTR]], i64 2 +; CHECK-NEXT: [[LDN1:%.*]] = call { , } @llvm.aarch64.sve.ld2.sret.nxv2f64( shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP6]]) +; CHECK-NEXT: [[TMP7:%.*]] = extractvalue { , } [[LDN1]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP3]], [[TMP7]], i64 2) +; CHECK-NEXT: [[TMP9:%.*]] = extractvalue { , } [[LDN1]], 1 +; CHECK-NEXT: [[TMP10:%.*]] = call @llvm.vector.insert.nxv4f64.nxv2f64( [[TMP5]], [[TMP9]], i64 2) +; CHECK-NEXT: [[TMP11:%.*]] = insertvalue { , } poison, [[TMP8]], 0 +; CHECK-NEXT: [[TMP12:%.*]] = insertvalue { , } [[TMP11]], [[TMP10]], 1 +; CHECK-NEXT: ret { , } [[TMP12]] +; + %load = load , ptr %ptr, align 8 + %deinterleave = tail call { , } @llvm.experimental.vector.deinterleave2.nxv8f64( %load) + ret { , } %deinterleave +} + +define void @interleave_wide_nxdouble_factor2(ptr %ptr, %l, %r) #0 { +; CHECK-LABEL: define void @interleave_wide_nxdouble_factor2 +; CHECK-SAME: (ptr [[PTR:%.*]], [[L:%.*]], [[R:%.*]]) #[[ATTR0]] { +; CHECK-NEXT: [[TMP1:%.*]] = getelementptr , ptr [[PTR]], i64 0 +; CHECK-NEXT: [[TMP2:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[L]], i64 0) +; CHECK-NEXT: [[TMP3:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[R]], i64 0) +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP2]], [[TMP3]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP1]]) +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr , ptr [[PTR]], i64 2 +; CHECK-NEXT: [[TMP5:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[L]], i64 2) +; CHECK-NEXT: [[TMP6:%.*]] = call @llvm.vector.extract.nxv2f64.nxv4f64( [[R]], i64 2) +; CHECK-NEXT: call void @llvm.aarch64.sve.st2.nxv2f64( [[TMP5]], [[TMP6]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), ptr [[TMP4]]) +; CHECK-NEXT: ret void +; + %interleave = tail call @llvm.experimental.vector.interleave2.nxv8f64( %l, %r) + store %interleave, ptr %ptr, align 4 + ret void +} + +declare { , } @llvm.experimental.vector.deinterleave2.nxv32i8() +declare 
{ <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+declare { <vscale x 2 x ptr>, <vscale x 2 x ptr> } @llvm.experimental.vector.deinterleave2.nxv4p0(<vscale x 4 x ptr>)
+
+; Larger deinterleaves to test 'legalization'
+declare { <vscale x 16 x i32>, <vscale x 16 x i32> } @llvm.experimental.vector.deinterleave2.nxv32i32(<vscale x 32 x i32>)
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+
+declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8>, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16>, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32>, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64>, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+declare <vscale x 4 x ptr> @llvm.experimental.vector.interleave2.nxv4p0(<vscale x 2 x ptr>, <vscale x 2 x ptr>)
+
+; Larger interleaves to test 'legalization'
+declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
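In the store direction on a scalable target, the net effect the tests above check is, roughly (%ptrue again denoting the splatted all-true predicate; names illustrative):

  ; before the pass
  %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
  store <vscale x 4 x double> %interleave, ptr %ptr, align 4

  ; after the pass
  call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r, <vscale x 2 x i1> %ptrue, ptr %ptr)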