Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2931,6 +2931,28 @@
     return false;
   }
 
+  /// Lower a deinterleave intrinsic to a target specific load intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.deinterleave2
+  ///
+  /// \p DI is the deinterleave intrinsic.
+  /// \p Address is the pointer operand from the accompanying load.
+  virtual bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                                Value *Address) const {
+    return false;
+  }
+
+  /// Lower an interleave intrinsic to a target specific store intrinsic.
+  /// Return true on success. Currently only supports
+  /// llvm.experimental.vector.interleave2
+  ///
+  /// \p II is the interleave intrinsic.
+  /// \p Address is the pointer operand from the accompanying store.
+  virtual bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                               Value *Address) const {
+    return false;
+  }
+
   /// Return true if zero-extending the specific node Val to type VT2 is free
   /// (either because it's implicitly zero-extended such as ARM ldrb / ldrh or
   /// because it's folded such as X86 zero-extending loads).
Index: llvm/lib/CodeGen/InterleavedAccessPass.cpp
===================================================================
--- llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -113,6 +113,12 @@
   bool lowerInterleavedStore(StoreInst *SI,
                              SmallVector<Instruction *, 32> &DeadInsts);
 
+  bool lowerDeinterleaveIntrinsic(IntrinsicInst *II,
+                                  SmallVector<Instruction *, 32> &DeadInsts);
+
+  bool lowerInterleaveIntrinsic(IntrinsicInst *II,
+                                SmallVector<Instruction *, 32> &DeadInsts);
+
   /// Returns true if the uses of an interleaved load by the
   /// extractelement instructions in \p Extracts can be replaced by uses of the
   /// shufflevector instructions in \p Shuffles instead. If so, the necessary
@@ -444,6 +450,47 @@
   return true;
 }
 
+bool InterleavedAccess::lowerDeinterleaveIntrinsic(
+    IntrinsicInst *DI, SmallVector<Instruction *, 32> &DeadInsts) {
+  LoadInst *LI = dyn_cast<LoadInst>(DI->getOperand(0));
+
+  if (!LI || !LI->hasOneUse())
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found a deinterleave intrinsic: " << *DI << "\n");
+
+  // Try and match this with target specific intrinsics.
+  if (!TLI->lowerDeinterleaveIntrinsicToLoad(DI, LI->getPointerOperand()))
+    return false;
+
+  // We now have a target-specific load, so delete the old one.
+  DeadInsts.push_back(DI);
+  DeadInsts.push_back(LI);
+  return true;
+}
+
+bool InterleavedAccess::lowerInterleaveIntrinsic(
+    IntrinsicInst *II, SmallVector<Instruction *, 32> &DeadInsts) {
+  if (!II->hasOneUse())
+    return false;
+
+  StoreInst *SI = dyn_cast<StoreInst>(*(II->users().begin()));
+
+  if (!SI)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "IA: Found an interleave intrinsic: " << *II << "\n");
+
+  // Try and match this with target specific intrinsics.
+  if (!TLI->lowerInterleaveIntrinsicToStore(II, SI->getPointerOperand()))
+    return false;
+
+  // We now have a target-specific store, so delete the old one.
+  DeadInsts.push_back(SI);
+  DeadInsts.push_back(II);
+  return true;
+}
+
 bool InterleavedAccess::runOnFunction(Function &F) {
   auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
   if (!TPC || !LowerInterleavedAccesses)
@@ -466,6 +513,13 @@
 
     if (auto *SI = dyn_cast<StoreInst>(&I))
       Changed |= lowerInterleavedStore(SI, DeadInsts);
+
+    if (auto *II = dyn_cast<IntrinsicInst>(&I)) {
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_deinterleave2)
+        Changed |= lowerDeinterleaveIntrinsic(II, DeadInsts);
+      if (II->getIntrinsicID() == Intrinsic::experimental_vector_interleave2)
+        Changed |= lowerInterleaveIntrinsic(II, DeadInsts);
+    }
   }
 
   for (auto *I : DeadInsts)
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -646,6 +646,12 @@
   bool lowerInterleavedStore(StoreInst *SI, ShuffleVectorInst *SVI,
                              unsigned Factor) const override;
 
+  bool lowerDeinterleaveIntrinsicToLoad(IntrinsicInst *DI,
+                                        Value *Address) const override;
+
+  bool lowerInterleaveIntrinsicToStore(IntrinsicInst *II,
+                                       Value *Address) const override;
+
   bool isLegalAddImmediate(int64_t) const override;
   bool isLegalICmpImmediate(int64_t) const override;
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -14668,6 +14668,71 @@
   return true;
 }
 
+bool AArch64TargetLowering::lowerDeinterleaveIntrinsicToLoad(
+    IntrinsicInst *DI, Value *Address) const {
+  // Only deinterleave2 supported at present
+  if (DI->getIntrinsicID() != Intrinsic::experimental_vector_deinterleave2)
+    return false;
+
+  // Fixed length vectors should use lowerInterleavedLoad above, as
+  // shufflevector is the canonical way to deinterleave fixed vectors.
+  VectorType *VTy = dyn_cast<VectorType>(DI->getType()->getContainedType(0));
+  if (!VTy || !VTy->isScalableTy() || !Subtarget->hasSVE())
+    return false;
+
+  // TODO: We currently cannot 'legalize' a wider scalable load in IR, since
+  // we cannot concatenate the results. So reject any vector types which don't
+  // have a minimum size of 128 bits.
+  if (VTy->getElementCount().getKnownMinValue() *
+          VTy->getElementType()->getScalarSizeInBits() !=
+      128)
+    return false;
+
+  IRBuilder<> Builder(DI);
+  Function *LdNFunc = Intrinsic::getDeclaration(
+      DI->getModule(), Intrinsic::aarch64_sve_ld2_sret, {VTy});
+  Value *Pred = Builder.CreateVectorSplat(
+      VTy->getElementCount(),
+      ConstantInt::getTrue(IntegerType::getInt1Ty(VTy->getContext())));
+  Value *LdN = Builder.CreateCall(LdNFunc, {Pred, Address}, "ldN");
+
+  DI->replaceAllUsesWith(LdN);
+
+  return true;
+}
+
+bool AArch64TargetLowering::lowerInterleaveIntrinsicToStore(
+    IntrinsicInst *II, Value *Address) const {
+  // Only interleave2 supported at present
+  if (II->getIntrinsicID() != Intrinsic::experimental_vector_interleave2)
+    return false;
+
+  // Fixed length vectors should use lowerInterleavedStore above, as
+  // shufflevector is the canonical way to interleave fixed vectors.
+  VectorType *VTy = dyn_cast<VectorType>(II->getOperand(0)->getType());
+  if (!VTy || !VTy->isScalableTy() || !Subtarget->hasSVE())
+    return false;
+
+  // TODO: We currently cannot 'legalize' a wider scalable store in IR, since
+  // we cannot split the incoming value. So reject any vector types which don't
+  // have a minimum size of 128 bits.
+  if (VTy->getElementCount().getKnownMinValue() *
+          VTy->getElementType()->getScalarSizeInBits() !=
+      128)
+    return false;
+
+  IRBuilder<> Builder(II);
+  Function *StNFunc = Intrinsic::getDeclaration(
+      II->getModule(), Intrinsic::aarch64_sve_st2, {VTy});
+  Value *Pred = Builder.CreateVectorSplat(
+      VTy->getElementCount(),
+      ConstantInt::getTrue(IntegerType::getInt1Ty(VTy->getContext())));
+  Builder.CreateCall(StNFunc,
+                     {II->getOperand(0), II->getOperand(1), Pred, Address});
+
+  return true;
+}
+
 EVT AArch64TargetLowering::getOptimalMemOpType(
     const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
Index: llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave-intrinsics.ll
===================================================================
--- /dev/null
+++ llvm/test/Transforms/InterleavedAccess/AArch64/sve-deinterleave-intrinsics.ll
@@ -0,0 +1,212 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt < %s -interleaved-access -S | FileCheck %s
+
+target triple = "aarch64-linux-gnu"
+
+define void @deinterleave_nxi8_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi8_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.aarch64.sve.ld2.sret.nxv16i8(<vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 32 x i8>, ptr %ptr, align 1
+  %deinterleave = tail call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> %load)
+  %l = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 0
+  %r = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxi16_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi16_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.aarch64.sve.ld2.sret.nxv8i16(<vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 16 x i16>, ptr %ptr, align 2
+  %deinterleave = tail call { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16> %load)
+  %l = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 0
+  %r = extractvalue { <vscale x 8 x i16>, <vscale x 8 x i16> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxi32_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.aarch64.sve.ld2.sret.nxv4i32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 8 x i32>, ptr %ptr, align 4
+  %deinterleave = tail call { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32> %load)
+  %l = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 0
+  %r = extractvalue { <vscale x 4 x i32>, <vscale x 4 x i32> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxi64_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxi64_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.aarch64.sve.ld2.sret.nxv2i64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 4 x i64>, ptr %ptr, align 8
+  %deinterleave = tail call { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64> %load)
+  %l = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 0
+  %r = extractvalue { <vscale x 2 x i64>, <vscale x 2 x i64> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxfloat_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxfloat_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.aarch64.sve.ld2.sret.nxv4f32(<vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 8 x float>, ptr %ptr, align 4
+  %deinterleave = tail call { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float> %load)
+  %l = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 0
+  %r = extractvalue { <vscale x 4 x float>, <vscale x 4 x float> } %deinterleave, 1
+  ret void
+}
+
+define void @deinterleave_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LDN:%.*]] = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } [[LDN]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 4 x double>, ptr %ptr, align 8
+  %deinterleave = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %load)
+  %l = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 0
+  %r = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %deinterleave, 1
+  ret void
+}
+
+define void @interleave_nxi8_factor2(ptr %ptr, <vscale x 16 x i8> %l, <vscale x 16 x i8> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi8_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 16 x i8> [[L:%.*]], <vscale x 16 x i8> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv16i8(<vscale x 16 x i8> [[L]], <vscale x 16 x i8> [[R]], <vscale x 16 x i1> shufflevector (<vscale x 16 x i1> insertelement (<vscale x 16 x i1> poison, i1 true, i64 0), <vscale x 16 x i1> poison, <vscale x 16 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8> %r)
+  store <vscale x 32 x i8> %interleave, ptr %ptr, align 1
+  ret void
+}
+
+define void @interleave_nxi16_factor2(ptr %ptr, <vscale x 8 x i16> %l, <vscale x 8 x i16> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi16_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 8 x i16> [[L:%.*]], <vscale x 8 x i16> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv8i16(<vscale x 8 x i16> [[L]], <vscale x 8 x i16> [[R]], <vscale x 8 x i1> shufflevector (<vscale x 8 x i1> insertelement (<vscale x 8 x i1> poison, i1 true, i64 0), <vscale x 8 x i1> poison, <vscale x 8 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16> %r)
+  store <vscale x 16 x i16> %interleave, ptr %ptr, align 2
+  ret void
+}
+
+define void @interleave_nxi32_factor2(ptr %ptr, <vscale x 4 x i32> %l, <vscale x 4 x i32> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi32_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x i32> [[L:%.*]], <vscale x 4 x i32> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv4i32(<vscale x 4 x i32> [[L]], <vscale x 4 x i32> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32> %r)
+  store <vscale x 8 x i32> %interleave, ptr %ptr, align 4
+  ret void
+}
+
+define void @interleave_nxi64_factor2(ptr %ptr, <vscale x 2 x i64> %l, <vscale x 2 x i64> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxi64_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 2 x i64> [[L:%.*]], <vscale x 2 x i64> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2i64(<vscale x 2 x i64> [[L]], <vscale x 2 x i64> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64> %r)
+  store <vscale x 4 x i64> %interleave, ptr %ptr, align 8
+  ret void
+}
+
+define void @interleave_nxfloat_factor2(ptr %ptr, <vscale x 4 x float> %l, <vscale x 4 x float> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxfloat_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x float> [[L:%.*]], <vscale x 4 x float> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv4f32(<vscale x 4 x float> [[L]], <vscale x 4 x float> [[R]], <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float> %l, <vscale x 4 x float> %r)
+  store <vscale x 8 x float> %interleave, ptr %ptr, align 4
+  ret void
+}
+
+define void @interleave_nxdouble_factor2(ptr %ptr, <vscale x 2 x double> %l, <vscale x 2 x double> %r) #0 {
+; CHECK-LABEL: define void @interleave_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 2 x double> [[L:%.*]], <vscale x 2 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    call void @llvm.aarch64.sve.st2.nxv2f64(<vscale x 2 x double> [[L]], <vscale x 2 x double> [[R]], <vscale x 2 x i1> shufflevector (<vscale x 2 x i1> insertelement (<vscale x 2 x i1> poison, i1 true, i64 0), <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer), ptr [[PTR]])
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double> %l, <vscale x 2 x double> %r)
+  store <vscale x 4 x double> %interleave, ptr %ptr, align 4
+  ret void
+}
+
+;;; Check that (for now) we don't transform when the type won't fit in registers
+;;; Fixing this will require some sort of vector concat or split intrinsic,
+;;; depending on whether you're deinterleaving or interleaving.
+
+define void @deinterleave_wide_nxdouble_factor2(ptr %ptr) #0 {
+; CHECK-LABEL: define void @deinterleave_wide_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[LOAD:%.*]] = load <vscale x 8 x double>, ptr [[PTR]], align 8
+; CHECK-NEXT:    [[DEINTERLEAVE:%.*]] = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> [[LOAD]])
+; CHECK-NEXT:    [[L:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[DEINTERLEAVE]], 0
+; CHECK-NEXT:    [[R:%.*]] = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } [[DEINTERLEAVE]], 1
+; CHECK-NEXT:    ret void
+;
+  %load = load <vscale x 8 x double>, ptr %ptr, align 8
+  %deinterleave = tail call { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double> %load)
+  %l = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 0
+  %r = extractvalue { <vscale x 4 x double>, <vscale x 4 x double> } %deinterleave, 1
+  ret void
+}
+
+define void @interleave_wide_nxdouble_factor2(ptr %ptr, <vscale x 4 x double> %l, <vscale x 4 x double> %r) #0 {
+; CHECK-LABEL: define void @interleave_wide_nxdouble_factor2
+; CHECK-SAME: (ptr [[PTR:%.*]], <vscale x 4 x double> [[L:%.*]], <vscale x 4 x double> [[R:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:    [[INTERLEAVE:%.*]] = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> [[L]], <vscale x 4 x double> [[R]])
+; CHECK-NEXT:    store <vscale x 8 x double> [[INTERLEAVE]], ptr [[PTR]], align 4
+; CHECK-NEXT:    ret void
+;
+  %interleave = tail call <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double> %l, <vscale x 4 x double> %r)
+  store <vscale x 8 x double> %interleave, ptr %ptr, align 4
+  ret void
+}
+
+declare { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.experimental.vector.deinterleave2.nxv32i8(<vscale x 32 x i8>)
+declare { <vscale x 8 x i16>, <vscale x 8 x i16> } @llvm.experimental.vector.deinterleave2.nxv16i16(<vscale x 16 x i16>)
+declare { <vscale x 4 x i32>, <vscale x 4 x i32> } @llvm.experimental.vector.deinterleave2.nxv8i32(<vscale x 8 x i32>)
+declare { <vscale x 2 x i64>, <vscale x 2 x i64> } @llvm.experimental.vector.deinterleave2.nxv4i64(<vscale x 4 x i64>)
+declare { <vscale x 4 x float>, <vscale x 4 x float> } @llvm.experimental.vector.deinterleave2.nxv8f32(<vscale x 8 x float>)
+declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)
+
+declare { <vscale x 4 x double>, <vscale x 4 x double> } @llvm.experimental.vector.deinterleave2.nxv8f64(<vscale x 8 x double>)
+
+declare <vscale x 32 x i8> @llvm.experimental.vector.interleave2.nxv32i8(<vscale x 16 x i8> %l, <vscale x 16 x i8>)
+declare <vscale x 16 x i16> @llvm.experimental.vector.interleave2.nxv16i16(<vscale x 8 x i16> %l, <vscale x 8 x i16>)
+declare <vscale x 8 x i32> @llvm.experimental.vector.interleave2.nxv8i32(<vscale x 4 x i32> %l, <vscale x 4 x i32>)
+declare <vscale x 4 x i64> @llvm.experimental.vector.interleave2.nxv4i64(<vscale x 2 x i64> %l, <vscale x 2 x i64>)
+declare <vscale x 8 x float> @llvm.experimental.vector.interleave2.nxv8f32(<vscale x 4 x float>, <vscale x 4 x float>)
+declare <vscale x 4 x double> @llvm.experimental.vector.interleave2.nxv4f64(<vscale x 2 x double>, <vscale x 2 x double>)
+
+declare <vscale x 8 x double> @llvm.experimental.vector.interleave2.nxv8f64(<vscale x 4 x double>, <vscale x 4 x double>)
+
+attributes #0 = { vscale_range(1,16) "target-features"="+sve" }
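
For reviewers, a minimal standalone sketch of the transformation this patch enables (illustrative only, not part of the patch; the function name @example and its body are hypothetical, though the nxv2f64 case mirrors the tests above). Running `opt -interleaved-access -S` on this module with an SVE-enabled AArch64 subtarget is expected to fold the wide load and the deinterleave into a single predicated ld2:

; Input: a contiguous wide load whose only use is vector.deinterleave2.
target triple = "aarch64-linux-gnu"

define <vscale x 2 x double> @example(ptr %ptr) #0 {
  %wide = load <vscale x 4 x double>, ptr %ptr, align 8
  %dv = tail call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double> %wide)
  %even = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %dv, 0
  ret <vscale x 2 x double> %even
}

declare { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.experimental.vector.deinterleave2.nxv4f64(<vscale x 4 x double>)

attributes #0 = { vscale_range(1,16) "target-features"="+sve" }

; Expected shape after the pass (the all-true predicate constant is elided here for brevity):
;   %ldN = call { <vscale x 2 x double>, <vscale x 2 x double> } @llvm.aarch64.sve.ld2.sret.nxv2f64(<vscale x 2 x i1> <all-true>, ptr %ptr)
;   %even = extractvalue { <vscale x 2 x double>, <vscale x 2 x double> } %ldN, 0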