diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -59,6 +59,8 @@
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorExtract(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorInsert(SmallSetVector<Function *, 4> &Functions);
 
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
@@ -276,11 +278,173 @@
   return Changed;
 }
 
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable stores as late as possible
+bool SVEIntrinsicOpts::optimizePredicateVectorExtract(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+    if (!Attr.isValid())
+      continue;
+
+    unsigned MinSVEVectorSize, MaxSVEVectorSize;
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) = Attr.getVScaleRangeArgs();
+    if (MinSVEVectorSize != MaxSVEVectorSize && MaxSVEVectorSize != 0)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<StoreInst *, 4> InterestingStores;
+
+      for (Instruction &I : BB) {
+        // If we have a store..
+        auto *Store = dyn_cast<StoreInst>(&I);
+        if (!Store)
+          continue;
+
+        // ..that is storing a predicate vector sized worth of bits..
+        auto *FixedStoreType =
+            dyn_cast<FixedVectorType>(Store->getOperand(0)->getType());
+        if (!FixedStoreType ||
+            FixedStoreType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        // ..where the value stored comes from a vector extract with one use..
+        auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract ||
+            !IntrI->hasOneUse())
+          continue;
+
+        // ..that is extracting from index 0..
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(1))->getZExtValue();
+        if (Idx != 0)
+          continue;
+
+        // ..where the value being extracted from comes from a bitcast
+        auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
+        if (!BitCast)
+          continue;
+
+        // ..and the bitcast is casting from predicate type
+        if (BitCast->getOperand(0)->getType() != PredType)
+          continue;
+
+        // ..convert it into a direct predicate store.
+        InterestingStores.insert(Store);
+      }
+
+      for (auto *Store : InterestingStores) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, Store->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(Store->getOperand(0));
+        auto *BitCast = cast<BitCastInst>(IntrI->getOperand(0));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Store->getPointerOperand(),
+            PredType->getPointerTo(Store->getPointerAddressSpace()));
+        auto *StorePred =
+            Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
+
+        Store->replaceAllUsesWith(StorePred);
+        Store->eraseFromParent();
+        IntrI->eraseFromParent();
+        if (BitCast->use_empty())
+          BitCast->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we introduce
+// scalable loads as late as possible
+bool SVEIntrinsicOpts::optimizePredicateVectorInsert(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+    if (!Attr.isValid())
+      continue;
+
+    unsigned MinSVEVectorSize, MaxSVEVectorSize;
+    std::tie(MinSVEVectorSize, MaxSVEVectorSize) = Attr.getVScaleRangeArgs();
+    if (MinSVEVectorSize != MaxSVEVectorSize && MaxSVEVectorSize != 0)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<BitCastInst *, 4> InterestingBitCasts;
+
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        // If we have a bitcast..
+        auto *BitCast = dyn_cast<BitCastInst>(&I);
+        if (!BitCast || BitCast->getType() != PredType)
+          continue;
+
+        // ..whose operand is a vector_insert with only one use..
+        auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert ||
+            !IntrI->hasOneUse())
+          continue;
+
+        // ..that is inserting into index zero of an undef vector..
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(2))->getZExtValue();
+        if (!isa<UndefValue>(IntrI->getOperand(0)) || Idx != 0)
+          continue;
+
+        // ..where the value inserted comes from a load with only one use..
+        auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+        if (!Load || !Load->hasOneUse())
+          continue;
+
+        // ..that is loading a predicate vector sized worth of bits..
+        auto *FixedLoadType = dyn_cast<FixedVectorType>(Load->getType());
+        if (!FixedLoadType ||
+            FixedLoadType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        // ..convert it into a direct predicate load.
+        InterestingBitCasts.insert(BitCast);
+      }
+
+      for (auto *BitCast : InterestingBitCasts) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, BitCast->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(BitCast->getOperand(0));
+        auto *Load = cast<LoadInst>(IntrI->getOperand(1));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Load->getPointerOperand(),
+            PredType->getPointerTo(Load->getPointerAddressSpace()));
+        auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+        BitCast->replaceAllUsesWith(LoadPred);
+        BitCast->eraseFromParent();
+        IntrI->eraseFromParent();
+        Load->eraseFromParent();
+        Changed = true;
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizePredicateVectorExtract(Functions);
+  Changed |= optimizePredicateVectorInsert(Functions);
 
   return Changed;
 }
@@ -297,6 +461,8 @@
       continue;
 
     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_extract:
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @pred_store_v2i8(<vscale x 16 x i1> %pred, <2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <2 x i8> %extract, <2 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v4i8(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v8i8(<vscale x 16 x i1> %pred, <8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <8 x i8> %extract, <8 x i8>* %addr, align 4
+  ret void
+}
+
+
+; Check that too small of a vscale prevents optimization
+define void @pred_store_neg1(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_neg1(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that too large of a vscale prevents optimization
+define void @pred_store_neg2(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_neg2(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that a non-zero index prevents optimization
+define void @pred_store_neg3(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_neg3(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 4)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that differing vscale min/max prevents optimization
+define void @pred_store_neg4(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_store_neg4(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,0) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
@@ -0,0 +1,96 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+
+define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <8 x i8>, <8 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too small of a vscale prevents optimization
+define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_neg1(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too large of a vscale prevents optimization
+define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_neg2(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a non-zero index prevents optimization
+define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_neg3(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that differing vscale min/max prevents optimization
+define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_load_neg4(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that insertion into a non-undef vector prevents optimization
+define <vscale x 16 x i1> @pred_load_neg5(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
+; CHECK-LABEL: @pred_load_neg5(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,0) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }