diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -22,9 +22,11 @@
 //===----------------------------------------------------------------------===//
 
 #include "AArch64.h"
+#include "AArch64Subtarget.h"
 #include "Utils/AArch64BaseInfo.h"
 #include "llvm/ADT/PostOrderIterator.h"
 #include "llvm/ADT/SetVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/IRBuilder.h"
@@ -59,10 +61,13 @@
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateVectorInsert(SmallSetVector<Function *, 4> &Functions);
 
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+
+  const TargetMachine *TM;
 };
 } // end anonymous namespace
 
@@ -276,11 +281,84 @@
   return Changed;
 }
 
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable loads as late as possible.
+bool SVEIntrinsicOpts::optimizePredicateVectorInsert(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    auto &ST = TM->getSubtarget<AArch64Subtarget>(*F);
+    if (!ST.hasSVE())
+      continue;
+    unsigned MinSVEVectorSize = ST.getMinSVEVectorSizeInBits() / 128;
+    if (MinSVEVectorSize == 0)
+      continue;
+
+    auto *PredType =
+        ScalableVectorType::get(Type::getInt1Ty(F->getContext()), 16);
+
+    for (auto &BB : *F) {
+      SmallSetVector<BitCastInst *, 4> InterestingBitCasts;
+
+      for (Instruction &I : BB) {
+        if (I.use_empty())
+          continue;
+
+        auto *BitCast = dyn_cast<BitCastInst>(&I);
+        if (!BitCast || BitCast->getType() != PredType)
+          continue;
+
+        auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+        if (!IntrI ||
+            IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert ||
+            !IntrI->hasOneUse())
+          continue;
+
+        auto Idx = cast<ConstantInt>(IntrI->getOperand(2))->getZExtValue();
+        if (!isa<UndefValue>(IntrI->getOperand(0)) || Idx != 0)
+          continue;
+
+        auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+        if (!Load || !Load->hasOneUse())
+          continue;
+
+        auto *FixedLoadType = dyn_cast<FixedVectorType>(Load->getType());
+        if (!FixedLoadType ||
+            FixedLoadType->getPrimitiveSizeInBits() != 16 * MinSVEVectorSize)
+          continue;
+
+        InterestingBitCasts.insert(BitCast);
+      }
+
+      for (auto *BitCast : InterestingBitCasts) {
+        IRBuilder<> Builder(F->getContext());
+        Builder.SetInsertPoint(&BB, BitCast->getIterator());
+
+        auto *IntrI = cast<IntrinsicInst>(BitCast->getOperand(0));
+        auto *Load = cast<LoadInst>(IntrI->getOperand(1));
+        auto *PtrBitCast = Builder.CreateBitCast(
+            Load->getPointerOperand(),
+            PredType->getPointerTo(Load->getPointerAddressSpace()));
+        auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+        BitCast->replaceAllUsesWith(LoadPred);
+        BitCast->eraseFromParent();
+        IntrI->eraseFromParent();
+        Load->eraseFromParent();
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizePredicateVectorInsert(Functions);
 
   return Changed;
 }
@@ -289,6 +367,10 @@
   bool Changed = false;
   SmallSetVector<Function *, 4> Functions;
 
+  auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+  assert(TPC && "Expected a TargetPassConfig");
+  TM = &TPC->getTM<TargetMachine>();
+
   // Check for SVE intrinsic declarations first so that we only iterate over
   // relevant functions. Where an appropriate declaration is found, store the
   // function(s) where it is used so we can target these only.
@@ -297,6 +379,7 @@
       continue;
 
     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
@@ -0,0 +1,85 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+
+define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <8 x i8>, <8 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a too-small vscale prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_neg1(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a too-large vscale prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_neg2(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a non-zero index prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_neg3(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that insertion into a non-undef vector prevents the optimization
+define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
+; CHECK-LABEL: @pred_load_neg4(
+; CHECK: llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,8) }
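
For reference, a minimal before/after sketch of the rewrite performed by optimizePredicateVectorInsert, based on the vscale_range(2,2) case exercised by @pred_load_v4i8 above (value names are illustrative, not actual pass output):

Before:

  %load = load <4 x i8>, <4 x i8>* %addr, align 4
  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
  ret <vscale x 16 x i1> %ret

After:

  %1 = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
  %2 = load <vscale x 16 x i1>, <vscale x 16 x i1>* %1
  ret <vscale x 16 x i1> %2

The fold only fires when the minimum SVE vector length is known from vscale_range and the fixed-width load covers exactly one predicate register (16 * minimum vscale bits, i.e. 32 bits here), which is what the getMinSVEVectorSizeInBits check in the pass enforces.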