diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -59,6 +59,10 @@
   bool coalescePTrueIntrinsicCalls(BasicBlock &BB,
                                    SmallSetVector<IntrinsicInst *, 4> &PTrues);
   bool optimizePTrueIntrinsicCalls(SmallSetVector<Function *, 4> &Functions);
+  bool optimizePredicateStore(Instruction *I);
+  bool optimizePredicateLoad(Instruction *I);
+
+  bool optimizeInstructions(SmallSetVector<Function *, 4> &Functions);
 
   /// Operates at the function-scope. I.e., optimizations are applied local to
   /// the functions themselves.
@@ -276,11 +280,167 @@
   return Changed;
 }
 
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable stores as late as possible
+bool SVEIntrinsicOpts::optimizePredicateStore(Instruction *I) {
+  auto *F = I->getFunction();
+  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+  if (!Attr.isValid())
+    return false;
+
+  unsigned MinVScale, MaxVScale;
+  std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+  // The transform needs to know the exact runtime length of scalable vectors
+  if (MinVScale != MaxVScale || MinVScale == 0)
+    return false;
+
+  auto *PredType =
+      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+  auto *FixedPredType =
+      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+  // If we have a store..
+  auto *Store = dyn_cast<StoreInst>(I);
+  if (!Store || !Store->isSimple())
+    return false;
+
+  // ..that is storing a predicate vector sized worth of bits..
+  if (Store->getOperand(0)->getType() != FixedPredType)
+    return false;
+
+  // ..where the value stored comes from a vector extract..
+  auto *IntrI = dyn_cast<IntrinsicInst>(Store->getOperand(0));
+  if (!IntrI ||
+      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_extract)
+    return false;
+
+  // ..that is extracting from index 0..
+  if (!cast<ConstantInt>(IntrI->getOperand(1))->isZero())
+    return false;
+
+  // ..where the value being extracted from comes from a bitcast..
+  auto *BitCast = dyn_cast<BitCastInst>(IntrI->getOperand(0));
+  if (!BitCast)
+    return false;
+
+  // ..and the bitcast is casting from predicate type
+  if (BitCast->getOperand(0)->getType() != PredType)
+    return false;
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+
+  auto *PtrBitCast = Builder.CreateBitCast(
+      Store->getPointerOperand(),
+      PredType->getPointerTo(Store->getPointerAddressSpace()));
+  auto *StorePred = Builder.CreateStore(BitCast->getOperand(0), PtrBitCast);
+
+  Store->replaceAllUsesWith(StorePred);
+  Store->eraseFromParent();
+  if (IntrI->getNumUses() == 0)
+    IntrI->eraseFromParent();
+  if (BitCast->getNumUses() == 0)
+    BitCast->eraseFromParent();
+
+  return true;
+}
+
+// This is done in SVEIntrinsicOpts rather than InstCombine so that we
+// introduce scalable loads as late as possible
+bool SVEIntrinsicOpts::optimizePredicateLoad(Instruction *I) {
+  auto *F = I->getFunction();
+  auto Attr = F->getFnAttribute(Attribute::VScaleRange);
+  if (!Attr.isValid())
+    return false;
+
+  unsigned MinVScale, MaxVScale;
+  std::tie(MinVScale, MaxVScale) = Attr.getVScaleRangeArgs();
+  // The transform needs to know the exact runtime length of scalable vectors
+  if (MinVScale != MaxVScale || MinVScale == 0)
+    return false;
+
+  auto *PredType =
+      ScalableVectorType::get(Type::getInt1Ty(I->getContext()), 16);
+  auto *FixedPredType =
+      FixedVectorType::get(Type::getInt8Ty(I->getContext()), MinVScale * 2);
+
+  // If we have a bitcast..
+  auto *BitCast = dyn_cast<BitCastInst>(I);
+  if (!BitCast || BitCast->getType() != PredType)
+    return false;
+
+  // ..whose operand is a vector_insert..
+  auto *IntrI = dyn_cast<IntrinsicInst>(BitCast->getOperand(0));
+  if (!IntrI ||
+      IntrI->getIntrinsicID() != Intrinsic::experimental_vector_insert)
+    return false;
+
+  // ..that is inserting into index zero of an undef vector..
+  if (!isa<UndefValue>(IntrI->getOperand(0)) ||
+      !cast<ConstantInt>(IntrI->getOperand(2))->isZero())
+    return false;
+
+  // ..where the value inserted comes from a load..
+  auto *Load = dyn_cast<LoadInst>(IntrI->getOperand(1));
+  if (!Load || !Load->isSimple())
+    return false;
+
+  // ..that is loading a predicate vector sized worth of bits..
+  if (Load->getType() != FixedPredType)
+    return false;
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(Load);
+
+  auto *PtrBitCast = Builder.CreateBitCast(
+      Load->getPointerOperand(),
+      PredType->getPointerTo(Load->getPointerAddressSpace()));
+  auto *LoadPred = Builder.CreateLoad(PredType, PtrBitCast);
+
+  BitCast->replaceAllUsesWith(LoadPred);
+  BitCast->eraseFromParent();
+  if (IntrI->getNumUses() == 0)
+    IntrI->eraseFromParent();
+  if (Load->getNumUses() == 0)
+    Load->eraseFromParent();
+
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeInstructions(
+    SmallSetVector<Function *, 4> &Functions) {
+  bool Changed = false;
+
+  for (auto *F : Functions) {
+    DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+    // Traverse the DT with an rpo walk so we see defs before uses, allowing
+    // simplification to be done incrementally.
+    BasicBlock *Root = DT->getRoot();
+    ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+    for (auto *BB : RPOT) {
+      for (Instruction &I : make_early_inc_range(*BB)) {
+        switch (I.getOpcode()) {
+        case Instruction::Store:
+          Changed |= optimizePredicateStore(&I);
+          break;
+        case Instruction::BitCast:
+          Changed |= optimizePredicateLoad(&I);
+          break;
+        }
+      }
+    }
+  }
+
+  return Changed;
+}
+
 bool SVEIntrinsicOpts::optimizeFunctions(
     SmallSetVector<Function *, 4> &Functions) {
   bool Changed = false;
 
   Changed |= optimizePTrueIntrinsicCalls(Functions);
+  Changed |= optimizeInstructions(Functions);
 
   return Changed;
 }
@@ -297,6 +457,8 @@
       continue;
 
     switch (F.getIntrinsicID()) {
+    case Intrinsic::experimental_vector_extract:
+    case Intrinsic::experimental_vector_insert:
     case Intrinsic::aarch64_sve_ptrue:
       for (User *U : F.users())
         Functions.insert(cast<Instruction>(U)->getFunction());
diff --git a/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-extract-vector-to-predicate-store.ll
@@ -0,0 +1,86 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @pred_store_v2i8(<vscale x 16 x i1> %pred, <2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <2 x i8> %extract, <2 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v4i8(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+define void @pred_store_v8i8(<vscale x 16 x i1> %pred, <8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret void
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <8 x i8> %extract, <8 x i8>* %addr, align 4
+  ret void
+}
+
+
+; Check that too small of a vscale prevents optimization
+define void @pred_store_neg1(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_store_neg1(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that too large of a vscale prevents optimization
+define void @pred_store_neg2(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_store_neg2(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that a non-zero index prevents optimization
+define void @pred_store_neg3(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_store_neg3(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 4)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+; Check that differing vscale min/max prevents optimization
+define void @pred_store_neg4(<vscale x 16 x i1> %pred, <4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_store_neg4(
+; CHECK: call <4 x i8> @llvm.experimental.vector.extract
+  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
+  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
+  store <4 x i8> %extract, <4 x i8>* %addr, align 4
+  ret void
+}
+
+declare <2 x i8> @llvm.experimental.vector.extract.v2i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8>, i64)
+declare <8 x i8> @llvm.experimental.vector.extract.v8i8.nxv2i8(<vscale x 2 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-insert-vector-to-predicate-load.ll
@@ -0,0 +1,114 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define <vscale x 16 x i1> @pred_load_v2i8(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_v2i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v4i8(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_v4i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+define <vscale x 16 x i1> @pred_load_v8i8(<8 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_v8i8(
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+  %load = load <8 x i8>, <8 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8> undef, <8 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Ensure the insertion point is at the load
+define <vscale x 16 x i1> @pred_load_insertion_point(<2 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_insertion_point(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i8>* %addr to <vscale x 16 x i1>*
+; CHECK-NEXT:    [[TMP2:%.*]] = load <vscale x 16 x i1>, <vscale x 16 x i1>* [[TMP1]]
+; CHECK-NEXT:    br label %bb1
+; CHECK:       bb1:
+; CHECK-NEXT:    ret <vscale x 16 x i1> [[TMP2]]
+entry:
+  %load = load <2 x i8>, <2 x i8>* %addr, align 4
+  br label %bb1
+
+bb1:
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8> undef, <2 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too small of a vscale prevents optimization
+define <vscale x 16 x i1> @pred_load_neg1(<4 x i8>* %addr) #0 {
+; CHECK-LABEL: @pred_load_neg1(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that too large of a vscale prevents optimization
+define <vscale x 16 x i1> @pred_load_neg2(<4 x i8>* %addr) #2 {
+; CHECK-LABEL: @pred_load_neg2(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that a non-zero index prevents optimization
+define <vscale x 16 x i1> @pred_load_neg3(<4 x i8>* %addr) #1 {
+; CHECK-LABEL: @pred_load_neg3(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 4)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that differing vscale min/max prevents optimization
+define <vscale x 16 x i1> @pred_load_neg4(<4 x i8>* %addr) #3 {
+; CHECK-LABEL: @pred_load_neg4(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> undef, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+; Check that insertion into a non-undef vector prevents optimization
+define <vscale x 16 x i1> @pred_load_neg5(<4 x i8>* %addr, <vscale x 2 x i8> %passthru) #1 {
+; CHECK-LABEL: @pred_load_neg5(
+; CHECK: call <vscale x 2 x i8> @llvm.experimental.vector.insert
+  %load = load <4 x i8>, <4 x i8>* %addr, align 4
+  %insert = tail call <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8> %passthru, <4 x i8> %load, i64 0)
+  %ret = bitcast <vscale x 2 x i8> %insert to <vscale x 16 x i1>
+  ret <vscale x 16 x i1> %ret
+}
+
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v2i8(<vscale x 2 x i8>, <2 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v4i8(<vscale x 2 x i8>, <4 x i8>, i64)
+declare <vscale x 2 x i8> @llvm.experimental.vector.insert.nxv2i8.v8i8(<vscale x 2 x i8>, <8 x i8>, i64)
+
+attributes #0 = { "target-features"="+sve" vscale_range(1,1) }
+attributes #1 = { "target-features"="+sve" vscale_range(2,2) }
+attributes #2 = { "target-features"="+sve" vscale_range(4,4) }
+attributes #3 = { "target-features"="+sve" vscale_range(2,4) }
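
For reference, a minimal before/after sketch of what the store transform does, taken from the pred_store_v4i8 test above and assuming vscale_range(2,2): with vscale fixed at 2, <vscale x 16 x i1> occupies 16 x 2 = 32 bits, i.e. MinVScale * 2 = 4 bytes, which is why the pass matches a <4 x i8>-sized store of the extracted value.

  ; Before (IR from the pred_store_v4i8 test, vscale_range(2,2)):
  %bitcast = bitcast <vscale x 16 x i1> %pred to <vscale x 2 x i8>
  %extract = tail call <4 x i8> @llvm.experimental.vector.extract.v4i8.nxv2i8(<vscale x 2 x i8> %bitcast, i64 0)
  store <4 x i8> %extract, <4 x i8>* %addr, align 4

  ; After optimizePredicateStore the chain collapses to a direct predicate store
  ; through a bitcast pointer (as checked by the CHECK lines above):
  %1 = bitcast <4 x i8>* %addr to <vscale x 16 x i1>*
  store <vscale x 16 x i1> %pred, <vscale x 16 x i1>* %1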