diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -100,6 +100,9 @@
   InstructionCost getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
                                         TTI::TargetCostKind CostKind);
 
+  Optional<Instruction *> instCombineIntrinsic(InstCombiner &IC,
+                                               IntrinsicInst &II) const;
+
   TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
     switch (K) {
     case TargetTransformInfo::RGK_Scalar:
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -18,6 +18,7 @@
 #include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/Support/Debug.h"
+#include "llvm/Transforms/InstCombine/InstCombiner.h"
 #include <algorithm>
 using namespace llvm;
 using namespace llvm::PatternMatch;
@@ -279,6 +280,102 @@
   return BaseT::getIntrinsicInstrCost(ICA, CostKind);
 }
 
+static Optional<Instruction *> instCombineSVELast(InstCombiner &IC,
+                                                  IntrinsicInst &II) {
+  Value *Pg = II.getArgOperand(0);
+  Value *Vec = II.getArgOperand(1);
+  bool IsAfter = II.getIntrinsicID() == Intrinsic::aarch64_sve_lasta;
+
+  auto *C = dyn_cast<Constant>(Pg);
+  if (IsAfter && C && C->isNullValue()) {
+    // The intrinsic is extracting lane 0 so use an extract instead.
+    auto *IdxTy = Type::getInt64Ty(II.getContext());
+    auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+    Extract->insertBefore(&II);
+    Extract->takeName(&II);
+    return IC.replaceInstUsesWith(II, Extract);
+  }
+
+  auto *IntrPG = dyn_cast<IntrinsicInst>(Pg);
+  if (!IntrPG)
+    return None;
+
+  if (IntrPG->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return None;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(IntrPG->getOperand(0))->getZExtValue();
+
+  // Can the intrinsic's predicate be converted to a known constant index?
+  unsigned Idx;
+  switch (PTruePattern) {
+  default:
+    return None;
+  case AArch64SVEPredPattern::vl1:
+    Idx = 0;
+    break;
+  case AArch64SVEPredPattern::vl2:
+    Idx = 1;
+    break;
+  case AArch64SVEPredPattern::vl3:
+    Idx = 2;
+    break;
+  case AArch64SVEPredPattern::vl4:
+    Idx = 3;
+    break;
+  case AArch64SVEPredPattern::vl5:
+    Idx = 4;
+    break;
+  case AArch64SVEPredPattern::vl6:
+    Idx = 5;
+    break;
+  case AArch64SVEPredPattern::vl7:
+    Idx = 6;
+    break;
+  case AArch64SVEPredPattern::vl8:
+    Idx = 7;
+    break;
+  case AArch64SVEPredPattern::vl16:
+    Idx = 15;
+    break;
+  }
+
+  // Increment the index if extracting the element after the last active
+  // predicate element.
+  if (IsAfter)
+    ++Idx;
+
+  // Ignore extracts whose index is larger than the known minimum vector
+  // length. NOTE: This is an artificial constraint where we prefer to
+  // maintain what the user asked for until an alternative is proven faster.
+  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+  if (Idx >= PgVTy->getMinNumElements())
+    return None;
+
+  // The intrinsic is extracting a fixed lane so use an extract instead.
+  auto *IdxTy = Type::getInt64Ty(II.getContext());
+  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, Idx));
+  Extract->insertBefore(&II);
+  Extract->takeName(&II);
+  return IC.replaceInstUsesWith(II, Extract);
+}
+
+Optional<Instruction *>
+AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC,
+                                     IntrinsicInst &II) const {
+  Intrinsic::ID IID = II.getIntrinsicID();
+  switch (IID) {
+  default:
+    break;
+  case Intrinsic::aarch64_sve_lasta:
+  case Intrinsic::aarch64_sve_lastb:
+    return instCombineSVELast(IC, II);
+  }
+
+  return None;
+}
+
 bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
                                            ArrayRef<const Value *> Args) {
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
new file
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-lasta-lastb.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -instcombine -S < %s | FileCheck --check-prefix OPT %s
+
+target triple = "aarch64"
+
+; Most of the testing is covered by the lastb cases, but here we ensure that
+; lasta with a predicate having no active lanes is treated as an alias to
+; extracting the first vector element.
+define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_0(
+; OPT-NEXT:    [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT:    ret i8 [[E0]]
+;
+  %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
+  ret i8 %e0
+}
+
+; Most of the testing is covered by the lastb cases, but here we check that
+; the resulting extraction index is one more than in the lastb case, because
+; lasta extracts the element after the last active one.
+define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_8(
+; OPT-NEXT:    [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
+; OPT-NEXT:    ret i8 [[E1]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e1
+}
+
+define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_0(
+; OPT-NEXT:    [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT:    ret i8 [[E0]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e0
+}
+
+define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_1(
+; OPT-NEXT:    [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
+; OPT-NEXT:    ret i8 [[E1]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e1
+}
+
+define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_2(
+; OPT-NEXT:    [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
+; OPT-NEXT:    ret i8 [[E2]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e2
+}
+
+define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_3(
+; OPT-NEXT:    [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
+; OPT-NEXT:    ret i8 [[E3]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e3
+}
+
+define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_4(
+; OPT-NEXT:    [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
+; OPT-NEXT:    ret i8 [[E4]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+  %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e4
+}
+
+define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_5(
+; OPT-NEXT:    [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
+; OPT-NEXT:    ret i8 [[E5]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+  %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e5
+}
+
+define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_6(
+; OPT-NEXT:    [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
+; OPT-NEXT:    ret i8 [[E6]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+  %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e6
+}
+
+define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_7(
+; OPT-NEXT:    [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
+; OPT-NEXT:    ret i8 [[E7]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e7
+}
+
+define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_15(
+; OPT-NEXT:    [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
+; OPT-NEXT:    ret i8 [[E15]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+  %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e15
+}
+
+; No transformation because the requested element is beyond the range of the
+; known minimum element count, so we maintain the user's intentions.
+define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_31(
+; OPT-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+; OPT-NEXT:    [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT:    ret i8 [[E31]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+  %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e31
+}
+
+; No transformation because the ptrue's predicate pattern is bogus and thus
+; nothing can be inferred about the result.
+define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
+; OPT-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+; OPT-NEXT:    [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT:    ret i8 [[E]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+  %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+attributes #0 = { "target-features"="+sve" }
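Note for context (not part of the patch): the hook implemented above is not invoked directly. InstCombine hands unrecognised target intrinsics to TargetTransformInfo, which forwards to this AArch64 override; a None result means "no target-specific fold" and generic handling continues. The sketch below shows that contract from a caller's point of view. It is a minimal illustration, assuming the upstream Optional<Instruction *> interface shown in the diff; the helper name trySimplifyTargetIntrinsic is hypothetical.

#include "llvm/ADT/Optional.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/InstCombine/InstCombiner.h"
using namespace llvm;

// Hypothetical caller: give the target first refusal on simplifying an
// intrinsic call. A populated Optional means the target rewrote or replaced
// II (for this patch: lasta/lastb with a known ptrue predicate folded to an
// extractelement); None means the generic InstCombine rules take over.
static bool trySimplifyTargetIntrinsic(InstCombiner &IC,
                                       const TargetTransformInfo &TTI,
                                       IntrinsicInst &II) {
  return TTI.instCombineIntrinsic(IC, II).hasValue();
}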