diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -76,6 +76,8 @@
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
 
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+  static bool optimizeLasta(IntrinsicInst *I);
+  static bool optimizeLastaOrLastb(IntrinsicInst *I, bool IsAfter);
   static bool optimizePTest(IntrinsicInst *I);
   static bool optimizeVectorMul(IntrinsicInst *I);
   static bool optimizeTBL(IntrinsicInst *I);
@@ -528,6 +530,103 @@
   return true;
 }
 
+bool SVEIntrinsicOpts::optimizeLasta(IntrinsicInst *I) {
+  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_lasta);
+
+  Value *Pg = I->getArgOperand(0);
+  Value *Vec = I->getArgOperand(1);
+
+  auto *C = dyn_cast<Constant>(Pg);
+  if (!C || !C->isNullValue())
+    return false;
+
+  // The intrinsic is extracting lane 0 so use an extract instead.
+  auto *IdxTy = Type::getInt64Ty(I->getContext());
+  auto *Extract = ExtractElementInst::Create(Vec, ConstantInt::get(IdxTy, 0));
+  Extract->insertBefore(I);
+  Extract->takeName(I);
+  I->replaceAllUsesWith(Extract);
+  I->eraseFromParent();
+  return true;
+}
+
+bool SVEIntrinsicOpts::optimizeLastaOrLastb(IntrinsicInst *I, bool IsAfter) {
+  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_lasta ||
+         I->getIntrinsicID() == Intrinsic::aarch64_sve_lastb);
+
+  auto *Pg = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+  if (!Pg)
+    return false;
+
+  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return false;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+
+  // Can the intrinsic's predicate be converted to a known constant index?
+  unsigned Idx;
+  switch (PTruePattern) {
+  default:
+    return false;
+  case AArch64SVEPredPattern::vl1:
+    Idx = 0;
+    break;
+  case AArch64SVEPredPattern::vl2:
+    Idx = 1;
+    break;
+  case AArch64SVEPredPattern::vl3:
+    Idx = 2;
+    break;
+  case AArch64SVEPredPattern::vl4:
+    Idx = 3;
+    break;
+  case AArch64SVEPredPattern::vl5:
+    Idx = 4;
+    break;
+  case AArch64SVEPredPattern::vl6:
+    Idx = 5;
+    break;
+  case AArch64SVEPredPattern::vl7:
+    Idx = 6;
+    break;
+  case AArch64SVEPredPattern::vl8:
+    Idx = 7;
+    break;
+  case AArch64SVEPredPattern::vl16:
+    Idx = 15;
+    break;
+  }
+
+  // Increment the index if extracting the element after the last active
+  // predicate element.
+  if (IsAfter)
+    ++Idx;
+
+  // Ignore extracts whose index is equal to or larger than the known minimum
+  // vector length.
+  // NOTE: This is an artificial constraint where we prefer to maintain what
+  // the user asked for until an alternative is proven faster.
+  auto *PgVTy = cast<ScalableVectorType>(Pg->getType());
+  if (Idx >= PgVTy->getMinNumElements())
+    return false;
+
+  // The intrinsic is extracting a fixed lane so use an extract instead.
+  auto *IdxTy = Type::getInt64Ty(I->getContext());
+  auto *Extract = ExtractElementInst::Create(I->getArgOperand(1),
+                                             ConstantInt::get(IdxTy, Idx));
+  Extract->insertBefore(I);
+  Extract->takeName(I);
+  I->replaceAllUsesWith(Extract);
+  I->eraseFromParent();
+
+  // Remove the predicate if it is now unused.
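+  // The ptrue has no side effects, so if the erased last[ab] call was its
+  // only user it can be deleted immediately rather than left for a later
+  // DCE pass.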
+  if (Pg->use_empty())
+    Pg->eraseFromParent();
+
+  return true;
+}
+
 bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
   IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
   if (!IntrI)
@@ -536,6 +635,12 @@
   switch (IntrI->getIntrinsicID()) {
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_lasta:
+    if (optimizeLasta(IntrI))
+      return true;
+    return optimizeLastaOrLastb(IntrI, /*IsAfter=*/true);
+  case Intrinsic::aarch64_sve_lastb:
+    return optimizeLastaOrLastb(IntrI, /*IsAfter=*/false);
   case Intrinsic::aarch64_sve_fmul:
   case Intrinsic::aarch64_sve_mul:
     return optimizeVectorMul(IntrI);
@@ -592,6 +697,8 @@
 
     switch (F.getIntrinsicID()) {
     case Intrinsic::aarch64_sve_convert_from_svbool:
+    case Intrinsic::aarch64_sve_lasta:
+    case Intrinsic::aarch64_sve_lastb:
    case Intrinsic::aarch64_sve_ptest_any:
     case Intrinsic::aarch64_sve_ptest_first:
     case Intrinsic::aarch64_sve_ptest_last:
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-lasta-lastb.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-lasta-lastb.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-lasta-lastb.ll
@@ -0,0 +1,151 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt -S -aarch64-sve-intrinsic-opts < %s | FileCheck --check-prefix OPT %s
+
+target triple = "aarch64"
+
+; Most of the testing is covered by the lastb cases, but here we ensure that
+; lasta with a predicate that has no active lanes is treated as an alias of
+; extracting the first vector element.
+define i8 @lasta_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_0(
+; OPT-NEXT:    [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT:    ret i8 [[E0]]
+;
+  %e0 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> zeroinitializer, <vscale x 16 x i8> %v)
+  ret i8 %e0
+}
+
+; Most of the testing is covered by the lastb cases, but here we check that
+; the resulting extraction index is one more than in the equivalent lastb
+; case, because lasta extracts the element after the last active one.
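+; (ptrue vl8 makes lanes 0-7 active, so lastb would read lane 7 whereas
+; lasta reads lane 8.)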
+define i8 @lasta_extractelement_8(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lasta_extractelement_8(
+; OPT-NEXT:    [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 8
+; OPT-NEXT:    ret i8 [[E1]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %e1 = tail call i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e1
+}
+
+define i8 @lastb_extractelement_0(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_0(
+; OPT-NEXT:    [[E0:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 0
+; OPT-NEXT:    ret i8 [[E0]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %e0 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e0
+}
+
+define i8 @lastb_extractelement_1(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_1(
+; OPT-NEXT:    [[E1:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 1
+; OPT-NEXT:    ret i8 [[E1]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %e1 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e1
+}
+
+define i8 @lastb_extractelement_2(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_2(
+; OPT-NEXT:    [[E2:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 2
+; OPT-NEXT:    ret i8 [[E2]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %e2 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e2
+}
+
+define i8 @lastb_extractelement_3(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_3(
+; OPT-NEXT:    [[E3:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 3
+; OPT-NEXT:    ret i8 [[E3]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %e3 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e3
+}
+
+define i8 @lastb_extractelement_4(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_4(
+; OPT-NEXT:    [[E4:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 4
+; OPT-NEXT:    ret i8 [[E4]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+  %e4 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e4
+}
+
+define i8 @lastb_extractelement_5(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_5(
+; OPT-NEXT:    [[E5:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 5
+; OPT-NEXT:    ret i8 [[E5]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+  %e5 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e5
+}
+
+define i8 @lastb_extractelement_6(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_6(
+; OPT-NEXT:    [[E6:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 6
+; OPT-NEXT:    ret i8 [[E6]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+  %e6 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e6
+}
+
+define i8 @lastb_extractelement_7(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_7(
+; OPT-NEXT:    [[E7:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 7
+; OPT-NEXT:    ret i8 [[E7]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %e7 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e7
+}
+
+define i8 @lastb_extractelement_15(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_15(
+; OPT-NEXT:    [[E15:%.*]] = extractelement <vscale x 16 x i8> [[V:%.*]], i64 15
+; OPT-NEXT:    ret i8 [[E15]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+  %e15 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e15
+}
+
+; No transformation because the requested element is beyond the range of the
+; known minimum element count, so we maintain what the user asked for.
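+; (ptrue vl32 would select lane 31, which lies beyond the 16 lanes that a
+; <vscale x 16 x i1> predicate is guaranteed to have at minimum.)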
+define i8 @lastb_extractelement_31(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_31(
+; OPT-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+; OPT-NEXT:    [[E31:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT:    ret i8 [[E31]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 10)
+  %e31 = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e31
+}
+
+; No transformation because the ptrue's predicate pattern is invalid, so
+; nothing can be inferred about the result.
+define i8 @lastb_extractelement_invalid_predicate_pattern(<vscale x 16 x i8> %v) #0 {
+; OPT-LABEL: @lastb_extractelement_invalid_predicate_pattern(
+; OPT-NEXT:    [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+; OPT-NEXT:    [[E:%.*]] = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> [[PG]], <vscale x 16 x i8> [[V:%.*]])
+; OPT-NEXT:    ret i8 [[E]]
+;
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 15)
+  %e = tail call i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1> %pg, <vscale x 16 x i8> %v)
+  ret i8 %e
+}
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare i8 @llvm.aarch64.sve.lasta.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+declare i8 @llvm.aarch64.sve.lastb.nxv16i8(<vscale x 16 x i1>, <vscale x 16 x i8>)
+
+attributes #0 = { "target-features"="+sve" }
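For reference, this is the kind of source that gives rise to the folded patterns, written against the ACLE SVE intrinsics from arm_sve.h. This is an illustrative sketch only, not part of the patch; the function names are invented here, and it assumes a compiler with SVE support (e.g. clang -O2 -march=armv8-a+sve):

#include <arm_sve.h>

/* lastb with a ptrue vl8 predicate: the last active lane is 7, so with this
   patch the call folds to a plain extract of lane 7. */
int8_t last_of_first_eight(svint8_t v) {
  return svlastb_s8(svptrue_pat_b8(SV_VL8), v);
}

/* lasta extracts the lane after the last active one, i.e. lane 8 here. */
int8_t after_first_eight(svint8_t v) {
  return svlasta_s8(svptrue_pat_b8(SV_VL8), v);
}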