Differential D101167
[AArch64][SVE] Convert svdup(vec, SV_VL1, elm) to insertelement(vec, elm, 0)
Authored by bsmith on Apr 23 2021, 7:37 AM.

Details

By converting the SVE intrinsic to a normal LLVM insertelement we give
the code generator a better chance to remove transitions between GPRs
and VPRs.

Depends on D101302

Co-authored-by: Paul Walker <paul.walker@arm.com>
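For illustration (not part of the original review), a minimal sketch of the conversion the patch title describes, assuming the `vl1`-predicated form of `svdup` where only lane 0 is active:

```llvm
; Before: a dup whose governing predicate is ptrue(SV_VL1) writes only lane 0.
; (1 is the AArch64SVEPredPattern::vl1 pattern value.)
%pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
%r  = call <vscale x 4 x i32> @llvm.aarch64.sve.dup.nxv4i32(<vscale x 4 x i32> %v, <vscale x 4 x i1> %pg, i32 %s)

; After: a plain insertelement at index 0, which the code generator can
; pattern-match without keeping the value in a general-purpose register.
%r = insertelement <vscale x 4 x i32> %v, i32 %s, i64 0
```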
Event Timeline

Hello! This looks good to me modulo a few nits.

Not necessarily for this commit, but I ended up implementing a similar optimisation myself as part of some other work. Something else we could do is recognise when a series of DUP calls is the same as a series of insertelement calls. For example:

```llvm
define <vscale x 16 x i8> @dup_insertelement_multi(<vscale x 16 x i8> %v, i8 %s) #0 {
  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
  %insert1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %s)
  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
  %insert2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %insert1, <vscale x 16 x i1> %pg2, i8 %s)
  %pg3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
  %insert3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %insert2, <vscale x 16 x i1> %pg3, i8 %s)
  ret <vscale x 16 x i8> %insert3
}
```

is the same as:

```llvm
define <vscale x 16 x i8> @dup_insertelement_multi(<vscale x 16 x i8> %v, i8 %s) #0 {
  %1 = insertelement <vscale x 16 x i8> %v, i8 %s, i64 2
  %2 = insertelement <vscale x 16 x i8> %1, i8 %s, i64 1
  %3 = insertelement <vscale x 16 x i8> %2, i8 %s, i64 0
  ret <vscale x 16 x i8> %3
}
```

Doing this might look like:

```cpp
// NOTE: not tested very extensively at all
auto *Cursor = I;
unsigned ExpectedPTruePattern = AArch64SVEPredPattern::vl1;
while (Cursor && Cursor->getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
       ExpectedPTruePattern <= AArch64SVEPredPattern::vl8) {
  Value *Dst = Cursor->getArgOperand(0);
  Value *Pg = Cursor->getArgOperand(1);
  Value *Splat = Cursor->getArgOperand(2);

  auto *PTrue = dyn_cast<IntrinsicInst>(Pg);
  if (!PTrue || PTrue->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
    break;

  const auto PTruePattern =
      cast<ConstantInt>(PTrue->getOperand(0))->getZExtValue();
  if (PTruePattern != ExpectedPTruePattern)
    break;

  LLVMContext &Ctx = Cursor->getContext();
  IRBuilder<> Builder(Ctx);
  Builder.SetInsertPoint(Cursor);
  auto *Insert =
      Builder.CreateInsertElement(Dst, Splat, ExpectedPTruePattern - 1);
  Cursor->replaceAllUsesWith(Insert);
  Cursor->eraseFromParent();

  if (PTrue->use_empty())
    PTrue->eraseFromParent();

  Cursor = dyn_cast<IntrinsicInst>(Dst);
  ExpectedPTruePattern++;
  Changed = true;
}
```

(Alternatively, we could keep the existing optimisation as-is, but add another function that looks for insertions into DUPs.)

We see these chained dups when we're moving data between NEON and SVE ACLE types[0]. I think doing this further optimisation might make sense in the context of this patch, but it's not a blocker from me.

[0]: https://developer.arm.com/documentation/ka004612/latest
@dmgreen, as per the comment in the summary, we need this to be done after the convert.{from,to}.svbool intrinsics have been optimised, which is done in SVEIntrinsicOpts, hence it needs to live there rather than in instcombine.

@joechrisellis I think that should probably be a separate patch, but yes, good idea :)

@dmgreen On second thought, I shall move the svbool convert optimisation into instcombine as well, and then move this there too.