diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
--- a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -76,6 +76,7 @@
   bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
 
   static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+  static bool optimizeDup(IntrinsicInst *I);
   static bool optimizePTest(IntrinsicInst *I);
   static bool optimizeVectorMul(IntrinsicInst *I);
   static bool optimizeTBL(IntrinsicInst *I);
@@ -528,6 +529,37 @@
   return true;
 }
 
+bool SVEIntrinsicOpts::optimizeDup(IntrinsicInst *I) {
+  assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_dup);
+
+  IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+  if (!Pg)
+    return false;
+
+  if (Pg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+    return false;
+
+  const auto PTruePattern =
+      cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
+  if (PTruePattern != AArch64SVEPredPattern::vl1)
+    return false;
+
+  // The intrinsic is inserting into lane zero so use an insert instead.
+  Type *IdxTy = Type::getInt64Ty(I->getContext());
+  auto Insert = InsertElementInst::Create(
+      I->getArgOperand(0), I->getArgOperand(2), ConstantInt::get(IdxTy, 0));
+  Insert->insertBefore(I);
+  Insert->takeName(I);
+  I->replaceAllUsesWith(Insert);
+  I->eraseFromParent();
+
+  // Remove unused predicate.
+  if (Pg->use_empty())
+    Pg->eraseFromParent();
+
+  return true;
+}
+
 bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
   IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
   if (!IntrI)
@@ -536,6 +568,8 @@
   switch (IntrI->getIntrinsicID()) {
   case Intrinsic::aarch64_sve_convert_from_svbool:
     return optimizeConvertFromSVBool(IntrI);
+  case Intrinsic::aarch64_sve_dup:
+    return optimizeDup(IntrI);
   case Intrinsic::aarch64_sve_fmul:
   case Intrinsic::aarch64_sve_mul:
     return optimizeVectorMul(IntrI);
diff --git a/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-dup.ll
new file mode
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-intrinsic-opts-dup.ll
@@ -0,0 +1,50 @@
+; RUN: opt -S -aarch64-sve-intrinsic-opts -mtriple=aarch64-linux-gnu < %s | FileCheck %s
+
+define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_0(
+; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_1(
+; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
+; CHECK-LABEL: @dup_insertelement_x(
+; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+  %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
+  ret <vscale x 16 x i8> %insert
+}
+
+define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
+; CHECK-LABEL: @dup_insertelement_0_convert(
+; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
+; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+  %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
+  %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
+  %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
+  %insert = tail call <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16> %v, <vscale x 8 x i1> %2, i16 %s)
+  ret <vscale x 8 x i16> %insert
+}
+
+declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
+declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32)
+
+declare <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1>)
+declare <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1>)
+
+attributes #0 = { "target-features"="+sve" }