diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -368,6 +368,9 @@
 
 static Optional<Instruction *> instCombineSVEDup(InstCombiner &IC,
                                                  IntrinsicInst &II) {
+  assert(II.getIntrinsicID() == Intrinsic::aarch64_sve_dup &&
+         "Expected SVE DUP!");
+
   IntrinsicInst *Pg = dyn_cast<IntrinsicInst>(II.getArgOperand(1));
   if (!Pg)
     return None;
@@ -377,13 +380,40 @@
   const auto PTruePattern =
       cast<ConstantInt>(Pg->getOperand(0))->getZExtValue();
-  if (PTruePattern != AArch64SVEPredPattern::vl1)
+
+  // Do not consider ptrues whose pattern is not in the range VL1 to VL8
+  // inclusive.
+  if (!(PTruePattern >= AArch64SVEPredPattern::vl1 &&
+        PTruePattern <= AArch64SVEPredPattern::vl8))
     return None;
 
-  // The intrinsic is inserting into lane zero so use an insert instead.
+  // If the pattern is not VL1, it is necessary to check that the only uses of
+  // this DUP are other DUPs that overwrite every vector element except for
+  // one. If this condition is not true, bail out.
+  if (PTruePattern != AArch64SVEPredPattern::vl1) {
+    for (User *U : II.users()) {
+      IntrinsicInst *UserIntrInst = dyn_cast<IntrinsicInst>(U);
+      if (!UserIntrInst ||
+          UserIntrInst->getIntrinsicID() != Intrinsic::aarch64_sve_dup)
+        return None;
+
+      IntrinsicInst *UserPg =
+          dyn_cast<IntrinsicInst>(UserIntrInst->getArgOperand(1));
+      if (!UserPg || UserPg->getIntrinsicID() != Intrinsic::aarch64_sve_ptrue)
+        return None;
+
+      const auto UserPTruePattern =
+          cast<ConstantInt>(UserPg->getOperand(0))->getZExtValue();
+      if (UserPTruePattern != PTruePattern - 1)
+        return None;
+    }
+  }
+
+  // Replace with insertelement.
   auto *IdxTy = Type::getInt64Ty(II.getContext());
-  auto *Insert = InsertElementInst::Create(
-      II.getArgOperand(0), II.getArgOperand(2), ConstantInt::get(IdxTy, 0));
+  auto *Insert =
+      InsertElementInst::Create(II.getArgOperand(0), II.getArgOperand(2),
+                                ConstantInt::get(IdxTy, PTruePattern - 1));
   Insert->insertBefore(&II);
   Insert->takeName(&II);
 
   return IC.replaceInstUsesWith(II, Insert);
diff --git a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
--- a/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
+++ b/llvm/test/Transforms/InstCombine/AArch64/sve-intrinsic-opts-dup.ll
@@ -1,11 +1,13 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
 ; RUN: opt -S -instcombine < %s | FileCheck %s
 
 target triple = "aarch64-unknown-linux-gnu"
 
 define <vscale x 16 x i8> @dup_insertelement_0(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0(
-; CHECK: %insert = insertelement <vscale x 16 x i8> %v, i8 %s, i64 0
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT: [[INSERT:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[S:%.*]], i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -13,9 +15,10 @@
 
 define <vscale x 16 x i8> @dup_insertelement_1(<vscale x 16 x i8> %v, i8 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_1(
-; CHECK: %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
-; CHECK-NEXT: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT: [[PG:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT: [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG]], i8 [[S:%.*]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[INSERT]]
+;
   %pg = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
@@ -23,16 +26,18 @@
 
 define <vscale x 16 x i8> @dup_insertelement_x(<vscale x 16 x i8> %v, i8 %s, <vscale x 16 x i1> %pg) #0 {
 ; CHECK-LABEL: @dup_insertelement_x(
-; CHECK: %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
-; CHECK-NEXT: ret <vscale x 16 x i8> %insert
+; CHECK-NEXT: [[INSERT:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG:%.*]], i8 [[S:%.*]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[INSERT]]
+;
   %insert = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg, i8 %s)
   ret <vscale x 16 x i8> %insert
 }
 
 define <vscale x 8 x i16> @dup_insertelement_0_convert(<vscale x 8 x i16> %v, i16 %s) #0 {
 ; CHECK-LABEL: @dup_insertelement_0_convert(
-; CHECK: %insert = insertelement <vscale x 8 x i16> %v, i16 %s, i64 0
-; CHECK-NEXT: ret <vscale x 8 x i16> %insert
+; CHECK-NEXT: [[INSERT:%.*]] = insertelement <vscale x 8 x i16> [[V:%.*]], i16 [[S:%.*]], i64 0
+; CHECK-NEXT: ret <vscale x 8 x i16> [[INSERT]]
+;
   %pg = tail call <vscale x 8 x i1> @llvm.aarch64.sve.ptrue.nxv8i1(i32 1)
   %1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.convert.to.svbool.nxv8i1(<vscale x 8 x i1> %pg)
   %2 = tail call <vscale x 8 x i1> @llvm.aarch64.sve.convert.from.svbool.nxv8i1(<vscale x 16 x i1> %1)
@@ -40,6 +45,167 @@
   ret <vscale x 8 x i16> %insert
 }
 
+define <vscale x 16 x i8> @dup_insertelement_two(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_two(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 1
+; CHECK-NEXT: [[DUP2:%.*]] = insertelement <vscale x 16 x i8> [[DUP1]], i8 [[B:%.*]], i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP2]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  ret <vscale x 16 x i8> %dup2
+}
+
+define <vscale x 16 x i8> @dup_insertelement_four(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_four(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 3
+; CHECK-NEXT: [[DUP2:%.*]] = insertelement <vscale x 16 x i8> [[DUP1]], i8 [[B:%.*]], i64 2
+; CHECK-NEXT: [[DUP3:%.*]] = insertelement <vscale x 16 x i8> [[DUP2]], i8 [[A]], i64 1
+; CHECK-NEXT: [[DUP4:%.*]] = insertelement <vscale x 16 x i8> [[DUP3]], i8 [[B]], i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP4]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  %pg3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %dup3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup2, <vscale x 16 x i1> %pg3, i8 %a)
+  %pg4 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %dup4 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup3, <vscale x 16 x i1> %pg4, i8 %b)
+  ret <vscale x 16 x i8> %dup4
+}
+
+define <vscale x 16 x i8> @dup_insertelement_eight(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_eight(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 7
+; CHECK-NEXT: [[DUP2:%.*]] = insertelement <vscale x 16 x i8> [[DUP1]], i8 [[B:%.*]], i64 6
+; CHECK-NEXT: [[DUP3:%.*]] = insertelement <vscale x 16 x i8> [[DUP2]], i8 [[A]], i64 5
+; CHECK-NEXT: [[DUP4:%.*]] = insertelement <vscale x 16 x i8> [[DUP3]], i8 [[B]], i64 4
+; CHECK-NEXT: [[DUP5:%.*]] = insertelement <vscale x 16 x i8> [[DUP4]], i8 [[A]], i64 3
+; CHECK-NEXT: [[DUP6:%.*]] = insertelement <vscale x 16 x i8> [[DUP5]], i8 [[B]], i64 2
+; CHECK-NEXT: [[DUP7:%.*]] = insertelement <vscale x 16 x i8> [[DUP6]], i8 [[A]], i64 1
+; CHECK-NEXT: [[DUP8:%.*]] = insertelement <vscale x 16 x i8> [[DUP7]], i8 [[B]], i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP8]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 7)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  %pg3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 6)
+  %dup3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup2, <vscale x 16 x i1> %pg3, i8 %a)
+  %pg4 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 5)
+  %dup4 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup3, <vscale x 16 x i1> %pg4, i8 %b)
+  %pg5 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup5 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup4, <vscale x 16 x i1> %pg5, i8 %a)
+  %pg6 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %dup6 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup5, <vscale x 16 x i1> %pg6, i8 %b)
+  %pg7 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %dup7 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup6, <vscale x 16 x i1> %pg7, i8 %a)
+  %pg8 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %dup8 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup7, <vscale x 16 x i1> %pg8, i8 %b)
+  ret <vscale x 16 x i8> %dup8
+}
+
+define <vscale x 16 x i8> @dup_insertelement_partial_chain(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_partial_chain(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 3
+; CHECK-NEXT: [[PG2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+; CHECK-NEXT: [[DUP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP1]], <vscale x 16 x i1> [[PG2]], i8 [[B:%.*]])
+; CHECK-NEXT: [[DUP4:%.*]] = insertelement <vscale x 16 x i8> [[DUP2]], i8 [[B]], i64 0
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP4]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  %pg4 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %dup4 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup2, <vscale x 16 x i1> %pg4, i8 %b)
+  ret <vscale x 16 x i8> %dup4
+}
+
+define <vscale x 16 x i8> @dup_insertelement_non_vl_ptrue_pat_0(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_non_vl_ptrue_pat_0(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 0
+; CHECK-NEXT: [[PG2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 0)
+; CHECK-NEXT: [[DUP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP1]], <vscale x 16 x i1> [[PG2]], i8 [[B:%.*]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP2]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 0) ; POW2 -- not in range.
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  ret <vscale x 16 x i8> %dup2
+}
+
+define <vscale x 16 x i8> @dup_insertelement_non_vl_ptrue_pat_9(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_non_vl_ptrue_pat_9(
+; CHECK-NEXT: [[PG1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9)
+; CHECK-NEXT: [[DUP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG1]], i8 [[A:%.*]])
+; CHECK-NEXT: [[PG2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+; CHECK-NEXT: [[DUP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP1]], <vscale x 16 x i1> [[PG2]], i8 [[B:%.*]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP2]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 9) ; VL16 -- not in range.
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 8)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  ret <vscale x 16 x i8> %dup2
+}
+
+define <vscale x 16 x i8> @dup_insertelement_non_contiguous_ptrues(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_non_contiguous_ptrues(
+; CHECK-NEXT: [[PG1:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+; CHECK-NEXT: [[DUP1:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[V:%.*]], <vscale x 16 x i1> [[PG1]], i8 [[A:%.*]])
+; CHECK-NEXT: [[PG2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT: [[DUP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP1]], <vscale x 16 x i1> [[PG2]], i8 [[B:%.*]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP2]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  ret <vscale x 16 x i8> %dup2
+}
+
+define <vscale x 16 x i8> @dup_insertelement_ret_dup(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_ret_dup(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 3
+; CHECK-NEXT: [[DUP2:%.*]] = insertelement <vscale x 16 x i8> [[DUP1]], i8 [[B:%.*]], i64 2
+; CHECK-NEXT: [[PG3:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+; CHECK-NEXT: [[DUP3:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP2]], <vscale x 16 x i1> [[PG3]], i8 [[A]])
+; CHECK-NEXT: ret <vscale x 16 x i8> [[DUP3]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  %pg3 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 2)
+  %dup3 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup2, <vscale x 16 x i1> %pg3, i8 %a)
+  ret <vscale x 16 x i8> %dup3
+}
+
+define i8 @dup_insertelement_multiple_users(<vscale x 16 x i8> %v, i8 %a, i8 %b) #0 {
+; CHECK-LABEL: @dup_insertelement_multiple_users(
+; CHECK-NEXT: [[DUP1:%.*]] = insertelement <vscale x 16 x i8> [[V:%.*]], i8 [[A:%.*]], i64 3
+; CHECK-NEXT: [[PG2:%.*]] = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+; CHECK-NEXT: [[DUP2:%.*]] = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> [[DUP1]], <vscale x 16 x i1> [[PG2]], i8 [[B:%.*]])
+; CHECK-NEXT: [[X:%.*]] = extractelement <vscale x 16 x i8> [[DUP2]], i32 0
+; CHECK-NEXT: [[Y:%.*]] = extractelement <vscale x 16 x i8> [[DUP2]], i32 1
+; CHECK-NEXT: [[RES:%.*]] = add i8 [[X]], [[Y]]
+; CHECK-NEXT: ret i8 [[RES]]
+;
+  %pg1 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 4)
+  %dup1 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %v, <vscale x 16 x i1> %pg1, i8 %a)
+  %pg2 = tail call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 3)
+  %dup2 = tail call <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8> %dup1, <vscale x 16 x i1> %pg2, i8 %b)
+  %x = extractelement <vscale x 16 x i8> %dup2, i32 0
+  %y = extractelement <vscale x 16 x i8> %dup2, i32 1
+  %res = add i8 %x, %y
+  ret i8 %res
+}
+
 declare <vscale x 16 x i8> @llvm.aarch64.sve.dup.nxv16i8(<vscale x 16 x i8>, <vscale x 16 x i1>, i8)
 declare <vscale x 8 x i16> @llvm.aarch64.sve.dup.nxv8i16(<vscale x 8 x i16>, <vscale x 8 x i1>, i16)