diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -3213,6 +3213,51 @@
       return I;
     break;
   }
+  case Intrinsic::get_active_lane_mask: {
+    // Try to eliminate get.active.lane.mask intrinsics which always return
+    // an all-false predicate in certain scalable-vector loops where the
+    // vectorization factor is known.
+    if (!II->getType()->isScalableTy())
+      break;
+
+    Value *Op0 = II->getOperand(0);
+    auto *Op1 = dyn_cast<ConstantInt>(II->getOperand(1));
+    if (!Op1)
+      break;
+
+    Value *Idx, *Vf;
+    if (!match(Op0, m_Add(m_Value(Idx), m_Value(Vf))))
+      break;
+
+    auto *Phi = dyn_cast<PHINode>(Idx);
+    if (!Phi)
+      break;
+
+    BinaryOperator *BO;
+    Value *L, *R;
+    if (!matchSimpleRecurrence(Phi, BO, L, R) ||
+        BO->getOpcode() != Instruction::Add)
+      break;
+
+    ConstantInt *PhiOp0 = dyn_cast<ConstantInt>(L);
+    if (!PhiOp0 || !PhiOp0->isZero())
+      break;
+    ConstantInt *ShlValue;
+    if (!match(Vf, m_Shl(m_VScale(), m_ConstantInt(ShlValue))))
+      break;
+
+    Attribute VScaleAttr =
+        II->getFunction()->getFnAttribute(Attribute::VScaleRange);
+    if (!VScaleAttr.isValid())
+      break;
+    unsigned VScaleMin = VScaleAttr.getVScaleRangeMin();
+    uint64_t MinVScaleElts = VScaleMin * ShlValue->getZExtValue();
+    if (MinVScaleElts < Op1->getZExtValue())
+      break;
+
+    auto *PFalse = Constant::getNullValue(II->getType());
+    return replaceInstUsesWith(*II, PFalse);
+  }
   default: {
     // Handle target specific intrinsics
     std::optional<Instruction *> V = targetInstCombineIntrinsic(*II);
diff --git a/llvm/test/Transforms/InstCombine/get-active-lane-mask.ll b/llvm/test/Transforms/InstCombine/get-active-lane-mask.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/get-active-lane-mask.ll
@@ -0,0 +1,102 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -passes=instcombine -S < %s | FileCheck %s
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @eliminate_always_false_scalable_get_active_lane_mask_in_loop(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: define void @eliminate_always_false_scalable_get_active_lane_mask_in_loop
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VF:%.*]] = shl nuw nsw i64 [[VSCALE]], 4
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 4)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ zeroinitializer, [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP1]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add i64 [[INDEX]], [[VF]]
+; CHECK-NEXT:    br i1 false, label [[VECTOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 4
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 4)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %vector.body ]
+  %0 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %0, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %1 = shl <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %2 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %1, ptr %2, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %index.next = add i64 %index, %vf
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 4)
+  %3 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+  br i1 %3, label %vector.body, label %exit
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+
+define void @neg_get_active_lane_mask_in_loop(ptr %dst, ptr %src) #0 {
+; CHECK-LABEL: define void @neg_get_active_lane_mask_in_loop
+; CHECK-SAME: (ptr [[DST:%.*]], ptr [[SRC:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[VSCALE:%.*]] = tail call i64 @llvm.vscale.i64()
+; CHECK-NEXT:    [[VF:%.*]] = shl nuw nsw i64 [[VSCALE]], 1
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_ENTRY:%.*]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 4)
+; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
+; CHECK:       vector.body:
+; CHECK-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK:%.*]] = phi <vscale x 16 x i1> [ [[ACTIVE_LANE_MASK_ENTRY]], [[ENTRY]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ]
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds i8, ptr [[SRC]], i64 [[INDEX]]
+; CHECK-NEXT:    [[WIDE_MASKED_LOAD:%.*]] = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr [[TMP0]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]], <vscale x 16 x i8> poison)
+; CHECK-NEXT:    [[TMP1:%.*]] = shl <vscale x 16 x i8> [[WIDE_MASKED_LOAD]], shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds i8, ptr [[DST]], i64 [[INDEX]]
+; CHECK-NEXT:    tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> [[TMP1]], ptr [[TMP2]], i32 1, <vscale x 16 x i1> [[ACTIVE_LANE_MASK]])
+; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw nsw i64 [[INDEX]], [[VF]]
+; CHECK-NEXT:    [[ACTIVE_LANE_MASK_NEXT]] = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 [[INDEX_NEXT]], i64 4)
+; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <vscale x 16 x i1> [[ACTIVE_LANE_MASK_NEXT]], i64 0
+; CHECK-NEXT:    br i1 [[TMP3]], label [[VECTOR_BODY]], label [[EXIT:%.*]]
+; CHECK:       exit:
+; CHECK-NEXT:    ret void
+;
+entry:
+  %vscale = tail call i64 @llvm.vscale.i64()
+  %vf = shl nuw nsw i64 %vscale, 1
+  %active.lane.mask.entry = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 0, i64 4)
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i64 [ 0, %entry ], [ %index.next, %vector.body ]
+  %active.lane.mask = phi <vscale x 16 x i1> [ %active.lane.mask.entry, %entry ], [ %active.lane.mask.next, %vector.body ]
+  %0 = getelementptr inbounds i8, ptr %src, i64 %index
+  %wide.masked.load = tail call <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr %0, i32 1, <vscale x 16 x i1> %active.lane.mask, <vscale x 16 x i8> poison)
+  %1 = shl <vscale x 16 x i8> %wide.masked.load, shufflevector (<vscale x 16 x i8> insertelement (<vscale x 16 x i8> poison, i8 1, i32 0), <vscale x 16 x i8> poison, <vscale x 16 x i32> zeroinitializer)
+  %2 = getelementptr inbounds i8, ptr %dst, i64 %index
+  tail call void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8> %1, ptr %2, i32 1, <vscale x 16 x i1> %active.lane.mask)
+  %index.next = add nuw nsw i64 %index, %vf
+  %active.lane.mask.next = tail call <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64 %index.next, i64 4)
+  %3 = extractelement <vscale x 16 x i1> %active.lane.mask.next, i64 0
+  br i1 %3, label %vector.body, label %exit
+
+exit:                                             ; preds = %vector.body
+  ret void
+}
+
+declare i64 @llvm.vscale.i64()
+declare <vscale x 16 x i1> @llvm.get.active.lane.mask.nxv16i1.i64(i64, i64)
+declare <vscale x 16 x i8> @llvm.masked.load.nxv16i8.p0(ptr, i32 immarg, <vscale x 16 x i1>, <vscale x 16 x i8>)
+declare void @llvm.masked.store.nxv16i8.p0(<vscale x 16 x i8>, ptr, i32 immarg, <vscale x 16 x i1>)
+
+attributes #0 = { vscale_range(1,16) "target-cpu"="neoverse-v1" "target-features"="+sve" }