diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -1125,6 +1125,24 @@
   KnownBits Known2;
   switch (Op.getOpcode()) {
+  case ISD::VSCALE: {
+    Function const &F = TLO.DAG.getMachineFunction().getFunction();
+    Attribute const &Attr = F.getFnAttribute(Attribute::VScaleRange);
+    if (!Attr.isValid())
+      return false;
+    std::optional<unsigned> MaxVScale = Attr.getVScaleRangeMax();
+    if (!MaxVScale.has_value())
+      return false;
+    APInt VScaleResultUpperbound(64, *MaxVScale);
+    VScaleResultUpperbound *= Op.getConstantOperandAPInt(0).sextOrTrunc(64);
+    bool Negative = VScaleResultUpperbound.isNegative();
+    if (Negative)
+      VScaleResultUpperbound = ~VScaleResultUpperbound;
+    unsigned RequiredBits = VScaleResultUpperbound.getActiveBits();
+    if (RequiredBits < BitWidth)
+      (Negative ? Known.One : Known.Zero).setHighBits(BitWidth - RequiredBits);
+    return false;
+  }
   case ISD::SCALAR_TO_VECTOR: {
     if (VT.isScalableVector())
       return false;
diff --git a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
--- a/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
+++ b/llvm/test/CodeGen/AArch64/vscale-and-sve-cnt-demandedbits.ll
@@ -14,9 +14,8 @@
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: rdvl x8, #1
 ; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: and w9, w8, #0x1f
-; CHECK-NEXT: and w8, w8, #0xfffffffc
-; CHECK-NEXT: add w0, w9, w8
+; CHECK-NEXT: and w9, w8, #0x1c
+; CHECK-NEXT: add w0, w8, w9
 ; CHECK-NEXT: ret
   %vscale = call i32 @llvm.vscale.i32()
   %and_redundant = and i32 %vscale, 31
@@ -85,8 +84,7 @@
 ; CHECK-LABEL: vscale_trunc_zext:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: and x0, x8, #0xffffffff
+; CHECK-NEXT: lsr x0, x8, #4
 ; CHECK-NEXT: ret
   %vscale = call i32 @llvm.vscale.i32()
   %zext = zext i32 %vscale to i64
@@ -97,8 +95,7 @@
 ; CHECK-LABEL: vscale_trunc_sext:
 ; CHECK: // %bb.0:
 ; CHECK-NEXT: rdvl x8, #1
-; CHECK-NEXT: lsr x8, x8, #4
-; CHECK-NEXT: sxtw x0, w8
+; CHECK-NEXT: lsr x0, x8, #4
 ; CHECK-NEXT: ret
   %vscale = call i32 @llvm.vscale.i32()
   %sext = sext i32 %vscale to i64
@@ -200,9 +197,8 @@
 ; CHECK-NEXT: mov w9, #5
 ; CHECK-NEXT: lsr x8, x8, #4
 ; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: and w9, w8, #0x7f
-; CHECK-NEXT: and w8, w8, #0x3f
-; CHECK-NEXT: add w0, w9, w8
+; CHECK-NEXT: and w9, w8, #0x3f
+; CHECK-NEXT: add w0, w8, w9
 ; CHECK-NEXT: ret
   %vscale = call i32 @llvm.vscale.i32()
   %mul = mul i32 %vscale, 5
@@ -219,9 +215,8 @@
 ; CHECK-NEXT: mov x9, #-5
 ; CHECK-NEXT: lsr x8, x8, #4
 ; CHECK-NEXT: mul x8, x8, x9
-; CHECK-NEXT: orr w9, w8, #0xffffff80
-; CHECK-NEXT: and w8, w8, #0xffffffc0
-; CHECK-NEXT: add w0, w9, w8
+; CHECK-NEXT: and w9, w8, #0xffffffc0
+; CHECK-NEXT: add w0, w8, w9
 ; CHECK-NEXT: ret
   %vscale = call i32 @llvm.vscale.i32()
   %mul = mul i32 %vscale, -5
@@ -231,6 +226,22 @@
   ret i32 %result
 }
 
+define i32 @pow2_vscale_with_negative_multiplier() vscale_range(1,16) {
+; CHECK-LABEL: pow2_vscale_with_negative_multiplier:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8
+; CHECK-NEXT: neg x8, x8
+; CHECK-NEXT: orr w9, w8, #0xfffffff0
+; CHECK-NEXT: add w0, w8, w9
+; CHECK-NEXT: ret
+  %vscale = call i32 @llvm.vscale.i32()
+  %mul = mul i32 %vscale, -2
+  %or_redundant = or i32 %mul, 4294967264
+  %or_required = or i32 %mul, 4294967280
+  %result = add i32 %or_redundant, %or_required
+  ret i32 %result
+}
+
 declare i32 @llvm.vscale.i32()
 declare i64 @llvm.aarch64.sve.cntb(i32 %pattern)
 declare i64 @llvm.aarch64.sve.cnth(i32 %pattern)
diff --git a/llvm/test/CodeGen/RISCV/vscale-demanded-bits.ll b/llvm/test/CodeGen/RISCV/vscale-demanded-bits.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vscale-demanded-bits.ll
@@ -0,0 +1,82 @@
+; RUN: llc -mtriple riscv64-unknown-linux-musl -mattr +v -filetype asm -o - %s | FileCheck %s
+
+; CHECK: vse8.v v8, (a5), v0.t
+; CHECK: vadd.vx v8, v8, a3
+
+target datalayout = "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128"
+target triple = "riscv64-unknown-linux-musl"
+
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind optsize memory(argmem: write) vscale_range(2,1024)
+define dso_local void @_Z4FillPhi(ptr nocapture noundef writeonly %buffer, i32 noundef signext %n) local_unnamed_addr #0 {
+entry:
+  %cmp4 = icmp sgt i32 %n, 0
+  br i1 %cmp4, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  %0 = tail call i64 @llvm.vscale.i64()
+  %1 = shl nuw nsw i64 %0, 3
+  %2 = tail call i64 @llvm.vscale.i64()
+  %3 = shl nuw nsw i64 %2, 3
+  %4 = add nsw i64 %3, -1
+  %n.rnd.up = add nsw i64 %4, %wide.trip.count
+  %n.mod.vf = urem i64 %n.rnd.up, %1
+  %n.vec = sub nsw i64 %n.rnd.up, %n.mod.vf
+  %5 = tail call <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8()
+  %6 = tail call i8 @llvm.vscale.i8()
+  %7 = shl i8 %6, 3
+  %.splatinsert = insertelement <vscale x 8 x i8> poison, i8 %7, i64 0
+  %.splat = shufflevector <vscale x 8 x i8> %.splatinsert, <vscale x 8 x i8> poison, <vscale x 8 x i32> zeroinitializer
+  %8 = tail call i64 @llvm.vscale.i64()
+  %9 = shl nuw nsw i64 %8, 3
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %for.body.preheader
+  %index = phi i64 [ 0, %for.body.preheader ], [ %index.next, %vector.body ]
+  %vec.ind = phi <vscale x 8 x i8> [ %5, %for.body.preheader ], [ %vec.ind.next, %vector.body ]
+  %active.lane.mask = tail call <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64 %index, i64 %wide.trip.count)
+  %10 = getelementptr inbounds i8, ptr %buffer, i64 %index
+  tail call void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8> %vec.ind, ptr %10, i32 1, <vscale x 8 x i1> %active.lane.mask), !tbaa !6
+  %index.next = add i64 %index, %9
+  %vec.ind.next = add <vscale x 8 x i8> %vec.ind, %.splat
+  %11 = icmp eq i64 %index.next, %n.vec
+  br i1 %11, label %for.cond.cleanup, label %vector.body, !llvm.loop !9
+
+for.cond.cleanup:                                 ; preds = %vector.body, %entry
+  ret void
+}
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i64 @llvm.vscale.i64() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 8 x i8> @llvm.experimental.stepvector.nxv8i8() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare i8 @llvm.vscale.i8() #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(none)
+declare <vscale x 8 x i1> @llvm.get.active.lane.mask.nxv8i1.i64(i64, i64) #1
+
+; Function Attrs: nocallback nofree nosync nounwind willreturn memory(argmem: write)
+declare void @llvm.masked.store.nxv8i8.p0(<vscale x 8 x i8>, ptr nocapture, i32 immarg, <vscale x 8 x i1>) #2
+
+attributes #0 = { mustprogress nofree noinline norecurse nosync nounwind optsize memory(argmem: write) vscale_range(2,1024) "frame-pointer"="none" "min-legal-vector-width"="0" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic-rv64" "target-features"="+64bit,+a,+c,+d,+f,+m,+relax,+v,+zve32f,+zve32x,+zve64d,+zve64f,+zve64x,+zvl128b,+zvl32b,+zvl64b,-save-restore" }
+attributes #1 = { nocallback nofree nosync nounwind willreturn memory(none) }
+attributes #2 = { nocallback nofree nosync nounwind willreturn memory(argmem: write) }
+
+!llvm.module.flags = !{!0, !1, !2, !3, !4}
+!llvm.ident = !{!5}
+
+!0 = !{i32 1, !"wchar_size", i32 4}
+!1 = !{i32 1, !"target-abi", !"lp64d"}
+!2 = !{i32 8, !"PIC Level", i32 2}
+!3 = !{i32 7, !"PIE Level", i32 2}
+!4 = !{i32 1, !"SmallDataLimit", i32 8}
+!5 = !{!"clang version 16.0.0 (https://github.com/llvm/llvm-project 07184d24df19ecdadb09c53eaf2d18e55a99aa82)"}
+!6 = !{!7, !7, i64 0}
+!7 = !{!"omnipotent char", !8, i64 0}
+!8 = !{!"Simple C++ TBAA"}
+!9 = distinct !{!9, !10, !11}
+!10 = !{!"llvm.loop.mustprogress"}
+!11 = !{!"llvm.loop.isvectorized", i32 1}
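
Reviewer note: below is a minimal standalone sketch of the bound computation the new ISD::VSCALE case performs, written in plain C++ with uint64_t standing in for APInt. The constants used (vscale_range(1,16), multiplier -2, 32-bit result) are example values chosen to match the pow2_vscale_with_negative_multiplier test above, not anything taken from the patch itself. The idea: max_vscale * multiplier bounds every possible ISD::VSCALE result, so every bit above its active-bit count is known, zero for a positive product and one for a negative one, with one's complement used so the same bit count works for negative bounds.

#include <cstdint>
#include <cstdio>

// Sketch of the ISD::VSCALE known-bits derivation, with uint64_t
// standing in for APInt. All constants are hypothetical example
// values mirroring the new AArch64 test, not LLVM API.
static unsigned activeBits(uint64_t V) {
  unsigned Bits = 0;
  for (; V; V >>= 1)
    ++Bits;
  return Bits;
}

int main() {
  const int64_t MaxVScale = 16;  // from the vscale_range attribute
  const int64_t Multiplier = -2; // ISD::VSCALE's constant operand
  const unsigned BitWidth = 32;  // width of the value being analyzed

  // Result of largest magnitude that vscale * Multiplier can take.
  int64_t UpperBound = MaxVScale * Multiplier;
  bool Negative = UpperBound < 0;
  // A negative bound has its high bits known one; complementing it
  // lets the same active-bit count measure how many low bits vary.
  uint64_t Magnitude = Negative ? ~(uint64_t)UpperBound : (uint64_t)UpperBound;
  unsigned RequiredBits = activeBits(Magnitude);

  if (RequiredBits < BitWidth)
    printf("high %u bits known %s\n", BitWidth - RequiredBits,
           Negative ? "one" : "zero");
  return 0;
}

Running this prints "high 27 bits known one": with vscale in [1,16] and multiplier -2 the product lies in [-32, -2], so bits 31 down to 5 are known one. That is why "or i32 %mul, 4294967264" (0xFFFFFFE0) folds away in the new test while "or i32 %mul, 4294967280" (0xFFFFFFF0) must survive: the latter also sets bit 4, which is not known.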