diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18060,6 +18060,48 @@
 
 None.
 
+'``llvm.experimental.get.vector.length``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %element_width, i32 immarg %vf)
+      declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %element_width, i32 immarg %vf)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.get.vector.length.*``' intrinsics take a number of
+elements to process and return how many of the elements can be processed
+with the requested vectorization factor.
+
+Arguments:
+""""""""""
+
+The first argument is of any integer type and specifies the total number of
+elements to be processed. The second argument is an i32 immediate for the
+element width in bits of the vector type. This serves as a hint to the target
+about the element types involved in the loop. The third argument is an i32
+immediate for the vectorization factor. This factor is treated as a multiple of
+vscale.
+
+Semantics:
+""""""""""
+
+Returns a non-negative value (explicit vector length) that is unknown at compile
+time and depends on the hardware specification.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+If the total count is larger than VF*vscale, this intrinsic is not required to
+return VF*vscale. The result will be at least as large as the result for any
+value less than count. This ensures that calling it for the total count will
+return the largest value any later loop iteration will see.
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -459,6 +459,13 @@
     return true;
   }
 
+  /// Return true if SelectionDAGBuilder should lower
+  /// llvm.experimental.get.vector.length with its generic umin expansion.
+  /// Return false to have the call visited as a target intrinsic instead.
+  virtual bool shouldExpandGetVectorLength() const {
+    return true;
+  }
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2146,6 +2146,15 @@
                             [llvm_anyint_ty, LLVMMatchType<1>],
                             [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
+// Returns (as i32) how many of the requested elements can be processed.
+// Operands: element count (any integer type, overloaded), element width in
+// bits (i32 immediate), vectorization factor (i32 immediate).
+def int_experimental_get_vector_length:
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7295,6 +7295,37 @@
     setValue(&I, SetCC);
     return;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    // Targets that opt out of the generic expansion get the call routed
+    // through visitTargetIntrinsic instead.
+    if (!TLI.shouldExpandGetVectorLength()) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    // Expand to a umin between the trip count and the maximum elements the type
+    // can hold. The element width hint (operand 1) is not used here.
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    SDValue TripCount = getValue(I.getOperand(0));
+
+    // Extend the trip count to at least the result VT.
+    if (TripCount.getValueType().bitsLT(VT))
+      TripCount = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, TripCount);
+
+    EVT TripCountVT = TripCount.getValueType();
+
+    // Operand 2 is the immediate vectorization factor; the upper bound on the
+    // result is VF * vscale, built in the (possibly extended) count type.
+    uint64_t VF = cast<ConstantInt>(I.getOperand(2))->getZExtValue();
+    SDValue MaxEVL = DAG.getVScale(sdl, TripCountVT,
+                                   APInt(TripCountVT.getSizeInBits(), VF));
+    SDValue UMin = DAG.getNode(ISD::UMIN, sdl, TripCountVT, TripCount, MaxEVL);
+    // Clip to the result type if needed.
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+    setValue(&I, Trunc);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));
diff --git a/llvm/test/CodeGen/AArch64/get_vector_length.ll b/llvm/test/CodeGen/AArch64/get_vector_length.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get_vector_length.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2)
+  ret i32 %a
+}
+
+define i32 @vector_length_i64(i64 %tc) {
+; CHECK-LABEL: vector_length_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 8, i32 2)
+  ret i32 %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+; NOTE(review): both RUN lines rewrite iXLen to i32, so vector_length_XLen
+; repeats the i32 case on riscv64. Confirm whether riscv64 should use i64.
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    bltu a0, a1, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_XLen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i32 2)
+  ret i32 %a
+}