diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -18058,6 +18058,52 @@
 None.
 
+'``llvm.experimental.get.vector.length``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+This is an overloaded intrinsic.
+
+::
+
+      declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %element_width, i32 immarg %vf, i1 immarg %scalable)
+      declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %element_width, i32 immarg %vf, i1 immarg %scalable)
+
+Overview:
+"""""""""
+
+The '``llvm.experimental.get.vector.length.*``' intrinsics take the number of
+elements to process and return how many of those elements can be processed
+with the requested vectorization factor.
+
+Arguments:
+""""""""""
+
+The first argument is of any integer type and specifies the total number of
+elements to be processed. The second argument is an i32 immediate for the
+element width in bits of the vector type. This serves as a hint to the target
+about the element types involved in the loop. The third argument is an i32
+immediate for the vectorization factor. The fourth argument indicates whether
+the vectorization factor should be multiplied by vscale.
+
+Semantics:
+""""""""""
+
+Returns a positive i32 value (the explicit vector length) that is unknown at
+compile time and depends on the hardware specification.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+This intrinsic is intended to be used by loop vectorization with VP intrinsics
+in order to get the number of elements to process on each loop iteration. The
+result should be used to decrease the count for the next iteration until the
+count reaches zero.
+
+If the count is larger than the vectorization factor (multiplied by vscale,
+when requested), this intrinsic may not return the full vectorization factor.
+The result on a given iteration will be at least as large as the result on
+any later iteration.
+
 Matrix Intrinsics
 -----------------
 
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -459,6 +459,13 @@
     return true;
   }
 
+  // Return true if the @llvm.experimental.get.vector.length intrinsic should
+  // be expanded using generic code in SelectionDAGBuilder.
+  virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF,
+                                           bool IsScalable) const {
+    return true;
+  }
+
   // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to
   // vecreduce(op(x, y)) for the reduction opcode RedOpc.
   virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const {
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -2149,6 +2149,13 @@
                                          [llvm_anyint_ty, LLVMMatchType<1>],
                                          [IntrNoMem, IntrNoSync, IntrWillReturn]>;
 
+def int_experimental_get_vector_length:
+  DefaultAttrsIntrinsic<[llvm_i32_ty],
+                        [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty],
+                        [IntrNoMem, IntrNoSync, IntrWillReturn,
+                         ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>,
+                         ImmArg<ArgIndex<3>>]>;
+
 def int_experimental_vp_splice:
   DefaultAttrsIntrinsic<[llvm_anyvector_ty],
                         [LLVMMatchType<0>,
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -7299,6 +7299,43 @@
     setValue(&I, SetCC);
     return;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    assert(cast<ConstantInt>(I.getOperand(2))->getSExtValue() > 0 &&
+           "Expected positive VF");
+    unsigned VF = cast<ConstantInt>(I.getOperand(2))->getZExtValue();
+    bool IsScalable = cast<ConstantInt>(I.getOperand(3))->isOne();
+
+    SDValue Count = getValue(I.getOperand(0));
+    EVT CountVT = Count.getValueType();
+
+    if (!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) {
+      visitTargetIntrinsic(I, Intrinsic);
+      return;
+    }
+
+    // Expand to a umin between the trip count and the maximum elements the
+    // type can hold.
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+
+    // Extend the trip count to at least the result VT.
+    if (CountVT.bitsLT(VT)) {
+      Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count);
+      CountVT = VT;
+    }
+
+    SDValue MaxEVL;
+    if (IsScalable)
+      MaxEVL = DAG.getVScale(sdl, CountVT,
+                             APInt(CountVT.getSizeInBits(), VF));
+    else
+      MaxEVL = DAG.getConstant(VF, sdl, CountVT);
+
+    SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL);
+    // Clip to the result type if needed.
+    SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin);
+
+    setValue(&I, Trunc);
+    return;
+  }
   case Intrinsic::vector_insert: {
     SDValue Vec = getValue(I.getOperand(0));
     SDValue SubVec = getValue(I.getOperand(1));
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -5459,6 +5459,12 @@
           Call);
     break;
   }
+  case Intrinsic::experimental_get_vector_length: {
+    ConstantInt *VF = cast<ConstantInt>(Call.getArgOperand(2));
+    Check(!VF->isNegative() && !VF->isZero(),
+          "get_vector_length: VF must be positive", Call);
+    break;
+  }
   case Intrinsic::masked_load: {
     Check(Call.getType()->isVectorTy(), "masked_load: must return a vector",
           Call);
diff --git a/llvm/test/CodeGen/AArch64/get_vector_length.ll b/llvm/test/CodeGen/AArch64/get_vector_length.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/get_vector_length.ll
@@ -0,0 +1,40 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32, i1)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; CHECK-LABEL: vector_length_i32:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp w0, w8
+; CHECK-NEXT:    csel w0, w0, w8, lo
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i64(i64 %tc) {
+; CHECK-LABEL: vector_length_i64:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    cntd x8
+; CHECK-NEXT:    cmp x0, x8
+; CHECK-NEXT:    csel x0, x0, x8, lo
+; CHECK-NEXT:    // kill: def $w0 killed $w0 killed $x0
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll
@@ -0,0 +1,130 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32
+; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64
+
+declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1)
+declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32, i1)
+
+define i32 @vector_length_i16(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    csrr a1, vlenb
+; CHECK-NEXT:    srli a1, a1, 2
+; CHECK-NEXT:    bltu a0, a1, .LBB0_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    mv a0, a1
+; CHECK-NEXT:  .LBB0_2:
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB1_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB1_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_i32:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB1_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB1_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen:
+; RV32:       # %bb.0:
+; RV32-NEXT:    csrr a1, vlenb
+; RV32-NEXT:    srli a1, a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB2_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    mv a0, a1
+; RV32-NEXT:  .LBB2_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_XLen:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    csrr a1, vlenb
+; RV64-NEXT:    srli a1, a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB2_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    mv a0, a1
+; RV64-NEXT:  .LBB2_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i32 2, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_i16_fixed(i16 zeroext %tc) {
+; CHECK-LABEL: vector_length_i16_fixed:
+; CHECK:       # %bb.0:
+; CHECK-NEXT:    li a1, 2
+; CHECK-NEXT:    bltu a0, a1, .LBB3_2
+; CHECK-NEXT:  # %bb.1:
+; CHECK-NEXT:    li a0, 2
+; CHECK-NEXT:  .LBB3_2:
+; CHECK-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_i32_fixed(i32 zeroext %tc) {
+; RV32-LABEL: vector_length_i32_fixed:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB4_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a0, 2
+; RV32-NEXT:  .LBB4_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_i32_fixed:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    li a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB4_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:  .LBB4_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 false)
+  ret i32 %a
+}
+
+define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) {
+; RV32-LABEL: vector_length_XLen_fixed:
+; RV32:       # %bb.0:
+; RV32-NEXT:    li a1, 2
+; RV32-NEXT:    bltu a0, a1, .LBB5_2
+; RV32-NEXT:  # %bb.1:
+; RV32-NEXT:    li a0, 2
+; RV32-NEXT:  .LBB5_2:
+; RV32-NEXT:    ret
+;
+; RV64-LABEL: vector_length_XLen_fixed:
+; RV64:       # %bb.0:
+; RV64-NEXT:    sext.w a0, a0
+; RV64-NEXT:    li a1, 2
+; RV64-NEXT:    bltu a0, a1, .LBB5_2
+; RV64-NEXT:  # %bb.1:
+; RV64-NEXT:    li a0, 2
+; RV64-NEXT:  .LBB5_2:
+; RV64-NEXT:    ret
+  %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i32 2, i1 false)
+  ret i32 %a
+}
diff --git a/llvm/test/Verifier/get_vector_length.ll b/llvm/test/Verifier/get_vector_length.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Verifier/get_vector_length.ll
@@ -0,0 +1,17 @@
+; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s
+
+declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1)
+
+define i32 @vector_length_negative_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 -1, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 -1, i1 true)
+  ret i32 %a
+}
+
+define i32 @vector_length_zero_vf(i32 zeroext %tc) {
+  ; CHECK: get_vector_length: VF must be positive
+  ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 0, i1 true)
+  %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 0, i1 true)
+  ret i32 %a
+}
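
Note for reviewers: a minimal sketch (not part of the patch) of the stripmined-loop
usage pattern the LangRef Semantics section above describes. The function and value
names are illustrative, the loop copies %n i64 elements (assuming %n > 0), and it
uses the existing llvm.vp.load / llvm.vp.store intrinsics with an all-true mask so
that only the EVL result limits the active lanes::

  declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32, i1)
  declare <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr, <vscale x 2 x i1>, i32)
  declare void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64>, ptr, <vscale x 2 x i1>, i32)

  ; Copy %n i64 elements from %src to %dst; assumes %n > 0.
  define void @copy_example(ptr %dst, ptr %src, i64 %n) {
  entry:
    ; Build an all-true mask; the EVL argument does the lane limiting.
    %head = insertelement <vscale x 2 x i1> poison, i1 true, i32 0
    %mask = shufflevector <vscale x 2 x i1> %head, <vscale x 2 x i1> poison, <vscale x 2 x i32> zeroinitializer
    br label %loop

  loop:
    %avl = phi i64 [ %n, %entry ], [ %avl.next, %loop ]
    %idx = phi i64 [ 0, %entry ], [ %idx.next, %loop ]
    ; Ask how many of the remaining %avl elements this iteration may process,
    ; for 64-bit elements at VF = 2 x vscale (matching <vscale x 2 x i64>).
    %evl = call i32 @llvm.experimental.get.vector.length.i64(i64 %avl, i32 64, i32 2, i1 true)
    %src.p = getelementptr inbounds i64, ptr %src, i64 %idx
    %dst.p = getelementptr inbounds i64, ptr %dst, i64 %idx
    %v = call <vscale x 2 x i64> @llvm.vp.load.nxv2i64.p0(ptr %src.p, <vscale x 2 x i1> %mask, i32 %evl)
    call void @llvm.vp.store.nxv2i64.p0(<vscale x 2 x i64> %v, ptr %dst.p, <vscale x 2 x i1> %mask, i32 %evl)
    ; Decrease the count by the number of elements processed this iteration.
    %evl.zext = zext i32 %evl to i64
    %idx.next = add i64 %idx, %evl.zext
    %avl.next = sub i64 %avl, %evl.zext
    %done = icmp eq i64 %avl.next, 0
    br i1 %done, label %exit, label %loop

  exit:
    ret void
  }

With the generic expansion added in SelectionDAGBuilder above, the %evl call is the
scalar equivalent of umin(%avl, vscale * 2) truncated to i32; a target can instead
return false from shouldExpandGetVectorLength and supply its own lowering.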