diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst --- a/llvm/docs/LangRef.rst +++ b/llvm/docs/LangRef.rst @@ -18069,6 +18069,56 @@ None. +'``llvm.experimental.get.vector.length``' Intrinsic +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +Syntax: +""""""" +This is an overloaded intrinsic. + +:: + + declare i32 @llvm.experimental.get.vector.length.i32(i32 %cnt, i32 immarg %element_width, i32 immarg %vf, i1 immarg %scalable) + declare i32 @llvm.experimental.get.vector.length.i64(i64 %cnt, i32 immarg %element_width, i32 immarg %vf, i1 immarg %scalable) + +Overview: +""""""""" + +The '``llvm.experimental.get.vector.length.*``' intrinsics take a number of +elements to process and return how many of the elements can be processed +with the requested vectorization factor. + +Arguments: +"""""""""" + +The first argument is an unsigned value of any scalar integer type and specifies +the total number of elements to be processed. The second argument is an i32 +immediate for the element width in bits of the vector type. This serves as a +hint to the target about the element types involved in the loop. The third +argument is an i32 immediate for the vectorization factor. The fourth argument +indicates if the vectorization factor should be multiplied by vscale. + +Semantics: +"""""""""" + +Returns a non-negative i32 value (explicit vector length) that is unknown at compile +time and depends on the hardware specification. +If the result value does not fit in the result type, then the result is +a :ref:`poison value <poisonvalues>`. + +This intrinsic is intended to be used by loop vectorization with VP intrinsics +in order to get the number of elements to process on each loop iteration. The +result should be used to decrease the count for the next iteration until the +count reaches zero. + +If the count is larger than the number of lanes in the type described by the +last 2 arguments, this intrinsic may return a value less than the number of +lanes implied by the type. 
The result will be at least as large as the result +will be on any later loop iteration. + +This intrinsic will only return 0 if the input count is also 0. A non-zero input +count will produce a non-zero result. + Matrix Intrinsics ----------------- diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h --- a/llvm/include/llvm/CodeGen/SelectionDAG.h +++ b/llvm/include/llvm/CodeGen/SelectionDAG.h @@ -1066,6 +1066,11 @@ SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm, bool ConstantFold = true); + /// Return the element count EC as a value of type VT: the known minimum + /// count multiplied by vscale if EC is scalable, otherwise a constant. + SDValue getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, + bool ConstantFold = true); + /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc. SDValue getGLOBAL_OFFSET_TABLE(EVT VT) { return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT); diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -459,6 +459,14 @@ return true; } + /// Return true if llvm.experimental.get.vector.length should be expanded + /// by SelectionDAGBuilder into a umin of the count and the requested VF + /// (times vscale if IsScalable); otherwise visitTargetIntrinsic handles it. + virtual bool shouldExpandGetVectorLength(EVT CountVT, unsigned VF, + bool IsScalable) const { + return true; + } + // Return true if op(vecreduce(x), vecreduce(y)) should be reassociated to // vecreduce(op(x, y)) for the reduction opcode RedOpc. 
virtual bool shouldReassociateReduction(unsigned RedOpc, EVT VT) const { diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -2149,6 +2149,13 @@ [llvm_anyint_ty, LLVMMatchType<1>], [IntrNoMem, IntrNoSync, IntrWillReturn]>; +def int_experimental_get_vector_length: + DefaultAttrsIntrinsic<[llvm_i32_ty], + [llvm_anyint_ty, llvm_i32_ty, llvm_i32_ty, llvm_i1_ty], + [IntrNoMem, IntrNoSync, IntrWillReturn, + ImmArg<ArgIndex<1>>, ImmArg<ArgIndex<2>>, + ImmArg<ArgIndex<3>>]>; + def int_experimental_vp_splice: DefaultAttrsIntrinsic<[llvm_anyvector_ty], [LLVMMatchType<0>, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1956,6 +1956,15 @@ return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT)); } +SDValue SelectionDAG::getElementCount(const SDLoc &DL, EVT VT, ElementCount EC, + bool ConstantFold) { + if (EC.isScalable()) + return getVScale(DL, VT, APInt(VT.getSizeInBits(), EC.getKnownMinValue()), + ConstantFold); + + return getConstant(EC.getKnownMinValue(), DL, VT); +} + SDValue SelectionDAG::getStepVector(const SDLoc &DL, EVT ResVT) { APInt One(ResVT.getScalarSizeInBits(), 1); return getStepVector(DL, ResVT, One); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -7299,6 +7299,40 @@ setValue(&I, SetCC); return; } + case Intrinsic::experimental_get_vector_length: { + assert(cast<ConstantInt>(I.getOperand(2))->getSExtValue() > 0 && + "Expected positive VF"); + unsigned VF = cast<ConstantInt>(I.getOperand(2))->getZExtValue(); + bool IsScalable = cast<ConstantInt>(I.getOperand(3))->isOne(); + + SDValue Count = getValue(I.getOperand(0)); + EVT CountVT = Count.getValueType(); + + if 
(!TLI.shouldExpandGetVectorLength(CountVT, VF, IsScalable)) { + visitTargetIntrinsic(I, Intrinsic); + return; + } + + // Expand to a umin between the trip count and the maximum elements the type + // can hold. + EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType()); + + // Extend the trip count to at least the result VT. + if (CountVT.bitsLT(VT)) { + Count = DAG.getNode(ISD::ZERO_EXTEND, sdl, VT, Count); + CountVT = VT; + } + + SDValue MaxEVL = DAG.getElementCount(sdl, CountVT, + ElementCount::get(VF, IsScalable)); + + SDValue UMin = DAG.getNode(ISD::UMIN, sdl, CountVT, Count, MaxEVL); + // Clip to the result type if needed. + SDValue Trunc = DAG.getNode(ISD::TRUNCATE, sdl, VT, UMin); + + setValue(&I, Trunc); + return; + } case Intrinsic::vector_insert: { SDValue Vec = getValue(I.getOperand(0)); SDValue SubVec = getValue(I.getOperand(1)); diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -5469,6 +5469,12 @@ Call); break; } + case Intrinsic::experimental_get_vector_length: { + ConstantInt *VF = cast<ConstantInt>(Call.getArgOperand(2)); + Check(!VF->isNegative() && !VF->isZero(), + "get_vector_length: VF must be positive", Call); + break; + } case Intrinsic::masked_load: { Check(Call.getType()->isVectorTy(), "masked_load: must return a vector", Call); diff --git a/llvm/test/CodeGen/AArch64/get_vector_length.ll b/llvm/test/CodeGen/AArch64/get_vector_length.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/get_vector_length.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=aarch64-linux-gnu -mattr=+sve | FileCheck %s + +declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32, i1) + +define i32 @vector_length_i16(i16 zeroext %tc) 
{ +; CHECK-LABEL: vector_length_i16: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i32(i32 zeroext %tc) { +; CHECK-LABEL: vector_length_i32: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp w0, w8 +; CHECK-NEXT: csel w0, w0, w8, lo +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i64(i64 %tc) { +; CHECK-LABEL: vector_length_i64: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8 +; CHECK-NEXT: cmp x0, x8 +; CHECK-NEXT: csel x0, x0, x8, lo +; CHECK-NEXT: // kill: def $w0 killed $w0 killed $x0 +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i64(i64 %tc, i32 8, i32 2, i1 true) + ret i32 %a +} diff --git a/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/rvv/get_vector_length.ll @@ -0,0 +1,130 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv32 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV32 +; RUN: sed 's/iXLen/i32/g' %s | llc -mtriple=riscv64 -mattr=+v -verify-machineinstrs | FileCheck %s --check-prefixes=CHECK,RV64 + +declare i32 @llvm.experimental.get.vector.length.i16(i16, i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1) +declare i32 @llvm.experimental.get.vector.length.i64(i64, i32, i32, i1) + +define i32 @vector_length_i16(i16 zeroext %tc) { +; CHECK-LABEL: vector_length_i16: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: srli a1, a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB0_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: mv a0, a1 +; 
CHECK-NEXT: .LBB0_2: +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i32(i32 zeroext %tc) { +; RV32-LABEL: vector_length_i32: +; RV32: # %bb.0: +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB1_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB1_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_i32: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB1_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB1_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_XLen(iXLen zeroext %tc) { +; RV32-LABEL: vector_length_XLen: +; RV32: # %bb.0: +; RV32-NEXT: csrr a1, vlenb +; RV32-NEXT: srli a1, a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB2_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: mv a0, a1 +; RV32-NEXT: .LBB2_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_XLen: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: csrr a1, vlenb +; RV64-NEXT: srli a1, a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB2_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: mv a0, a1 +; RV64-NEXT: .LBB2_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i32 2, i1 true) + ret i32 %a +} + +define i32 @vector_length_i16_fixed(i16 zeroext %tc) { +; CHECK-LABEL: vector_length_i16_fixed: +; CHECK: # %bb.0: +; CHECK-NEXT: li a1, 2 +; CHECK-NEXT: bltu a0, a1, .LBB3_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: li a0, 2 +; CHECK-NEXT: .LBB3_2: +; CHECK-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i16(i16 %tc, i32 8, i32 2, i1 false) + ret i32 %a +} + +define i32 @vector_length_i32_fixed(i32 zeroext %tc) { +; RV32-LABEL: vector_length_i32_fixed: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 2 +; RV32-NEXT: bltu a0, 
a1, .LBB4_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 2 +; RV32-NEXT: .LBB4_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_i32_fixed: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB4_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 2 +; RV64-NEXT: .LBB4_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 2, i1 false) + ret i32 %a +} + +define i32 @vector_length_XLen_fixed(iXLen zeroext %tc) { +; RV32-LABEL: vector_length_XLen_fixed: +; RV32: # %bb.0: +; RV32-NEXT: li a1, 2 +; RV32-NEXT: bltu a0, a1, .LBB5_2 +; RV32-NEXT: # %bb.1: +; RV32-NEXT: li a0, 2 +; RV32-NEXT: .LBB5_2: +; RV32-NEXT: ret +; +; RV64-LABEL: vector_length_XLen_fixed: +; RV64: # %bb.0: +; RV64-NEXT: sext.w a0, a0 +; RV64-NEXT: li a1, 2 +; RV64-NEXT: bltu a0, a1, .LBB5_2 +; RV64-NEXT: # %bb.1: +; RV64-NEXT: li a0, 2 +; RV64-NEXT: .LBB5_2: +; RV64-NEXT: ret + %a = call i32 @llvm.experimental.get.vector.length.iXLen(iXLen %tc, i32 8, i32 2, i1 false) + ret i32 %a +} diff --git a/llvm/test/Verifier/get_vector_length.ll b/llvm/test/Verifier/get_vector_length.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Verifier/get_vector_length.ll @@ -0,0 +1,17 @@ +; RUN: not llvm-as < %s -o /dev/null 2>&1 | FileCheck %s + +declare i32 @llvm.experimental.get.vector.length.i32(i32, i32, i32, i1) + +define i32 @vector_length_negative_vf(i32 zeroext %tc) { + ; CHECK: get_vector_length: VF must be positive + ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 -1, i1 true) + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 -1, i1 true) + ret i32 %a +} + +define i32 @vector_length_zero_vf(i32 zeroext %tc) { + ; CHECK: get_vector_length: VF must be positive + ; CHECK-NEXT: %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 0, i1 true) + %a = call i32 @llvm.experimental.get.vector.length.i32(i32 %tc, i32 8, i32 0, i1 true) + ret 
i32 %a +}