diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -17889,6 +17889,34 @@
 mask argument does not match the pointer size of the target, the mask is
 zero-extended or truncated accordingly.
 
+.. _int_vscale:
+
+'``llvm.vscale``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+::
+
+      declare i32 llvm.vscale.i32()
+      declare i64 llvm.vscale.i64()
+
+Overview:
+"""""""""
+
+The ``llvm.vscale`` intrinsic returns the value for ``vscale`` in scalable
+vectors such as ``<vscale x 16 x i8>``.
+
+Semantics:
+""""""""""
+
+``vscale`` is a positive value that is constant throughout program
+execution, but is unknown at compile time.
+If the result value does not fit in the result type, then the result is
+a :ref:`poison value <poisonvalues>`.
+
+
 Stack Map Intrinsics
 --------------------
diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h
--- a/llvm/include/llvm/CodeGen/ISDOpcodes.h
+++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h
@@ -921,6 +921,11 @@
   /// known nonzero constant. The only operand here is the chain.
   GET_DYNAMIC_AREA_OFFSET,
 
+  /// VSCALE(IMM) - Returns the runtime scaling factor used to calculate the
+  /// number of elements within a scalable vector. IMM is a constant integer
+  /// multiplier that is applied to the runtime value.
+  VSCALE,
+
   /// Generic reduction nodes. These nodes represent horizontal vector
   /// reduction operations, producing a scalar result.
   /// The STRICT variants perform reductions in sequential order. The first
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -914,6 +914,13 @@
     return getNode(ISD::UNDEF, SDLoc(), VT);
   }
 
+  /// Return a node that represents the runtime scaling 'MulImm * RuntimeVL'.
+  SDValue getVScale(const SDLoc &DL, EVT VT, APInt MulImm) {
+    assert(MulImm.getMinSignedBits() <= VT.getSizeInBits() &&
+           "Immediate does not fit VT");
+    return getNode(ISD::VSCALE, DL, VT, getConstant(MulImm, DL, VT));
+  }
+
   /// Return a GLOBAL_OFFSET_TABLE node. This does not have a useful SDLoc.
   SDValue getGLOBAL_OFFSET_TABLE(EVT VT) {
     return getNode(ISD::GLOBAL_OFFSET_TABLE, SDLoc(), VT);
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -1342,6 +1342,11 @@
                                [IntrNoMem, ImmArg<1>, ImmArg<2>]>;
 
+//===---------- Intrinsics to query properties of scalable vectors --------===//
+def int_vscale : Intrinsic<[llvm_anyint_ty], [], [IntrNoMem]>;
+
+//===----------------------------------------------------------------------===//
+
 //===----------------------------------------------------------------------===//
 // Target-specific intrinsics
 //===----------------------------------------------------------------------===//
diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -32,6 +32,7 @@
 #include "llvm/ADT/APInt.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
+#include "llvm/IR/DataLayout.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
@@ -2002,6 +2003,42 @@
   return ExtractValue_match<Ind, Val_t>(V);
 }
 
+/// Matches patterns for `vscale`. This can either be a call to `llvm.vscale` or
+/// the constant expression
+///  `ptrtoint(gep <vscale x 1 x i8>, <vscale x 1 x i8>* null, i32 1)`
+/// under the right conditions determined by DataLayout.
+struct VScaleVal_match {
+private:
+  template <typename Base, typename Offset>
+  inline BinaryOp_match<Base, Offset, Instruction::GetElementPtr>
+  m_OffsetGep(const Base &B, const Offset &O) {
+    return BinaryOp_match<Base, Offset, Instruction::GetElementPtr>(B, O);
+  }
+
+public:
+  const DataLayout &DL;
+  VScaleVal_match(const DataLayout &DL) : DL(DL) {}
+
+  template <typename ITy> bool match(ITy *V) {
+    if (m_Intrinsic<Intrinsic::vscale>().match(V))
+      return true;
+
+    if (m_PtrToInt(m_OffsetGep(m_Zero(), m_SpecificInt(1))).match(V)) {
+      Type *PtrTy = cast<Operator>(V)->getOperand(0)->getType();
+      Type *DerefTy = PtrTy->getPointerElementType();
+      if (DerefTy->isVectorTy() && DerefTy->getVectorIsScalable() &&
+          DL.getTypeAllocSizeInBits(DerefTy).getKnownMinSize() == 8)
+        return true;
+    }
+
+    return false;
+  }
+};
+
+inline VScaleVal_match m_VScale(const DataLayout &DL) {
+  return VScaleVal_match(DL);
+}
+
 } // end namespace PatternMatch
 } // end namespace llvm
diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td
--- a/llvm/include/llvm/Target/TargetSelectionDAG.td
+++ b/llvm/include/llvm/Target/TargetSelectionDAG.td
@@ -316,6 +316,7 @@
 def bb         : SDNode<"ISD::BasicBlock", SDTOther   , [], "BasicBlockSDNode">;
 def cond       : SDNode<"ISD::CONDCODE"  , SDTOther   , [], "CondCodeSDNode">;
 def undef      : SDNode<"ISD::UNDEF"     , SDTUNDEF   , []>;
+def vscale     : SDNode<"ISD::VSCALE"    , SDTIntUnaryOp, []>;
 def globaladdr : SDNode<"ISD::GlobalAddress",         SDTPtrLeaf, [],
                         "GlobalAddressSDNode">;
 def tglobaladdr : SDNode<"ISD::TargetGlobalAddress",  SDTPtrLeaf, [],
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -828,7 +828,8 @@
   Type *SrcElemTy = GEP->getSourceElementType();
   Type *ResElemTy = GEP->getResultElementType();
   Type *ResTy = GEP->getType();
-  if (!SrcElemTy->isSized())
+  if (!SrcElemTy->isSized() ||
+      (SrcElemTy->isVectorTy() && SrcElemTy->getVectorIsScalable()))
     return nullptr;
 
   if (Constant *C = CastGEPIndices(SrcElemTy, Ops, ResTy,
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -2010,6 +2010,22 @@
     return despeculateCountZeros(II, TLI, DL, ModifiedDT);
   case Intrinsic::dbg_value:
     return fixupDbgValue(II);
+  case Intrinsic::vscale: {
+    // If datalayout has no special restrictions on vector data layout,
+    // replace `llvm.vscale` by an equivalent constant expression
+    // to benefit from cheap constant propagation.
+    Type *ScalableVectorTy =
+        VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
+    if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
+      auto Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
+      auto One = ConstantInt::getSigned(II->getType(), 1);
+      auto *CGep =
+          ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
+      II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
+      II->eraseFromParent();
+      return true;
+    }
+  }
   }
 
   if (TLI) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -91,6 +91,7 @@
   case ISD::TRUNCATE:    Res = PromoteIntRes_TRUNCATE(N); break;
   case ISD::UNDEF:       Res = PromoteIntRes_UNDEF(N); break;
   case ISD::VAARG:       Res = PromoteIntRes_VAARG(N); break;
+  case ISD::VSCALE:      Res = PromoteIntRes_VSCALE(N); break;
 
   case ISD::EXTRACT_SUBVECTOR:
                          Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
@@ -1179,6 +1180,13 @@
                                N->getValueType(0)));
 }
 
+SDValue DAGTypeLegalizer::PromoteIntRes_VSCALE(SDNode *N) {
+  EVT VT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+  APInt MulImm = cast<ConstantSDNode>(N->getOperand(0))->getAPIntValue();
+  return DAG.getVScale(SDLoc(N), VT, MulImm.sextOrSelf(VT.getSizeInBits()));
+}
+
 SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
   SDValue Chain = N->getOperand(0); // Get the chain.
   SDValue Ptr = N->getOperand(1); // Get the pointer.
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -326,6 +326,7 @@
   SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_UNDEF(SDNode *N);
   SDValue PromoteIntRes_VAARG(SDNode *N);
+  SDValue PromoteIntRes_VSCALE(SDNode *N);
   SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
   SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
   SDValue PromoteIntRes_MULFIX(SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -5185,11 +5185,20 @@
     if (N2C && N2C->isNullValue())
      return N1;
     break;
+  case ISD::MUL:
+    assert(VT.isInteger() && "This operator does not apply to FP types!");
+    assert(N1.getValueType() == N2.getValueType() &&
+           N1.getValueType() == VT && "Binary operator types must match!");
+    if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+      APInt MulImm = cast<ConstantSDNode>(N1->getOperand(0))->getAPIntValue();
+      APInt N2CImm = N2C->getAPIntValue();
+      return getVScale(DL, VT, MulImm * N2CImm);
+    }
+    break;
   case ISD::UDIV:
   case ISD::UREM:
   case ISD::MULHU:
   case ISD::MULHS:
-  case ISD::MUL:
   case ISD::SDIV:
   case ISD::SREM:
   case ISD::SMIN:
@@ -5222,6 +5231,12 @@
            "Invalid FCOPYSIGN!");
     break;
   case ISD::SHL:
+    if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+      APInt MulImm = cast<ConstantSDNode>(N1->getOperand(0))->getAPIntValue();
+      APInt ShiftImm = N2C->getAPIntValue();
+      return getVScale(DL, VT, MulImm << ShiftImm);
+    }
+    LLVM_FALLTHROUGH;
   case ISD::SRA:
   case ISD::SRL:
     if (SDValue V = simplifyShift(N1, N2))
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -1482,6 +1482,9 @@
                            TLI.getPointerTy(DAG.getDataLayout(), AS));
   }
 
+  if (match(C, m_VScale(DAG.getDataLayout())))
+    return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1));
+
   if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
     return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);
 
@@ -5772,6 +5775,13 @@
     // By default, turn this into a target intrinsic node.
     visitTargetIntrinsic(I, Intrinsic);
     return;
+  case Intrinsic::vscale: {
+    match(&I, m_VScale(DAG.getDataLayout()));
+    EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+    setValue(&I,
+             DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)));
+    return;
+  }
   case Intrinsic::vastart:  visitVAStart(I); return;
   case Intrinsic::vaend:    visitVAEnd(I); return;
   case Intrinsic::vacopy:   visitVACopy(I); return;
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -170,6 +170,7 @@
   case ISD::CopyToReg:                  return "CopyToReg";
   case ISD::CopyFromReg:                return "CopyFromReg";
   case ISD::UNDEF:                      return "undef";
+  case ISD::VSCALE:                     return "vscale";
   case ISD::MERGE_VALUES:               return "merge_values";
   case ISD::INLINEASM:                  return "inlineasm";
   case ISD::INLINEASM_BR:               return "inlineasm_br";
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -62,6 +62,9 @@
                                     unsigned ConstraintID,
                                     std::vector<SDValue> &OutOps) override;
 
+  template <signed Low, signed High, signed Scale>
+  bool SelectRDVLImm(SDValue N, SDValue &Imm);
+
   bool tryMLAV64LaneV128(SDNode *N);
   bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
   bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
@@ -679,6 +682,23 @@
   return SDValue(Node, 0);
 }
 
+// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
+template <signed Low, signed High, signed Scale>
+bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
+  if (!isa<ConstantSDNode>(N))
+    return false;
+
+  int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
+  if ((MulImm % std::abs(Scale)) == 0) {
+    int64_t RDVLImm = MulImm / Scale;
+    if ((RDVLImm >= Low) && (RDVLImm <= High)) {
+      Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
+      return true;
+    }
+  }
+
+  return false;
+}
 
 /// SelectArithExtendedRegister - Select a "extended register" operand.  This
 /// operand folds in an extend followed by an optional left shift.
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -748,6 +748,7 @@
   SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+  SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
   SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -836,6 +836,9 @@
     }
   }
 
+  if (Subtarget->hasSVE())
+    setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
   setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
 }
 
@@ -3254,6 +3257,8 @@
     return LowerATOMIC_LOAD_AND(Op, DAG);
   case ISD::DYNAMIC_STACKALLOC:
     return LowerDYNAMIC_STACKALLOC(Op, DAG);
+  case ISD::VSCALE:
+    return LowerVSCALE(Op, DAG);
   }
 }
 
@@ -8641,6 +8646,17 @@
   return DAG.getMergeValues(Ops, dl);
 }
 
+SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
+                                           SelectionDAG &DAG) const {
+  EVT VT = Op.getValueType();
+  assert(VT != MVT::i64 && "Expected illegal VSCALE node");
+
+  SDLoc DL(Op);
+  APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
+  return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
+                            DL, VT);
+}
+
 /// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
 /// MemIntrinsicNodes.  The associated MachineMemOperands record the alignment
 /// specified in the intrinsic calls.
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -46,6 +46,17 @@
 def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1_gather_imm         : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 
+// SVE CNT/INC/RDVL
+def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
+def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
+def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
+def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;
+
+// SVE DEC
+def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
+def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
+def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
+
 def AArch64ld1s_gather        : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
 def AArch64ld1s_gather_uxtw   : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
@@ -1105,6 +1116,23 @@
   def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8),  (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
   def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8),  (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
 
+  // General case that we ideally never want to match.
+  def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
+
+  let AddedComplexity = 5 in {
+    def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
+    def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
+
+    def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
+    def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
+    def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
+    def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
+
+    def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
+    def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
+    def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
+  }
+
   def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
   def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
   def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
diff --git a/llvm/test/CodeGen/AArch64/sve-vscale.ll b/llvm/test/CodeGen/AArch64/sve-vscale.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-vscale.ll
@@ -0,0 +1,200 @@
+; RUN: llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 < %s | FileCheck %s
+; RUN: opt -codegenprepare -S < %s | llc -mtriple aarch64 -mattr=+sve -asm-verbose=0 | FileCheck %s
+
+;
+; RDVL
+;
+
+; CHECK-LABEL: rdvl_i8:
+; CHECK: rdvl x0, #1
+; CHECK-NEXT: ret
+define i8 @rdvl_i8() nounwind {
+  %vscale = call i8 @llvm.vscale.i8()
+  %1 = mul nsw i8 %vscale, 16
+  ret i8 %1
+}
+
+; CHECK-LABEL: rdvl_i16:
+; CHECK: rdvl x0, #1
+; CHECK-NEXT: ret
+define i16 @rdvl_i16() nounwind {
+  %vscale = call i16 @llvm.vscale.i16()
+  %1 = mul nsw i16 %vscale, 16
+  ret i16 %1
+}
+
+; CHECK-LABEL: rdvl_i32:
+; CHECK: rdvl x0, #1
+; CHECK-NEXT: ret
+define i32 @rdvl_i32() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 16
+  ret i32 %1
+}
+
+; CHECK-LABEL: rdvl_i64:
+; CHECK: rdvl x0, #1
+; CHECK-NEXT: ret
+define i64 @rdvl_i64() nounwind {
+  %vscale = call i64 @llvm.vscale.i64()
+  %1 = mul nsw i64 %vscale, 16
+  ret i64 %1
+}
+
+; CHECK-LABEL: rdvl_const:
+; CHECK: rdvl x0, #1
+; CHECK-NEXT: ret
+define i32 @rdvl_const() nounwind {
+  ret i32 mul nsw (i32 ptrtoint (<vscale x 1 x i8>* getelementptr (<vscale x 1 x i8>, <vscale x 1 x i8>* null, i64 1) to i32), i32 16)
+}
+
+define i32 @vscale_1() nounwind {
+; CHECK-LABEL: vscale_1:
+; CHECK: rdvl [[TMP:x[0-9]+]], #1
+; CHECK-NEXT: lsr x0, [[TMP]], #4
+; CHECK-NEXT: ret
+  %vscale = call i32 @llvm.vscale.i32()
+  ret i32 %vscale
+}
+
+define i32 @vscale_neg1() nounwind {
+; CHECK-LABEL: vscale_neg1:
+; CHECK: rdvl [[TMP:x[0-9]+]], #-1
+; CHECK-NEXT: asr x0, [[TMP]], #4
+; CHECK-NEXT: ret
+  %vscale = call i32 @llvm.vscale.i32()
+  %neg = mul nsw i32 -1, %vscale
+  ret i32 %neg
+}
+
+; CHECK-LABEL: rdvl_3:
+; CHECK: rdvl [[VL_B:x[0-9]+]], #1
+; CHECK-NEXT: lsr [[VL_Q:x[0-9]+]], [[VL_B]], #4
+; CHECK-NEXT: mov w[[MUL:[0-9]+]], #3
+; CHECK-NEXT: mul x0, [[VL_Q]], x[[MUL]]
+; CHECK-NEXT: ret
+define i32 @rdvl_3() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 3
+  ret i32 %1
+}
+
+
+; CHECK-LABEL: rdvl_min:
+; CHECK: rdvl x0, #-32
+; CHECK-NEXT: ret
+define i32 @rdvl_min() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, -512
+  ret i32 %1
+}
+
+; CHECK-LABEL: rdvl_max:
+; CHECK: rdvl x0, #31
+; CHECK-NEXT: ret
+define i32 @rdvl_max() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 496
+  ret i32 %1
+}
+
+;
+; CNTH
+;
+
+; CHECK-LABEL: cnth:
+; CHECK: cnth x0{{$}}
+; CHECK-NEXT: ret
+define i32 @cnth() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = shl nsw i32 %vscale, 3
+  ret i32 %1
+}
+
+; CHECK-LABEL: cnth_max:
+; CHECK: cnth x0, all, mul #15
+; CHECK-NEXT: ret
+define i32 @cnth_max() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 120
+  ret i32 %1
+}
+
+; CHECK-LABEL: cnth_neg:
+; CHECK: cnth [[CNT:x[0-9]+]]
+; CHECK: neg x0, [[CNT]]
+; CHECK-NEXT: ret
+define i32 @cnth_neg() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, -8
+  ret i32 %1
+}
+
+;
+; CNTW
+;
+
+; CHECK-LABEL: cntw:
+; CHECK: cntw x0{{$}}
+; CHECK-NEXT: ret
+define i32 @cntw() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = shl nsw i32 %vscale, 2
+  ret i32 %1
+}
+
+; CHECK-LABEL: cntw_max:
+; CHECK: cntw x0, all, mul #15
+; CHECK-NEXT: ret
+define i32 @cntw_max() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 60
+  ret i32 %1
+}
+
+; CHECK-LABEL: cntw_neg:
+; CHECK: cntw [[CNT:x[0-9]+]]
+; CHECK: neg x0, [[CNT]]
+; CHECK-NEXT: ret
+define i32 @cntw_neg() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, -4
+  ret i32 %1
+}
+
+;
+; CNTD
+;
+
+; CHECK-LABEL: cntd:
+; CHECK: cntd x0{{$}}
+; CHECK-NEXT: ret
+define i32 @cntd() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = shl nsw i32 %vscale, 1
+  ret i32 %1
+}
+
+; CHECK-LABEL: cntd_max:
+; CHECK: cntd x0, all, mul #15
+; CHECK-NEXT: ret
define i32 @cntd_max() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, 30
+  ret i32 %1
+}
+
+; CHECK-LABEL: cntd_neg:
+; CHECK: cntd [[CNT:x[0-9]+]]
+; CHECK: neg x0, [[CNT]]
+; CHECK-NEXT: ret
+define i32 @cntd_neg() nounwind {
+  %vscale = call i32 @llvm.vscale.i32()
+  %1 = mul nsw i32 %vscale, -2
+  ret i32 %1
+}
+
+declare i8 @llvm.vscale.i8()
+declare i16 @llvm.vscale.i16()
+declare i32 @llvm.vscale.i32()
+declare i64 @llvm.vscale.i64()
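
Note (illustrative, not part of the patch above): a minimal IR sketch of how the new intrinsic composes with scalable vector types. The element count of a type such as <vscale x 4 x i32> is 4 * vscale, so multiplying the intrinsic's result by the minimum element count yields the runtime length; the function name num_elements is purely hypothetical.

; Runtime number of elements in a <vscale x 4 x i32> vector, computed as
; 4 * vscale using the intrinsic introduced by this patch.
define i64 @num_elements() {
  %vscale = call i64 @llvm.vscale.i64()
  %nelts = mul nsw i64 %vscale, 4
  ret i64 %nelts
}

declare i64 @llvm.vscale.i64()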