diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -58,20 +58,7 @@
   bool supportsScalableVectors() const { return ST->hasVInstructions(); }
   Optional<unsigned> getMaxVScale() const;
 
-  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
-    switch (K) {
-    case TargetTransformInfo::RGK_Scalar:
-      return TypeSize::getFixed(ST->getXLen());
-    case TargetTransformInfo::RGK_FixedWidthVector:
-      return TypeSize::getFixed(
-          ST->hasVInstructions() ? ST->getMinRVVVectorSizeInBits() : 0);
-    case TargetTransformInfo::RGK_ScalableVector:
-      return TypeSize::getScalable(
-          ST->hasVInstructions() ? RISCV::RVVBitsPerBlock : 0);
-    }
-
-    llvm_unreachable("Unsupported register kind");
-  }
+  TypeSize getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const;
 
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -15,6 +15,13 @@
 
 #define DEBUG_TYPE "riscvtti"
 
+static cl::opt<unsigned> RVVRegisterWidthLMUL(
+    "riscv-v-register-bit-width-lmul",
+    cl::desc(
+        "The LMUL to use for getRegisterBitWidth queries. Affects LMUL used "
+        "by autovectorized code. Fractional LMULs are not supported."),
+    cl::init(1), cl::Hidden);
+
 InstructionCost RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
                                             TTI::TargetCostKind CostKind) {
   assert(Ty->isIntegerTy() &&
@@ -137,6 +144,24 @@
   return BaseT::getMaxVScale();
 }
 
+TypeSize
+RISCVTTIImpl::getRegisterBitWidth(TargetTransformInfo::RegisterKind K) const {
+  unsigned LMUL = PowerOf2Floor(
+      std::max<unsigned>(std::min<unsigned>(RVVRegisterWidthLMUL, 8), 1));
+  switch (K) {
+  case TargetTransformInfo::RGK_Scalar:
+    return TypeSize::getFixed(ST->getXLen());
+  case TargetTransformInfo::RGK_FixedWidthVector:
+    return TypeSize::getFixed(
+        ST->hasVInstructions() ? LMUL * ST->getMinRVVVectorSizeInBits() : 0);
+  case TargetTransformInfo::RGK_ScalableVector:
+    return TypeSize::getScalable(
+        ST->hasVInstructions() ? LMUL * RISCV::RVVBitsPerBlock : 0);
+  }
+
+  llvm_unreachable("Unsupported register kind");
+}
+
 InstructionCost RISCVTTIImpl::getGatherScatterOpCost(
     unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
     Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
--- a/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/riscv-unroll.ll
@@ -1,14 +1,125 @@
-; RUN: opt < %s -loop-vectorize -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s
-; RUN: opt < %s -loop-vectorize -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s --check-prefix=LMUL1
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -S | FileCheck %s --check-prefix=LMUL1
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv64 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2
+; RUN: opt < %s -loop-vectorize -force-target-max-vector-interleave=1 -mtriple=riscv32 -mattr=+experimental-v -riscv-v-vector-bits-min=128 -riscv-v-register-bit-width-lmul=2 -S | FileCheck %s --check-prefix=LMUL2
 
 ; Function Attrs: nounwind
 define i32* @array_add(i32* noalias nocapture readonly %a, i32* noalias nocapture readonly %b, i32* %c, i32 %size) {
-;CHECK-LABEL: array_add
-;CHECK: load <4 x i32>
-;CHECK: load <4 x i32>
-;CHECK: add nsw <4 x i32>
-;CHECK: store <4 x i32>
-;CHECK: ret
+; LMUL1-LABEL: @array_add(
+; LMUL1-NEXT:  entry:
+; LMUL1-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
+; LMUL1-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; LMUL1:       for.body.preheader:
+; LMUL1-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; LMUL1-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL1-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; LMUL1-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 4
+; LMUL1-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; LMUL1:       vector.ph:
+; LMUL1-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4
+; LMUL1-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; LMUL1-NEXT:    br label [[VECTOR_BODY:%.*]]
+; LMUL1:       vector.body:
+; LMUL1-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LMUL1-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; LMUL1-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; LMUL1-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
+; LMUL1-NEXT:    [[WIDE_LOAD:%.*]] = load <4 x i32>, <4 x i32>* [[TMP6]], align 4
+; LMUL1-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; LMUL1-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <4 x i32>*
+; LMUL1-NEXT:    [[WIDE_LOAD1:%.*]] = load <4 x i32>, <4 x i32>* [[TMP9]], align 4
+; LMUL1-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL1-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
+; LMUL1-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; LMUL1-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
+; LMUL1-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
+; LMUL1-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
+; LMUL1-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL1-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL1:       middle.block:
+; LMUL1-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; LMUL1-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; LMUL1:       scalar.ph:
+; LMUL1-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; LMUL1-NEXT:    br label [[FOR_BODY:%.*]]
+; LMUL1:       for.body:
+; LMUL1-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; LMUL1-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; LMUL1-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; LMUL1-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; LMUL1-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; LMUL1-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL1-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; LMUL1-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; LMUL1-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
+; LMUL1-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL1:       for.end.loopexit:
+; LMUL1-NEXT:    br label [[FOR_END]]
+; LMUL1:       for.end:
+; LMUL1-NEXT:    ret i32* [[C]]
+;
+; LMUL2-LABEL: @array_add(
+; LMUL2-NEXT:  entry:
+; LMUL2-NEXT:    [[CMP10:%.*]] = icmp sgt i32 [[SIZE:%.*]], 0
+; LMUL2-NEXT:    br i1 [[CMP10]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_END:%.*]]
+; LMUL2:       for.body.preheader:
+; LMUL2-NEXT:    [[TMP0:%.*]] = add i32 [[SIZE]], -1
+; LMUL2-NEXT:    [[TMP1:%.*]] = zext i32 [[TMP0]] to i64
+; LMUL2-NEXT:    [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
+; LMUL2-NEXT:    [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], 8
+; LMUL2-NEXT:    br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
+; LMUL2:       vector.ph:
+; LMUL2-NEXT:    [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 8
+; LMUL2-NEXT:    [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]]
+; LMUL2-NEXT:    br label [[VECTOR_BODY:%.*]]
+; LMUL2:       vector.body:
+; LMUL2-NEXT:    [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
+; LMUL2-NEXT:    [[TMP3:%.*]] = add i64 [[INDEX]], 0
+; LMUL2-NEXT:    [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[A:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[TMP4]], i32 0
+; LMUL2-NEXT:    [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <8 x i32>*
+; LMUL2-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i32>, <8 x i32>* [[TMP6]], align 4
+; LMUL2-NEXT:    [[TMP7:%.*]] = getelementptr inbounds i32, i32* [[B:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP8:%.*]] = getelementptr inbounds i32, i32* [[TMP7]], i32 0
+; LMUL2-NEXT:    [[TMP9:%.*]] = bitcast i32* [[TMP8]] to <8 x i32>*
+; LMUL2-NEXT:    [[WIDE_LOAD1:%.*]] = load <8 x i32>, <8 x i32>* [[TMP9]], align 4
+; LMUL2-NEXT:    [[TMP10:%.*]] = add nsw <8 x i32> [[WIDE_LOAD1]], [[WIDE_LOAD]]
+; LMUL2-NEXT:    [[TMP11:%.*]] = getelementptr inbounds i32, i32* [[C:%.*]], i64 [[TMP3]]
+; LMUL2-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP11]], i32 0
+; LMUL2-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <8 x i32>*
+; LMUL2-NEXT:    store <8 x i32> [[TMP10]], <8 x i32>* [[TMP13]], align 4
+; LMUL2-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
+; LMUL2-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; LMUL2-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; LMUL2:       middle.block:
+; LMUL2-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[TMP2]], [[N_VEC]]
+; LMUL2-NEXT:    br i1 [[CMP_N]], label [[FOR_END_LOOPEXIT:%.*]], label [[SCALAR_PH]]
+; LMUL2:       scalar.ph:
+; LMUL2-NEXT:    [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[FOR_BODY_PREHEADER]] ]
+; LMUL2-NEXT:    br label [[FOR_BODY:%.*]]
+; LMUL2:       for.body:
+; LMUL2-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ]
+; LMUL2-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i32, i32* [[A]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    [[TMP15:%.*]] = load i32, i32* [[ARRAYIDX]], align 4
+; LMUL2-NEXT:    [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, i32* [[B]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    [[TMP16:%.*]] = load i32, i32* [[ARRAYIDX2]], align 4
+; LMUL2-NEXT:    [[ADD:%.*]] = add nsw i32 [[TMP16]], [[TMP15]]
+; LMUL2-NEXT:    [[ARRAYIDX4:%.*]] = getelementptr inbounds i32, i32* [[C]], i64 [[INDVARS_IV]]
+; LMUL2-NEXT:    store i32 [[ADD]], i32* [[ARRAYIDX4]], align 4
+; LMUL2-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
+; LMUL2-NEXT:    [[LFTR_WIDEIV:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32
+; LMUL2-NEXT:    [[EXITCOND:%.*]] = icmp eq i32 [[LFTR_WIDEIV]], [[SIZE]]
+; LMUL2-NEXT:    br i1 [[EXITCOND]], label [[FOR_END_LOOPEXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP2:![0-9]+]]
+; LMUL2:       for.end.loopexit:
+; LMUL2-NEXT:    br label [[FOR_END]]
+; LMUL2:       for.end:
+; LMUL2-NEXT:    ret i32* [[C]]
+;
 entry:
   %cmp10 = icmp sgt i32 %size, 0
   br i1 %cmp10, label %for.body.preheader, label %for.end