diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -61,6 +61,7 @@
   unsigned XLen = 32;
   MVT XLenVT = MVT::i32;
   uint8_t MaxInterleaveFactor = 2;
+  unsigned VScaleForTuning = 2;
   RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
   BitVector UserReservedRegister;
   RISCVFrameLowering FrameLowering;
@@ -162,6 +163,7 @@
   // implied by the architecture.
   unsigned getMaxRVVVectorSizeInBits() const;
   unsigned getMinRVVVectorSizeInBits() const;
+  unsigned getVScaleForTuning() const { return VScaleForTuning; }
   unsigned getMaxLMULForFixedLengthVectors() const;
   unsigned getMaxELENForFixedLengthVectors() const;
   bool useRVVForFixedLengthVectors() const;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -73,6 +73,11 @@
     llvm_unreachable("Unsupported register kind");
   }
 
+  /// Reports the subtarget's VScaleForTuning value to TTI clients.
+  Optional<unsigned> getVScaleForTuning() const {
+    return ST->getVScaleForTuning();
+  }
+
   void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                TTI::UnrollingPreferences &UP,
                                OptimizationRemarkEmitter *ORE);
@@ -90,6 +95,11 @@
                                          TTI::TargetCostKind CostKind,
                                          const Instruction *I);
 
+  InstructionCost getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+                                   TTI::CastContextHint CCH,
+                                   TTI::TargetCostKind CostKind,
+                                   const Instruction *I = nullptr);
+
   bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
     if (!ST->hasVInstructions())
       return false;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -163,6 +163,50 @@
   return NumLoads * MemOpCost;
 }
 
+/// Cost of casting \p Src to \p Dst under \p CostKind.
+///
+/// A zero-extend between simple scalable integer vector types with equal
+/// element counts is reported as free. Every other cast takes the BaseT cost,
+/// collapsed to 0 or 1 when \p CostKind is not TCK_RecipThroughput.
+InstructionCost RISCVTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst,
+                                               Type *Src,
+                                               TTI::CastContextHint CCH,
+                                               TTI::TargetCostKind CostKind,
+                                               const Instruction *I) {
+  int ISD = TLI->InstructionOpcodeToISD(Opcode);
+  assert(ISD && "Invalid opcode");
+
+  // TODO: Allow non-throughput costs that aren't binary.
+  auto AdjustCost = [&CostKind](InstructionCost Cost) -> InstructionCost {
+    if (CostKind != TTI::TCK_RecipThroughput)
+      return Cost == 0 ? 0 : 1;
+    return Cost;
+  };
+
+  EVT SrcTy = TLI->getValueType(DL, Src);
+  EVT DstTy = TLI->getValueType(DL, Dst);
+
+  // Only simple value types are special-cased below; defer otherwise.
+  if (!SrcTy.isSimple() || !DstTy.isSimple())
+    return AdjustCost(
+        BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+
+  // Integer zero-extends between scalable vectors of equal element count are
+  // modelled as free (cost 0 for every cost kind).
+  if (ISD == ISD::ZERO_EXTEND && SrcTy.isScalableVector() &&
+      DstTy.isScalableVector()) {
+    if (SrcTy.isInteger() && DstTy.isInteger() &&
+        SrcTy.getVectorElementCount() == DstTy.getVectorElementCount()) {
+      if (SrcTy.getScalarSizeInBits() <= DstTy.getScalarSizeInBits()) {
+        return 0;
+      }
+    }
+  }
+
+  return AdjustCost(
+      BaseT::getCastInstrCost(Opcode, Dst, Src, CCH, CostKind, I));
+}
+
 void RISCVTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
                                            TTI::UnrollingPreferences &UP,
                                            OptimizationRemarkEmitter *ORE) {
diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-vectorization.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/RISCV/scalable-vectorization.ll
@@ -0,0 +1,166 @@
+; REQUIRES: asserts
+; RUN: opt -mtriple=riscv64-unknown-elf -mattr=+experimental-v -riscv-v-vector-bits-min=128 -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -scalable-vectorization=off < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_DISABLED
+; RUN: opt -mtriple=riscv64-unknown-elf -mattr=+experimental-v -riscv-v-vector-bits-min=128 -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -scalable-vectorization=on < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON
+; RUN: opt -mtriple=riscv64-unknown-elf -mattr=+experimental-v -riscv-v-vector-bits-min=128 -force-target-instruction-cost=1 -loop-vectorize -S -debug-only=loop-vectorize -vectorizer-maximize-bandwidth -scalable-vectorization=on < %s 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK_SCALABLE_ON_MAXBW
+
+; Test that the MaxVF for the following loop, that has no dependence distances,
+; is calculated as vscale x 2 (max legal RVV vector size) or vscale x 8
+; (maximized bandwidth for i8 in the loop).
+define void @test0(i32* %a, i8* %b, i32* %c) #0 {
+; CHECK: LV: Checking a loop in "test0"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 8
+; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: vscale x 8
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %c, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32
+  %add = add nsw i32 %zext, %0
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %iv
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 64 elements, is calculated as (maxvscale = 16) * 4.
+define void @test1(i32* %a, i8* %b) #0 {
+; CHECK: LV: Checking a loop in "test1"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32 ; widen the i8 element from %b to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 64 ; store index runs 64 elements ahead of the load from %a
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 32 elements, is calculated as (maxvscale = 16) * 2.
+define void @test2(i32* %a, i8* %b) #0 {
+; CHECK: LV: Checking a loop in "test2"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 2
+; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 2
+; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32 ; widen the i8 element from %b to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 32 ; store index runs 32 elements ahead of the load from %a
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test that the MaxVF for the following loop, with a dependence distance
+; of 16 elements, is calculated as (maxvscale = 16) * 1.
+define void @test3(i32* %a, i8* %b) #0 {
+; CHECK: LV: Checking a loop in "test3"
+; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 1
+; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 16
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i8, i8* %b, i64 %iv
+  %1 = load i8, i8* %arrayidx2, align 4
+  %zext = zext i8 %1 to i32 ; widen the i8 element from %b to i32
+  %add = add nsw i32 %zext, %0
+  %2 = add nuw nsw i64 %iv, 16 ; store index runs 16 elements ahead of the load from %a
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+; Test the fallback mechanism when scalable vectors are not feasible due
+; to e.g. dependence distance.
+define void @test4(i32* %a, i32* %b) #0 {
+; CHECK: LV: Checking a loop in "test4"
+; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON: LV: Selecting VF: 4
+; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 4
+; CHECK_SCALABLE_ON_MAXBW-NOT: LV: Found feasible scalable VF
+; CHECK_SCALABLE_ON_MAXBW: LV: Selecting VF: 4
+entry:
+  br label %loop
+
+loop:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop ]
+  %arrayidx = getelementptr inbounds i32, i32* %a, i64 %iv
+  %0 = load i32, i32* %arrayidx, align 4
+  %arrayidx2 = getelementptr inbounds i32, i32* %b, i64 %iv
+  %1 = load i32, i32* %arrayidx2, align 4
+  %add = add nsw i32 %1, %0
+  %2 = add nuw nsw i64 %iv, 8 ; store index runs 8 elements ahead of the load from %a
+  %arrayidx5 = getelementptr inbounds i32, i32* %a, i64 %2
+  store i32 %add, i32* %arrayidx5, align 4
+  %iv.next = add nuw nsw i64 %iv, 1
+  %exitcond.not = icmp eq i64 %iv.next, 1024
+  br i1 %exitcond.not, label %exit, label %loop
+
+exit:
+  ret void
+}
+
+attributes #0 = { vscale_range(1, 16) } ; every function here uses vscale bounds 1 (min) and 16 (max)