diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -113,6 +113,9 @@
   // Set up the register classes.
   addRegisterClass(XLenVT, &RISCV::GPRRegClass);
 
+  if (Subtarget.is64Bit())
+    addRegisterClass(MVT::v8i8, &RISCV::GPRRegClass);
+  addRegisterClass(MVT::v4i8, &RISCV::GPRRegClass);
 
   if (Subtarget.hasStdExtZfhOrZfhmin())
     addRegisterClass(MVT::f16, &RISCV::FPR16RegClass);
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -1682,6 +1682,7 @@
 def : LdPat;
 def : LdPat;
 def : LdPat, Requires<[IsRV32]>;
+def : LdPat, Requires<[IsRV32]>;
 def : LdPat;
 def : LdPat;
@@ -1696,6 +1697,7 @@
 def : StPat;
 def : StPat;
 def : StPat, Requires<[IsRV32]>;
+def : StPat, Requires<[IsRV32]>;
 
 /// Fences
@@ -1875,11 +1877,15 @@
 def : LdPat;
 def : LdPat;
 def : LdPat;
+def : LdPat;
+def : LdPat;
 
 /// Stores
 def : StPat;
 def : StPat;
+def : StPat;
+def : StPat;
 } // Predicates = [IsRV64]
 
 /// readcyclecounter
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -123,7 +123,7 @@
                       [RegInfo<32,32,32>, RegInfo<64,64,64>]>;
 
 class GPRRegisterClass<dag regList>
-    : RegisterClass<"RISCV", [XLenVT, XLenFVT, i32], 32, regList> {
+    : RegisterClass<"RISCV", [XLenVT, XLenFVT, i32, v4i8, v8i8], 32, regList> {
   let RegInfos = XLenRI;
 }
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -176,7 +176,7 @@
   bool hasVInstructionsAnyF() const { return hasVInstructionsF32(); }
   bool hasVInstructionsFullMultiply() const { return HasStdExtV; }
   unsigned getMaxInterleaveFactor() const {
-    return hasVInstructions() ? MaxInterleaveFactor : 1;
+    return MaxInterleaveFactor;
   }
 
   // Returns VLEN divided by DLEN. Where DLEN is the datapath width of the
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -329,7 +329,7 @@
   unsigned getRegisterClassForType(bool Vector, Type *Ty = nullptr) const {
     if (Vector)
-      return RISCVRegisterClass::VRRC;
+      return ST->hasVInstructions() ? RISCVRegisterClass::VRRC : RISCVRegisterClass::GPRRC;
     if (!Ty)
       return RISCVRegisterClass::GPRRC;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -231,8 +231,9 @@
   case TargetTransformInfo::RGK_Scalar:
     return TypeSize::getFixed(ST->getXLen());
   case TargetTransformInfo::RGK_FixedWidthVector:
-    return TypeSize::getFixed(
-        ST->useRVVForFixedLengthVectors() ? LMUL * ST->getRealMinVLen() : 0);
+    return TypeSize::getFixed(ST->useRVVForFixedLengthVectors()
+                                  ? LMUL * ST->getRealMinVLen()
+                                  : ST->getXLen());
   case TargetTransformInfo::RGK_ScalableVector:
     return TypeSize::getScalable(
         (ST->hasVInstructions() &&
@@ -1338,6 +1339,10 @@
   InstructionCost BaseCost = BaseT::getMemoryOpCost(Opcode, Src, Alignment,
                                                     AddressSpace, CostKind,
                                                     OpInfo, I);
+
+  if (!ST->hasVInstructions())
+    return BaseCost;
+
   // Assume memory ops cost scale with the number of vector registers
   // possible accessed by the instruction. Note that BasicTTI already
   // handles the LT.first term for us.
diff --git a/llvm/test/CodeGen/RISCV/vectorization-scalar-register/memcpy.ll b/llvm/test/CodeGen/RISCV/vectorization-scalar-register/memcpy.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/vectorization-scalar-register/memcpy.ll
@@ -0,0 +1,165 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -O2 -mtriple=riscv32 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV32-NONVEC %s
+; RUN: llc -O2 -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefixes=RV64-NONVEC %s
+; RUN: opt -O2 -mtriple=riscv32 %s | llc -O2 -mtriple=riscv32 -verify-machineinstrs \
+; RUN:   | FileCheck -check-prefixes=RV32-VEC %s
+; RUN: opt -O2 -mtriple=riscv64 %s | llc -O2 -mtriple=riscv64 -verify-machineinstrs \
+; RUN:   | FileCheck -check-prefixes=RV64-VEC %s
+
+; C code:
+; void foo(char *restrict a, char *restrict b, int n) {
+;   for (int i = 0; i < n; i++)
+;     a[i] = b[i];
+; }
+
+define void @foo(ptr noalias nocapture noundef writeonly %a, ptr noalias nocapture noundef readonly %b, i32 noundef signext %n) #0 {
+; RV32-NONVEC-LABEL: foo:
+; RV32-NONVEC:       # %bb.0: # %entry
+; RV32-NONVEC-NEXT:    blez a2, .LBB0_3
+; RV32-NONVEC-NEXT:  # %bb.1: # %for.body.preheader
+; RV32-NONVEC-NEXT:    li a3, 0
+; RV32-NONVEC-NEXT:    li a4, 0
+; RV32-NONVEC-NEXT:  .LBB0_2: # %for.body
+; RV32-NONVEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-NONVEC-NEXT:    add a5, a1, a3
+; RV32-NONVEC-NEXT:    lbu a5, 0(a5)
+; RV32-NONVEC-NEXT:    add a6, a0, a3
+; RV32-NONVEC-NEXT:    addi a3, a3, 1
+; RV32-NONVEC-NEXT:    seqz a7, a3
+; RV32-NONVEC-NEXT:    add a4, a4, a7
+; RV32-NONVEC-NEXT:    xor a7, a3, a2
+; RV32-NONVEC-NEXT:    or a7, a7, a4
+; RV32-NONVEC-NEXT:    sb a5, 0(a6)
+; RV32-NONVEC-NEXT:    bnez a7, .LBB0_2
+; RV32-NONVEC-NEXT:  .LBB0_3: # %for.cond.cleanup
+; RV32-NONVEC-NEXT:    ret
+;
+; RV64-NONVEC-LABEL: foo:
+; RV64-NONVEC:       # %bb.0: # %entry
+; RV64-NONVEC-NEXT:    blez a2, .LBB0_2
+; RV64-NONVEC-NEXT:  .LBB0_1: # %for.body
+; RV64-NONVEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-NONVEC-NEXT:    lbu a3, 0(a1)
+; RV64-NONVEC-NEXT:    sb a3, 0(a0)
+; RV64-NONVEC-NEXT:    addi a2, a2, -1
+; RV64-NONVEC-NEXT:    addi a0, a0, 1
+; RV64-NONVEC-NEXT:    addi a1, a1, 1
+; RV64-NONVEC-NEXT:    bnez a2, .LBB0_1
+; RV64-NONVEC-NEXT:  .LBB0_2: # %for.cond.cleanup
+; RV64-NONVEC-NEXT:    ret
+;
+; RV32-VEC-LABEL: foo:
+; RV32-VEC:       # %bb.0: # %entry
+; RV32-VEC-NEXT:    blez a2, .LBB0_7
+; RV32-VEC-NEXT:  # %bb.1: # %for.body.preheader
+; RV32-VEC-NEXT:    li a3, 8
+; RV32-VEC-NEXT:    bgeu a2, a3, .LBB0_3
+; RV32-VEC-NEXT:  # %bb.2:
+; RV32-VEC-NEXT:    li a3, 0
+; RV32-VEC-NEXT:    li a4, 0
+; RV32-VEC-NEXT:    j .LBB0_6
+; RV32-VEC-NEXT:  .LBB0_3: # %vector.ph
+; RV32-VEC-NEXT:    li a4, 0
+; RV32-VEC-NEXT:    andi a3, a2, -8
+; RV32-VEC-NEXT:    li a6, 0
+; RV32-VEC-NEXT:    li a5, 0
+; RV32-VEC-NEXT:  .LBB0_4: # %vector.body
+; RV32-VEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-VEC-NEXT:    add a7, a1, a6
+; RV32-VEC-NEXT:    lw t0, 0(a7)
+; RV32-VEC-NEXT:    lw a7, 4(a7)
+; RV32-VEC-NEXT:    add t1, a0, a6
+; RV32-VEC-NEXT:    sw t0, 0(t1)
+; RV32-VEC-NEXT:    addi t0, a6, 8
+; RV32-VEC-NEXT:    sltu a6, t0, a6
+; RV32-VEC-NEXT:    add a5, a5, a6
+; RV32-VEC-NEXT:    xor a6, t0, a3
+; RV32-VEC-NEXT:    or t2, a6, a5
+; RV32-VEC-NEXT:    sw a7, 4(t1)
+; RV32-VEC-NEXT:    mv a6, t0
+; RV32-VEC-NEXT:    bnez t2, .LBB0_4
+; RV32-VEC-NEXT:  # %bb.5: # %middle.block
+; RV32-VEC-NEXT:    beq a3, a2, .LBB0_7
+; RV32-VEC-NEXT:  .LBB0_6: # %for.body
+; RV32-VEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV32-VEC-NEXT:    add a5, a1, a3
+; RV32-VEC-NEXT:    lbu a5, 0(a5)
+; RV32-VEC-NEXT:    add a6, a0, a3
+; RV32-VEC-NEXT:    addi a3, a3, 1
+; RV32-VEC-NEXT:    seqz a7, a3
+; RV32-VEC-NEXT:    add a4, a4, a7
+; RV32-VEC-NEXT:    xor a7, a3, a2
+; RV32-VEC-NEXT:    or a7, a7, a4
+; RV32-VEC-NEXT:    sb a5, 0(a6)
+; RV32-VEC-NEXT:    bnez a7, .LBB0_6
+; RV32-VEC-NEXT:  .LBB0_7: # %for.cond.cleanup
+; RV32-VEC-NEXT:    ret
+;
+; RV64-VEC-LABEL: foo:
+; RV64-VEC:       # %bb.0: # %entry
+; RV64-VEC-NEXT:    blez a2, .LBB0_8
+; RV64-VEC-NEXT:  # %bb.1: # %for.body.preheader
+; RV64-VEC-NEXT:    li a3, 16
+; RV64-VEC-NEXT:    bgeu a2, a3, .LBB0_3
+; RV64-VEC-NEXT:  # %bb.2:
+; RV64-VEC-NEXT:    li a3, 0
+; RV64-VEC-NEXT:    j .LBB0_6
+; RV64-VEC-NEXT:  .LBB0_3: # %vector.ph
+; RV64-VEC-NEXT:    andi a3, a2, -16
+; RV64-VEC-NEXT:    addi a4, a1, 8
+; RV64-VEC-NEXT:    addi a5, a0, 8
+; RV64-VEC-NEXT:    mv a6, a3
+; RV64-VEC-NEXT:  .LBB0_4: # %vector.body
+; RV64-VEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-VEC-NEXT:    ld a7, -8(a4)
+; RV64-VEC-NEXT:    ld t0, 0(a4)
+; RV64-VEC-NEXT:    sd a7, -8(a5)
+; RV64-VEC-NEXT:    sd t0, 0(a5)
+; RV64-VEC-NEXT:    addi a4, a4, 16
+; RV64-VEC-NEXT:    addi a6, a6, -16
+; RV64-VEC-NEXT:    addi a5, a5, 16
+; RV64-VEC-NEXT:    bnez a6, .LBB0_4
+; RV64-VEC-NEXT:  # %bb.5: # %middle.block
+; RV64-VEC-NEXT:    beq a3, a2, .LBB0_8
+; RV64-VEC-NEXT:  .LBB0_6: # %for.body.preheader2
+; RV64-VEC-NEXT:    sub a2, a2, a3
+; RV64-VEC-NEXT:    add a0, a0, a3
+; RV64-VEC-NEXT:    add a1, a1, a3
+; RV64-VEC-NEXT:  .LBB0_7: # %for.body
+; RV64-VEC-NEXT:    # =>This Inner Loop Header: Depth=1
+; RV64-VEC-NEXT:    lbu a3, 0(a1)
+; RV64-VEC-NEXT:    sb a3, 0(a0)
+; RV64-VEC-NEXT:    addi a2, a2, -1
+; RV64-VEC-NEXT:    addi a0, a0, 1
+; RV64-VEC-NEXT:    addi a1, a1, 1
+; RV64-VEC-NEXT:    bnez a2, .LBB0_7
+; RV64-VEC-NEXT:  .LBB0_8: # %for.cond.cleanup
+; RV64-VEC-NEXT:    ret
+entry:
+  %cmp6 = icmp sgt i32 %n, 0
+  br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup
+
+for.body.preheader:                               ; preds = %entry
+  %wide.trip.count = zext i32 %n to i64
+  br label %for.body
+
+for.cond.cleanup.loopexit:                        ; preds = %for.body
+  br label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond.cleanup.loopexit, %entry
+  ret void
+
+for.body:                                         ; preds = %for.body.preheader, %for.body
+  %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ]
+  %arrayidx = getelementptr inbounds i8, ptr %b, i64 %indvars.iv
+  %0 = load i8, ptr %arrayidx, align 1
+  %arrayidx2 = getelementptr inbounds i8, ptr %a, i64 %indvars.iv
+  store i8 %0, ptr %arrayidx2, align 1
+  %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+  %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+  br i1 %exitcond.not, label %for.cond.cleanup.loopexit, label %for.body
+}
+
+attributes #0 = { "no-builtins" }
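
Not part of the patch itself, but a rough sketch of the IR shape these changes are aimed at: once v4i8/v8i8 are legal GPR types and the TTI hooks report an XLen-wide fixed vector register, the loop vectorizer is expected to widen the byte-copy loop into whole-register vector loads and stores, and the new LdPat/StPat entries then select them as plain lw/sw (RV32) or ld/sd (RV64), which is what the vector.body checks above reflect. The function below is illustrative only; its name and values are made up and it is not part of memcpy.ll.

; Illustrative sketch: a v8i8 whole-register copy that the added patterns
; should be able to select as a single ld/sd pair on RV64 without the V
; extension (v4i8 plays the same role with lw/sw on RV32).
define void @copy8(ptr %dst, ptr %src) {
entry:
  %v = load <8 x i8>, ptr %src, align 1
  store <8 x i8> %v, ptr %dst, align 1
  ret void
}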