diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -702,6 +702,9 @@
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
       unsigned *Fast = nullptr) const override;
 
+  EVT getOptimalMemOpType(const MemOp &Op,
+                          const AttributeList &FuncAttributes) const override;
+
   bool splitValueIntoRegisterParts(
       SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
       unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -17101,6 +17101,49 @@
   return Subtarget.enableUnalignedVectorMem();
 }
 
+
+EVT RISCVTargetLowering::getOptimalMemOpType(const MemOp &Op,
+                                             const AttributeList &FuncAttributes) const {
+  if (!Subtarget.hasVInstructions())
+    return MVT::Other;
+
+  if (FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat))
+    return MVT::Other;
+
+  // We use LMUL1 memory operations here for a non-obvious reason. Our caller
+  // has an expansion threshold, and we want the number of hardware memory
+  // operations to correspond roughly to that threshold. LMUL>1 operations
+  // are typically expanded linearly internally, and thus correspond to more
+  // than one actual memory operation. Note that store merging and load
+  // combining will typically form larger LMUL operations from the LMUL1
+  // operations emitted here, and that's okay because combining isn't
+  // introducing new memory operations; it's just merging existing ones.
+  const unsigned MinVLenInBytes = Subtarget.getRealMinVLen() / 8;
+  if (Op.size() < MinVLenInBytes)
+    // TODO: Figure out short memops. For the moment, do the default thing
+    // which ends up using scalar sequences.
+    return MVT::Other;
+
+  // Prefer i8 for non-zero memset as it allows us to avoid materializing
+  // a large scalar constant and instead use vmv.v.x/i to do the
+  // broadcast. For everything else, prefer ELenVT to minimize VL and thus
+  // maximize the chance we can encode the size in the vsetvli.
+  MVT ELenVT = MVT::getIntegerVT(Subtarget.getELEN());
+  MVT PreferredVT = (Op.isMemset() && !Op.isZeroMemset()) ? MVT::i8 : ELenVT;
+
+  // Do we have sufficient alignment for our preferred VT? If not, revert
+  // to the largest size allowed by our alignment criteria.
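+  // Illustrative example (not taken from the tests verbatim; assumes the
+  // default minimum VLEN of 128, i.e. MinVLenInBytes == 16): when unaligned
+  // vector accesses are not enabled, a 16-byte memcpy whose pointers are only
+  // known to be 4-byte aligned falls back to i32 here and is emitted as a
+  // v4i32 load/store pair, matching memcpy16_align4 in memcpy-inline.ll below.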
+ if (PreferredVT != MVT::i8 && !Subtarget.enableUnalignedVectorMem()) { + Align RequiredAlign(PreferredVT.getStoreSize()); + if (Op.isFixedDstAlign()) + RequiredAlign = std::min(RequiredAlign, Op.getDstAlign()); + if (Op.isMemcpy()) + RequiredAlign = std::min(RequiredAlign, Op.getSrcAlign()); + PreferredVT = MVT::getIntegerVT(RequiredAlign.value() * 8); + } + return MVT::getVectorVT(PreferredVT, MinVLenInBytes/PreferredVT.getStoreSize()); +} + bool RISCVTargetLowering::splitValueIntoRegisterParts( SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts, unsigned NumParts, MVT PartVT, std::optional CC) const { diff --git a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll --- a/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll @@ -290,9 +290,9 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy16: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy16: @@ -309,77 +309,46 @@ define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: unaligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) ; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV32-NEXT: vle8.v v8, (a1) ; RV32-NEXT: vse8.v v8, (a0) -; RV32-NEXT: addi a2, a1, 28 -; RV32-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV32-NEXT: vle8.v v8, (a2) -; RV32-NEXT: addi a2, a0, 28 -; RV32-NEXT: vse8.v v8, (a2) -; RV32-NEXT: addi a2, a1, 24 -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vle8.v v8, (a2) -; RV32-NEXT: addi a2, a0, 24 -; RV32-NEXT: vse8.v v8, (a2) -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV32-NEXT: addi a1, a1, 15 ; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 16 +; RV32-NEXT: addi a0, a0, 15 ; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) ; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma ; RV64-NEXT: vle8.v v8, (a1) ; RV64-NEXT: vse8.v v8, (a0) -; RV64-NEXT: addi a2, a1, 28 -; RV64-NEXT: vsetivli zero, 2, e8, mf8, ta, ma -; RV64-NEXT: vle8.v v8, (a2) -; RV64-NEXT: addi a2, a0, 28 -; RV64-NEXT: vse8.v v8, (a2) -; RV64-NEXT: addi a2, a1, 24 -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vle8.v v8, (a2) -; RV64-NEXT: addi a2, a0, 24 -; RV64-NEXT: vse8.v v8, (a2) -; RV64-NEXT: addi a1, a1, 16 -; RV64-NEXT: vsetivli zero, 8, e8, mf2, ta, ma +; RV64-NEXT: addi a1, a1, 15 ; RV64-NEXT: vle8.v v8, (a1) -; RV64-NEXT: addi a0, a0, 16 +; RV64-NEXT: addi a0, a0, 15 ; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) -; RV32-FAST-NEXT: addi a1, a1, 16 -; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 16 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; 
RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a1, a1, 15 +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: addi a0, a0, 15 +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a1, a1, 15 +; RV64-FAST-NEXT: vle64.v v8, (a1) +; RV64-FAST-NEXT: addi a0, a0, 15 +; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false) @@ -405,9 +374,9 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy32: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy32: @@ -440,9 +409,9 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy64: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy64: @@ -487,14 +456,14 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy96: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: addi a1, a1, 64 -; RV32-FAST-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: addi a0, a0, 64 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy96: @@ -532,10 +501,9 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy128: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a2, 32 -; RV32-FAST-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy128: @@ -562,11 +530,14 @@ ; RV32-NEXT: vle8.v v8, (a2) ; RV32-NEXT: addi a2, a0, 128 ; RV32-NEXT: vse8.v v8, (a2) -; RV32-NEXT: addi a1, a1, 192 -; RV32-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV32-NEXT: vle8.v v8, (a1) -; RV32-NEXT: addi a0, a0, 192 -; RV32-NEXT: vse8.v v8, (a0) +; RV32-NEXT: lbu a2, 195(a1) +; RV32-NEXT: sb a2, 195(a0) +; RV32-NEXT: lbu a2, 194(a1) +; RV32-NEXT: sb a2, 194(a0) +; RV32-NEXT: lbu a2, 193(a1) +; RV32-NEXT: sb a2, 193(a0) +; RV32-NEXT: lbu a1, 192(a1) +; RV32-NEXT: sb a1, 192(a0) ; RV32-NEXT: ret ; ; RV64-LABEL: unaligned_memcpy196: @@ -581,26 +552,28 @@ ; RV64-NEXT: vle8.v v8, (a2) ; RV64-NEXT: addi a2, a0, 128 ; RV64-NEXT: vse8.v v8, (a2) -; RV64-NEXT: addi a1, a1, 192 -; RV64-NEXT: vsetivli zero, 4, e8, mf4, ta, ma -; RV64-NEXT: vle8.v v8, (a1) -; 
RV64-NEXT: addi a0, a0, 192 -; RV64-NEXT: vse8.v v8, (a0) +; RV64-NEXT: lbu a2, 195(a1) +; RV64-NEXT: sb a2, 195(a0) +; RV64-NEXT: lbu a2, 194(a1) +; RV64-NEXT: sb a2, 194(a0) +; RV64-NEXT: lbu a2, 193(a1) +; RV64-NEXT: sb a2, 193(a0) +; RV64-NEXT: lbu a1, 192(a1) +; RV64-NEXT: sb a1, 192(a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: unaligned_memcpy196: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a2, 32 -; RV32-FAST-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) ; RV32-FAST-NEXT: lw a2, 192(a1) ; RV32-FAST-NEXT: sw a2, 192(a0) +; RV32-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: addi a1, a1, 128 -; RV32-FAST-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: addi a0, a0, 128 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy196: @@ -648,14 +621,13 @@ ; ; RV32-FAST-LABEL: unaligned_memcpy256: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: li a2, 32 -; RV32-FAST-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: addi a1, a1, 128 -; RV32-FAST-NEXT: vle32.v v8, (a1) +; RV32-FAST-NEXT: vle64.v v8, (a1) ; RV32-FAST-NEXT: addi a0, a0, 128 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: unaligned_memcpy256: @@ -843,9 +815,9 @@ define void @aligned_memcpy16(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy16: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy16: @@ -862,62 +834,48 @@ define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind { ; RV32-LABEL: aligned_memcpy31: ; RV32: # %bb.0: # %entry -; RV32-NEXT: lbu a2, 30(a1) -; RV32-NEXT: sb a2, 30(a0) -; RV32-NEXT: lh a2, 28(a1) -; RV32-NEXT: sh a2, 28(a0) -; RV32-NEXT: lw a2, 24(a1) -; RV32-NEXT: sw a2, 24(a0) -; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: vse32.v v8, (a0) -; RV32-NEXT: addi a1, a1, 16 -; RV32-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-NEXT: vle32.v v8, (a1) -; RV32-NEXT: addi a0, a0, 16 -; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-NEXT: vle64.v v8, (a1) +; RV32-NEXT: vse64.v v8, (a0) +; RV32-NEXT: addi a1, a1, 15 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vle8.v v8, (a1) +; RV32-NEXT: addi a0, a0, 15 +; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: aligned_memcpy31: ; RV64: # %bb.0: # %entry -; RV64-NEXT: lbu a2, 30(a1) -; RV64-NEXT: sb a2, 30(a0) -; RV64-NEXT: lh a2, 28(a1) -; RV64-NEXT: sh a2, 28(a0) -; RV64-NEXT: lw a2, 24(a1) -; RV64-NEXT: sw a2, 24(a0) -; RV64-NEXT: ld a2, 16(a1) -; RV64-NEXT: sd a2, 16(a0) ; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-NEXT: vle64.v v8, (a1) ; RV64-NEXT: vse64.v v8, (a0) +; RV64-NEXT: addi a1, a1, 
15 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vle8.v v8, (a1) +; RV64-NEXT: addi a0, a0, 15 +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: aligned_memcpy31: ; RV32-FAST: # %bb.0: # %entry -; RV32-FAST-NEXT: lw a2, 27(a1) -; RV32-FAST-NEXT: sw a2, 27(a0) -; RV32-FAST-NEXT: lw a2, 24(a1) -; RV32-FAST-NEXT: sw a2, 24(a0) -; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: vse32.v v8, (a0) -; RV32-FAST-NEXT: addi a1, a1, 16 -; RV32-FAST-NEXT: vsetivli zero, 2, e32, mf2, ta, ma -; RV32-FAST-NEXT: vle32.v v8, (a1) -; RV32-FAST-NEXT: addi a0, a0, 16 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: addi a1, a1, 15 +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: addi a0, a0, 15 +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: aligned_memcpy31: ; RV64-FAST: # %bb.0: # %entry -; RV64-FAST-NEXT: ld a2, 23(a1) -; RV64-FAST-NEXT: sd a2, 23(a0) -; RV64-FAST-NEXT: ld a2, 16(a1) -; RV64-FAST-NEXT: sd a2, 16(a0) ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vle64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) +; RV64-FAST-NEXT: addi a1, a1, 15 +; RV64-FAST-NEXT: vle64.v v8, (a1) +; RV64-FAST-NEXT: addi a0, a0, 15 +; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret entry: tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false) @@ -927,9 +885,9 @@ define void @aligned_memcpy32(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy32: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy32: @@ -946,9 +904,9 @@ define void @aligned_memcpy64(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy64: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy64: @@ -965,14 +923,14 @@ define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy96: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: addi a1, a1, 64 -; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) ; RV32-BOTH-NEXT: addi a0, a0, 64 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy96: @@ -994,10 +952,9 @@ define void @aligned_memcpy128(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy128: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: li a2, 32 -; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; 
RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy128: @@ -1014,17 +971,16 @@ define void @aligned_memcpy196(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy196: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: li a2, 32 -; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) ; RV32-BOTH-NEXT: lw a2, 192(a1) ; RV32-BOTH-NEXT: sw a2, 192(a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: addi a1, a1, 128 -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) ; RV32-BOTH-NEXT: addi a0, a0, 128 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy196: @@ -1048,14 +1004,13 @@ define void @aligned_memcpy256(ptr nocapture %dest, ptr %src) nounwind { ; RV32-BOTH-LABEL: aligned_memcpy256: ; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: li a2, 32 -; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV32-BOTH-NEXT: vle64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: addi a1, a1, 128 -; RV32-BOTH-NEXT: vle32.v v8, (a1) +; RV32-BOTH-NEXT: vle64.v v8, (a1) ; RV32-BOTH-NEXT: addi a0, a0, 128 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memcpy256: @@ -1078,12 +1033,12 @@ define void @memcpy16_align4(ptr nocapture %dest, ptr nocapture %src) nounwind { -; RV32-BOTH-LABEL: memcpy16_align4: -; RV32-BOTH: # %bb.0: # %entry -; RV32-BOTH-NEXT: vsetivli zero, 4, e32, m1, ta, ma -; RV32-BOTH-NEXT: vle32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) -; RV32-BOTH-NEXT: ret +; RV32-LABEL: memcpy16_align4: +; RV32: # %bb.0: # %entry +; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-NEXT: vle32.v v8, (a1) +; RV32-NEXT: vse32.v v8, (a0) +; RV32-NEXT: ret ; ; RV64-LABEL: memcpy16_align4: ; RV64: # %bb.0: # %entry @@ -1092,6 +1047,13 @@ ; RV64-NEXT: vse32.v v8, (a0) ; RV64-NEXT: ret ; +; RV32-FAST-LABEL: memcpy16_align4: +; RV32-FAST: # %bb.0: # %entry +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV32-FAST-NEXT: vle64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) +; RV32-FAST-NEXT: ret +; ; RV64-FAST-LABEL: memcpy16_align4: ; RV64-FAST: # %bb.0: # %entry ; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma diff --git a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll --- a/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll +++ b/llvm/test/CodeGen/RISCV/rvv/memset-inline.ll @@ -149,357 +149,71 @@ } define void @memset_16(ptr %a, i8 %value) nounwind { -; RV32-LABEL: memset_16: -; RV32: # %bb.0: -; RV32-NEXT: sb a1, 15(a0) -; RV32-NEXT: sb a1, 14(a0) -; RV32-NEXT: sb a1, 13(a0) -; RV32-NEXT: sb a1, 12(a0) -; RV32-NEXT: sb a1, 11(a0) -; RV32-NEXT: sb a1, 10(a0) -; RV32-NEXT: sb a1, 9(a0) -; RV32-NEXT: sb a1, 8(a0) -; RV32-NEXT: sb a1, 7(a0) -; RV32-NEXT: sb a1, 6(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb a1, 4(a0) -; RV32-NEXT: sb a1, 3(a0) -; RV32-NEXT: sb 
a1, 2(a0) -; RV32-NEXT: sb a1, 1(a0) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: memset_16: -; RV64: # %bb.0: -; RV64-NEXT: sb a1, 15(a0) -; RV64-NEXT: sb a1, 14(a0) -; RV64-NEXT: sb a1, 13(a0) -; RV64-NEXT: sb a1, 12(a0) -; RV64-NEXT: sb a1, 11(a0) -; RV64-NEXT: sb a1, 10(a0) -; RV64-NEXT: sb a1, 9(a0) -; RV64-NEXT: sb a1, 8(a0) -; RV64-NEXT: sb a1, 7(a0) -; RV64-NEXT: sb a1, 6(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb a1, 4(a0) -; RV64-NEXT: sb a1, 3(a0) -; RV64-NEXT: sb a1, 2(a0) -; RV64-NEXT: sb a1, 1(a0) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: memset_16: -; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: andi a1, a1, 255 -; RV32-FAST-NEXT: lui a2, 4112 -; RV32-FAST-NEXT: addi a2, a2, 257 -; RV32-FAST-NEXT: mul a1, a1, a2 -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret +; RV32-BOTH-LABEL: memset_16: +; RV32-BOTH: # %bb.0: +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: vse8.v v8, (a0) +; RV32-BOTH-NEXT: ret ; -; RV64-FAST-LABEL: memset_16: -; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: andi a1, a1, 255 -; RV64-FAST-NEXT: lui a2, 4112 -; RV64-FAST-NEXT: addiw a2, a2, 257 -; RV64-FAST-NEXT: slli a3, a2, 32 -; RV64-FAST-NEXT: add a2, a2, a3 -; RV64-FAST-NEXT: mul a1, a1, a2 -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: ret +; RV64-BOTH-LABEL: memset_16: +; RV64-BOTH: # %bb.0: +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: vse8.v v8, (a0) +; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 16, i1 0) ret void } define void @memset_32(ptr %a, i8 %value) nounwind { -; RV32-LABEL: memset_32: -; RV32: # %bb.0: -; RV32-NEXT: sb a1, 31(a0) -; RV32-NEXT: sb a1, 30(a0) -; RV32-NEXT: sb a1, 29(a0) -; RV32-NEXT: sb a1, 28(a0) -; RV32-NEXT: sb a1, 27(a0) -; RV32-NEXT: sb a1, 26(a0) -; RV32-NEXT: sb a1, 25(a0) -; RV32-NEXT: sb a1, 24(a0) -; RV32-NEXT: sb a1, 23(a0) -; RV32-NEXT: sb a1, 22(a0) -; RV32-NEXT: sb a1, 21(a0) -; RV32-NEXT: sb a1, 20(a0) -; RV32-NEXT: sb a1, 19(a0) -; RV32-NEXT: sb a1, 18(a0) -; RV32-NEXT: sb a1, 17(a0) -; RV32-NEXT: sb a1, 16(a0) -; RV32-NEXT: sb a1, 15(a0) -; RV32-NEXT: sb a1, 14(a0) -; RV32-NEXT: sb a1, 13(a0) -; RV32-NEXT: sb a1, 12(a0) -; RV32-NEXT: sb a1, 11(a0) -; RV32-NEXT: sb a1, 10(a0) -; RV32-NEXT: sb a1, 9(a0) -; RV32-NEXT: sb a1, 8(a0) -; RV32-NEXT: sb a1, 7(a0) -; RV32-NEXT: sb a1, 6(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb a1, 4(a0) -; RV32-NEXT: sb a1, 3(a0) -; RV32-NEXT: sb a1, 2(a0) -; RV32-NEXT: sb a1, 1(a0) -; RV32-NEXT: sb a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: memset_32: -; RV64: # %bb.0: -; RV64-NEXT: sb a1, 31(a0) -; RV64-NEXT: sb a1, 30(a0) -; RV64-NEXT: sb a1, 29(a0) -; RV64-NEXT: sb a1, 28(a0) -; RV64-NEXT: sb a1, 27(a0) -; RV64-NEXT: sb a1, 26(a0) -; RV64-NEXT: sb a1, 25(a0) -; RV64-NEXT: sb a1, 24(a0) -; RV64-NEXT: sb a1, 23(a0) -; RV64-NEXT: sb a1, 22(a0) -; RV64-NEXT: sb a1, 21(a0) -; RV64-NEXT: sb a1, 20(a0) -; RV64-NEXT: sb a1, 19(a0) -; RV64-NEXT: sb a1, 18(a0) -; RV64-NEXT: sb a1, 17(a0) -; RV64-NEXT: sb a1, 16(a0) -; RV64-NEXT: sb a1, 15(a0) -; RV64-NEXT: sb a1, 14(a0) -; RV64-NEXT: sb a1, 13(a0) -; RV64-NEXT: sb a1, 12(a0) -; RV64-NEXT: sb a1, 11(a0) -; RV64-NEXT: sb a1, 10(a0) -; RV64-NEXT: sb a1, 9(a0) -; RV64-NEXT: sb a1, 8(a0) -; RV64-NEXT: sb a1, 7(a0) -; RV64-NEXT: sb a1, 6(a0) -; RV64-NEXT: 
sb a1, 5(a0) -; RV64-NEXT: sb a1, 4(a0) -; RV64-NEXT: sb a1, 3(a0) -; RV64-NEXT: sb a1, 2(a0) -; RV64-NEXT: sb a1, 1(a0) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: memset_32: -; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: andi a1, a1, 255 -; RV32-FAST-NEXT: lui a2, 4112 -; RV32-FAST-NEXT: addi a2, a2, 257 -; RV32-FAST-NEXT: mul a1, a1, a2 -; RV32-FAST-NEXT: sw a1, 28(a0) -; RV32-FAST-NEXT: sw a1, 24(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: sw a1, 16(a0) -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret +; RV32-BOTH-LABEL: memset_32: +; RV32-BOTH: # %bb.0: +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: vse8.v v8, (a0) +; RV32-BOTH-NEXT: ret ; -; RV64-FAST-LABEL: memset_32: -; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: andi a1, a1, 255 -; RV64-FAST-NEXT: lui a2, 4112 -; RV64-FAST-NEXT: addiw a2, a2, 257 -; RV64-FAST-NEXT: slli a3, a2, 32 -; RV64-FAST-NEXT: add a2, a2, a3 -; RV64-FAST-NEXT: mul a1, a1, a2 -; RV64-FAST-NEXT: sd a1, 24(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sd a1, 0(a0) -; RV64-FAST-NEXT: ret +; RV64-BOTH-LABEL: memset_32: +; RV64-BOTH: # %bb.0: +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: vse8.v v8, (a0) +; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 32, i1 0) ret void } define void @memset_64(ptr %a, i8 %value) nounwind { -; RV32-LABEL: memset_64: -; RV32: # %bb.0: -; RV32-NEXT: sb a1, 63(a0) -; RV32-NEXT: sb a1, 62(a0) -; RV32-NEXT: sb a1, 61(a0) -; RV32-NEXT: sb a1, 60(a0) -; RV32-NEXT: sb a1, 59(a0) -; RV32-NEXT: sb a1, 58(a0) -; RV32-NEXT: sb a1, 57(a0) -; RV32-NEXT: sb a1, 56(a0) -; RV32-NEXT: sb a1, 55(a0) -; RV32-NEXT: sb a1, 54(a0) -; RV32-NEXT: sb a1, 53(a0) -; RV32-NEXT: sb a1, 52(a0) -; RV32-NEXT: sb a1, 51(a0) -; RV32-NEXT: sb a1, 50(a0) -; RV32-NEXT: sb a1, 49(a0) -; RV32-NEXT: sb a1, 48(a0) -; RV32-NEXT: sb a1, 47(a0) -; RV32-NEXT: sb a1, 46(a0) -; RV32-NEXT: sb a1, 45(a0) -; RV32-NEXT: sb a1, 44(a0) -; RV32-NEXT: sb a1, 43(a0) -; RV32-NEXT: sb a1, 42(a0) -; RV32-NEXT: sb a1, 41(a0) -; RV32-NEXT: sb a1, 40(a0) -; RV32-NEXT: sb a1, 39(a0) -; RV32-NEXT: sb a1, 38(a0) -; RV32-NEXT: sb a1, 37(a0) -; RV32-NEXT: sb a1, 36(a0) -; RV32-NEXT: sb a1, 35(a0) -; RV32-NEXT: sb a1, 34(a0) -; RV32-NEXT: sb a1, 33(a0) -; RV32-NEXT: sb a1, 32(a0) -; RV32-NEXT: sb a1, 31(a0) -; RV32-NEXT: sb a1, 30(a0) -; RV32-NEXT: sb a1, 29(a0) -; RV32-NEXT: sb a1, 28(a0) -; RV32-NEXT: sb a1, 27(a0) -; RV32-NEXT: sb a1, 26(a0) -; RV32-NEXT: sb a1, 25(a0) -; RV32-NEXT: sb a1, 24(a0) -; RV32-NEXT: sb a1, 23(a0) -; RV32-NEXT: sb a1, 22(a0) -; RV32-NEXT: sb a1, 21(a0) -; RV32-NEXT: sb a1, 20(a0) -; RV32-NEXT: sb a1, 19(a0) -; RV32-NEXT: sb a1, 18(a0) -; RV32-NEXT: sb a1, 17(a0) -; RV32-NEXT: sb a1, 16(a0) -; RV32-NEXT: sb a1, 15(a0) -; RV32-NEXT: sb a1, 14(a0) -; RV32-NEXT: sb a1, 13(a0) -; RV32-NEXT: sb a1, 12(a0) -; RV32-NEXT: sb a1, 11(a0) -; RV32-NEXT: sb a1, 10(a0) -; RV32-NEXT: sb a1, 9(a0) -; RV32-NEXT: sb a1, 8(a0) -; RV32-NEXT: sb a1, 7(a0) -; RV32-NEXT: sb a1, 6(a0) -; RV32-NEXT: sb a1, 5(a0) -; RV32-NEXT: sb a1, 4(a0) -; RV32-NEXT: sb a1, 3(a0) -; RV32-NEXT: sb a1, 2(a0) -; RV32-NEXT: sb a1, 1(a0) -; RV32-NEXT: sb 
a1, 0(a0) -; RV32-NEXT: ret -; -; RV64-LABEL: memset_64: -; RV64: # %bb.0: -; RV64-NEXT: sb a1, 63(a0) -; RV64-NEXT: sb a1, 62(a0) -; RV64-NEXT: sb a1, 61(a0) -; RV64-NEXT: sb a1, 60(a0) -; RV64-NEXT: sb a1, 59(a0) -; RV64-NEXT: sb a1, 58(a0) -; RV64-NEXT: sb a1, 57(a0) -; RV64-NEXT: sb a1, 56(a0) -; RV64-NEXT: sb a1, 55(a0) -; RV64-NEXT: sb a1, 54(a0) -; RV64-NEXT: sb a1, 53(a0) -; RV64-NEXT: sb a1, 52(a0) -; RV64-NEXT: sb a1, 51(a0) -; RV64-NEXT: sb a1, 50(a0) -; RV64-NEXT: sb a1, 49(a0) -; RV64-NEXT: sb a1, 48(a0) -; RV64-NEXT: sb a1, 47(a0) -; RV64-NEXT: sb a1, 46(a0) -; RV64-NEXT: sb a1, 45(a0) -; RV64-NEXT: sb a1, 44(a0) -; RV64-NEXT: sb a1, 43(a0) -; RV64-NEXT: sb a1, 42(a0) -; RV64-NEXT: sb a1, 41(a0) -; RV64-NEXT: sb a1, 40(a0) -; RV64-NEXT: sb a1, 39(a0) -; RV64-NEXT: sb a1, 38(a0) -; RV64-NEXT: sb a1, 37(a0) -; RV64-NEXT: sb a1, 36(a0) -; RV64-NEXT: sb a1, 35(a0) -; RV64-NEXT: sb a1, 34(a0) -; RV64-NEXT: sb a1, 33(a0) -; RV64-NEXT: sb a1, 32(a0) -; RV64-NEXT: sb a1, 31(a0) -; RV64-NEXT: sb a1, 30(a0) -; RV64-NEXT: sb a1, 29(a0) -; RV64-NEXT: sb a1, 28(a0) -; RV64-NEXT: sb a1, 27(a0) -; RV64-NEXT: sb a1, 26(a0) -; RV64-NEXT: sb a1, 25(a0) -; RV64-NEXT: sb a1, 24(a0) -; RV64-NEXT: sb a1, 23(a0) -; RV64-NEXT: sb a1, 22(a0) -; RV64-NEXT: sb a1, 21(a0) -; RV64-NEXT: sb a1, 20(a0) -; RV64-NEXT: sb a1, 19(a0) -; RV64-NEXT: sb a1, 18(a0) -; RV64-NEXT: sb a1, 17(a0) -; RV64-NEXT: sb a1, 16(a0) -; RV64-NEXT: sb a1, 15(a0) -; RV64-NEXT: sb a1, 14(a0) -; RV64-NEXT: sb a1, 13(a0) -; RV64-NEXT: sb a1, 12(a0) -; RV64-NEXT: sb a1, 11(a0) -; RV64-NEXT: sb a1, 10(a0) -; RV64-NEXT: sb a1, 9(a0) -; RV64-NEXT: sb a1, 8(a0) -; RV64-NEXT: sb a1, 7(a0) -; RV64-NEXT: sb a1, 6(a0) -; RV64-NEXT: sb a1, 5(a0) -; RV64-NEXT: sb a1, 4(a0) -; RV64-NEXT: sb a1, 3(a0) -; RV64-NEXT: sb a1, 2(a0) -; RV64-NEXT: sb a1, 1(a0) -; RV64-NEXT: sb a1, 0(a0) -; RV64-NEXT: ret -; -; RV32-FAST-LABEL: memset_64: -; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: andi a1, a1, 255 -; RV32-FAST-NEXT: lui a2, 4112 -; RV32-FAST-NEXT: addi a2, a2, 257 -; RV32-FAST-NEXT: mul a1, a1, a2 -; RV32-FAST-NEXT: sw a1, 60(a0) -; RV32-FAST-NEXT: sw a1, 56(a0) -; RV32-FAST-NEXT: sw a1, 52(a0) -; RV32-FAST-NEXT: sw a1, 48(a0) -; RV32-FAST-NEXT: sw a1, 44(a0) -; RV32-FAST-NEXT: sw a1, 40(a0) -; RV32-FAST-NEXT: sw a1, 36(a0) -; RV32-FAST-NEXT: sw a1, 32(a0) -; RV32-FAST-NEXT: sw a1, 28(a0) -; RV32-FAST-NEXT: sw a1, 24(a0) -; RV32-FAST-NEXT: sw a1, 20(a0) -; RV32-FAST-NEXT: sw a1, 16(a0) -; RV32-FAST-NEXT: sw a1, 12(a0) -; RV32-FAST-NEXT: sw a1, 8(a0) -; RV32-FAST-NEXT: sw a1, 4(a0) -; RV32-FAST-NEXT: sw a1, 0(a0) -; RV32-FAST-NEXT: ret +; RV32-BOTH-LABEL: memset_64: +; RV32-BOTH: # %bb.0: +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: vse8.v v8, (a0) +; RV32-BOTH-NEXT: ret ; -; RV64-FAST-LABEL: memset_64: -; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: andi a1, a1, 255 -; RV64-FAST-NEXT: lui a2, 4112 -; RV64-FAST-NEXT: addiw a2, a2, 257 -; RV64-FAST-NEXT: slli a3, a2, 32 -; RV64-FAST-NEXT: add a2, a2, a3 -; RV64-FAST-NEXT: mul a1, a1, a2 -; RV64-FAST-NEXT: sd a1, 56(a0) -; RV64-FAST-NEXT: sd a1, 48(a0) -; RV64-FAST-NEXT: sd a1, 40(a0) -; RV64-FAST-NEXT: sd a1, 32(a0) -; RV64-FAST-NEXT: sd a1, 24(a0) -; RV64-FAST-NEXT: sd a1, 16(a0) -; RV64-FAST-NEXT: sd a1, 8(a0) -; RV64-FAST-NEXT: sd a1, 0(a0) 
-; RV64-FAST-NEXT: ret +; RV64-BOTH-LABEL: memset_64: +; RV64-BOTH: # %bb.0: +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: vse8.v v8, (a0) +; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 %value, i64 64, i1 0) ret void } @@ -577,26 +291,16 @@ define void @aligned_memset_16(ptr align 16 %a, i8 %value) nounwind { ; RV32-BOTH-LABEL: aligned_memset_16: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: andi a1, a1, 255 -; RV32-BOTH-NEXT: lui a2, 4112 -; RV32-BOTH-NEXT: addi a2, a2, 257 -; RV32-BOTH-NEXT: mul a1, a1, a2 -; RV32-BOTH-NEXT: sw a1, 12(a0) -; RV32-BOTH-NEXT: sw a1, 8(a0) -; RV32-BOTH-NEXT: sw a1, 4(a0) -; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: vse8.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memset_16: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: andi a1, a1, 255 -; RV64-BOTH-NEXT: lui a2, 4112 -; RV64-BOTH-NEXT: addiw a2, a2, 257 -; RV64-BOTH-NEXT: slli a3, a2, 32 -; RV64-BOTH-NEXT: add a2, a2, a3 -; RV64-BOTH-NEXT: mul a1, a1, a2 -; RV64-BOTH-NEXT: sd a1, 8(a0) -; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: vse8.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 16 %a, i8 %value, i64 16, i1 0) ret void @@ -605,32 +309,20 @@ define void @aligned_memset_32(ptr align 32 %a, i8 %value) nounwind { ; RV32-BOTH-LABEL: aligned_memset_32: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: andi a1, a1, 255 -; RV32-BOTH-NEXT: lui a2, 4112 -; RV32-BOTH-NEXT: addi a2, a2, 257 -; RV32-BOTH-NEXT: mul a1, a1, a2 -; RV32-BOTH-NEXT: sw a1, 28(a0) -; RV32-BOTH-NEXT: sw a1, 24(a0) -; RV32-BOTH-NEXT: sw a1, 20(a0) -; RV32-BOTH-NEXT: sw a1, 16(a0) -; RV32-BOTH-NEXT: sw a1, 12(a0) -; RV32-BOTH-NEXT: sw a1, 8(a0) -; RV32-BOTH-NEXT: sw a1, 4(a0) -; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: vse8.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memset_32: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: andi a1, a1, 255 -; RV64-BOTH-NEXT: lui a2, 4112 -; RV64-BOTH-NEXT: addiw a2, a2, 257 -; RV64-BOTH-NEXT: slli a3, a2, 32 -; RV64-BOTH-NEXT: add a2, a2, a3 -; RV64-BOTH-NEXT: mul a1, a1, a2 -; RV64-BOTH-NEXT: sd a1, 24(a0) -; RV64-BOTH-NEXT: sd a1, 16(a0) -; RV64-BOTH-NEXT: sd a1, 8(a0) -; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: vse8.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 32 %a, i8 %value, i64 32, i1 0) ret void @@ -639,44 +331,28 @@ define void @aligned_memset_64(ptr align 64 %a, i8 %value) nounwind { ; RV32-BOTH-LABEL: aligned_memset_64: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: andi a1, a1, 255 -; RV32-BOTH-NEXT: lui a2, 4112 -; RV32-BOTH-NEXT: addi a2, a2, 257 -; RV32-BOTH-NEXT: mul a1, a1, a2 -; RV32-BOTH-NEXT: sw a1, 60(a0) -; RV32-BOTH-NEXT: sw a1, 56(a0) -; RV32-BOTH-NEXT: sw a1, 52(a0) -; RV32-BOTH-NEXT: sw a1, 48(a0) -; 
RV32-BOTH-NEXT: sw a1, 44(a0) -; RV32-BOTH-NEXT: sw a1, 40(a0) -; RV32-BOTH-NEXT: sw a1, 36(a0) -; RV32-BOTH-NEXT: sw a1, 32(a0) -; RV32-BOTH-NEXT: sw a1, 28(a0) -; RV32-BOTH-NEXT: sw a1, 24(a0) -; RV32-BOTH-NEXT: sw a1, 20(a0) -; RV32-BOTH-NEXT: sw a1, 16(a0) -; RV32-BOTH-NEXT: sw a1, 12(a0) -; RV32-BOTH-NEXT: sw a1, 8(a0) -; RV32-BOTH-NEXT: sw a1, 4(a0) -; RV32-BOTH-NEXT: sw a1, 0(a0) +; RV32-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-BOTH-NEXT: vmv.v.x v8, a1 +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse8.v v8, (a1) +; RV32-BOTH-NEXT: vse8.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_memset_64: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: andi a1, a1, 255 -; RV64-BOTH-NEXT: lui a2, 4112 -; RV64-BOTH-NEXT: addiw a2, a2, 257 -; RV64-BOTH-NEXT: slli a3, a2, 32 -; RV64-BOTH-NEXT: add a2, a2, a3 -; RV64-BOTH-NEXT: mul a1, a1, a2 -; RV64-BOTH-NEXT: sd a1, 56(a0) -; RV64-BOTH-NEXT: sd a1, 48(a0) -; RV64-BOTH-NEXT: sd a1, 40(a0) -; RV64-BOTH-NEXT: sd a1, 32(a0) -; RV64-BOTH-NEXT: sd a1, 24(a0) -; RV64-BOTH-NEXT: sd a1, 16(a0) -; RV64-BOTH-NEXT: sd a1, 8(a0) -; RV64-BOTH-NEXT: sd a1, 0(a0) +; RV64-BOTH-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.x v8, a1 +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse8.v v8, (a1) +; RV64-BOTH-NEXT: vse8.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 %value, i64 64, i1 0) ret void @@ -796,55 +472,30 @@ define void @bzero_16(ptr %a) nounwind { ; RV32-LABEL: bzero_16: ; RV32: # %bb.0: -; RV32-NEXT: sb zero, 15(a0) -; RV32-NEXT: sb zero, 14(a0) -; RV32-NEXT: sb zero, 13(a0) -; RV32-NEXT: sb zero, 12(a0) -; RV32-NEXT: sb zero, 11(a0) -; RV32-NEXT: sb zero, 10(a0) -; RV32-NEXT: sb zero, 9(a0) -; RV32-NEXT: sb zero, 8(a0) -; RV32-NEXT: sb zero, 7(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb zero, 5(a0) -; RV32-NEXT: sb zero, 4(a0) -; RV32-NEXT: sb zero, 3(a0) -; RV32-NEXT: sb zero, 2(a0) -; RV32-NEXT: sb zero, 1(a0) -; RV32-NEXT: sb zero, 0(a0) +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: bzero_16: ; RV64: # %bb.0: -; RV64-NEXT: sb zero, 15(a0) -; RV64-NEXT: sb zero, 14(a0) -; RV64-NEXT: sb zero, 13(a0) -; RV64-NEXT: sb zero, 12(a0) -; RV64-NEXT: sb zero, 11(a0) -; RV64-NEXT: sb zero, 10(a0) -; RV64-NEXT: sb zero, 9(a0) -; RV64-NEXT: sb zero, 8(a0) -; RV64-NEXT: sb zero, 7(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb zero, 5(a0) -; RV64-NEXT: sb zero, 4(a0) -; RV64-NEXT: sb zero, 3(a0) -; RV64-NEXT: sb zero, 2(a0) -; RV64-NEXT: sb zero, 1(a0) -; RV64-NEXT: sb zero, 0(a0) +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: bzero_16: ; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-FAST-NEXT: vmv.v.i v8, 0 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: bzero_16: ; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: sd zero, 8(a0) -; RV64-FAST-NEXT: sd zero, 0(a0) +; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-FAST-NEXT: vmv.v.i v8, 0 +; 
RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 0, i64 16, i1 0) ret void @@ -853,87 +504,37 @@ define void @bzero_32(ptr %a) nounwind { ; RV32-LABEL: bzero_32: ; RV32: # %bb.0: -; RV32-NEXT: sb zero, 31(a0) -; RV32-NEXT: sb zero, 30(a0) -; RV32-NEXT: sb zero, 29(a0) -; RV32-NEXT: sb zero, 28(a0) -; RV32-NEXT: sb zero, 27(a0) -; RV32-NEXT: sb zero, 26(a0) -; RV32-NEXT: sb zero, 25(a0) -; RV32-NEXT: sb zero, 24(a0) -; RV32-NEXT: sb zero, 23(a0) -; RV32-NEXT: sb zero, 22(a0) -; RV32-NEXT: sb zero, 21(a0) -; RV32-NEXT: sb zero, 20(a0) -; RV32-NEXT: sb zero, 19(a0) -; RV32-NEXT: sb zero, 18(a0) -; RV32-NEXT: sb zero, 17(a0) -; RV32-NEXT: sb zero, 16(a0) -; RV32-NEXT: sb zero, 15(a0) -; RV32-NEXT: sb zero, 14(a0) -; RV32-NEXT: sb zero, 13(a0) -; RV32-NEXT: sb zero, 12(a0) -; RV32-NEXT: sb zero, 11(a0) -; RV32-NEXT: sb zero, 10(a0) -; RV32-NEXT: sb zero, 9(a0) -; RV32-NEXT: sb zero, 8(a0) -; RV32-NEXT: sb zero, 7(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb zero, 5(a0) -; RV32-NEXT: sb zero, 4(a0) -; RV32-NEXT: sb zero, 3(a0) -; RV32-NEXT: sb zero, 2(a0) -; RV32-NEXT: sb zero, 1(a0) -; RV32-NEXT: sb zero, 0(a0) +; RV32-NEXT: addi a1, a0, 16 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: bzero_32: ; RV64: # %bb.0: -; RV64-NEXT: sb zero, 31(a0) -; RV64-NEXT: sb zero, 30(a0) -; RV64-NEXT: sb zero, 29(a0) -; RV64-NEXT: sb zero, 28(a0) -; RV64-NEXT: sb zero, 27(a0) -; RV64-NEXT: sb zero, 26(a0) -; RV64-NEXT: sb zero, 25(a0) -; RV64-NEXT: sb zero, 24(a0) -; RV64-NEXT: sb zero, 23(a0) -; RV64-NEXT: sb zero, 22(a0) -; RV64-NEXT: sb zero, 21(a0) -; RV64-NEXT: sb zero, 20(a0) -; RV64-NEXT: sb zero, 19(a0) -; RV64-NEXT: sb zero, 18(a0) -; RV64-NEXT: sb zero, 17(a0) -; RV64-NEXT: sb zero, 16(a0) -; RV64-NEXT: sb zero, 15(a0) -; RV64-NEXT: sb zero, 14(a0) -; RV64-NEXT: sb zero, 13(a0) -; RV64-NEXT: sb zero, 12(a0) -; RV64-NEXT: sb zero, 11(a0) -; RV64-NEXT: sb zero, 10(a0) -; RV64-NEXT: sb zero, 9(a0) -; RV64-NEXT: sb zero, 8(a0) -; RV64-NEXT: sb zero, 7(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb zero, 5(a0) -; RV64-NEXT: sb zero, 4(a0) -; RV64-NEXT: sb zero, 3(a0) -; RV64-NEXT: sb zero, 2(a0) -; RV64-NEXT: sb zero, 1(a0) -; RV64-NEXT: sb zero, 0(a0) +; RV64-NEXT: addi a1, a0, 16 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: bzero_32: ; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-FAST-NEXT: addi a1, a0, 16 +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-FAST-NEXT: vmv.v.i v8, 0 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: bzero_32: ; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-FAST-NEXT: addi a1, a0, 16 +; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vmv.v.i v8, 0 +; RV64-FAST-NEXT: vse64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 0, i64 32, i1 0) @@ -943,151 +544,53 @@ define void @bzero_64(ptr %a) nounwind { ; RV32-LABEL: bzero_64: ; RV32: # %bb.0: -; RV32-NEXT: sb zero, 63(a0) -; RV32-NEXT: sb zero, 62(a0) -; RV32-NEXT: sb zero, 61(a0) -; RV32-NEXT: sb zero, 60(a0) -; RV32-NEXT: sb zero, 59(a0) -; 
RV32-NEXT: sb zero, 58(a0) -; RV32-NEXT: sb zero, 57(a0) -; RV32-NEXT: sb zero, 56(a0) -; RV32-NEXT: sb zero, 55(a0) -; RV32-NEXT: sb zero, 54(a0) -; RV32-NEXT: sb zero, 53(a0) -; RV32-NEXT: sb zero, 52(a0) -; RV32-NEXT: sb zero, 51(a0) -; RV32-NEXT: sb zero, 50(a0) -; RV32-NEXT: sb zero, 49(a0) -; RV32-NEXT: sb zero, 48(a0) -; RV32-NEXT: sb zero, 47(a0) -; RV32-NEXT: sb zero, 46(a0) -; RV32-NEXT: sb zero, 45(a0) -; RV32-NEXT: sb zero, 44(a0) -; RV32-NEXT: sb zero, 43(a0) -; RV32-NEXT: sb zero, 42(a0) -; RV32-NEXT: sb zero, 41(a0) -; RV32-NEXT: sb zero, 40(a0) -; RV32-NEXT: sb zero, 39(a0) -; RV32-NEXT: sb zero, 38(a0) -; RV32-NEXT: sb zero, 37(a0) -; RV32-NEXT: sb zero, 36(a0) -; RV32-NEXT: sb zero, 35(a0) -; RV32-NEXT: sb zero, 34(a0) -; RV32-NEXT: sb zero, 33(a0) -; RV32-NEXT: sb zero, 32(a0) -; RV32-NEXT: sb zero, 31(a0) -; RV32-NEXT: sb zero, 30(a0) -; RV32-NEXT: sb zero, 29(a0) -; RV32-NEXT: sb zero, 28(a0) -; RV32-NEXT: sb zero, 27(a0) -; RV32-NEXT: sb zero, 26(a0) -; RV32-NEXT: sb zero, 25(a0) -; RV32-NEXT: sb zero, 24(a0) -; RV32-NEXT: sb zero, 23(a0) -; RV32-NEXT: sb zero, 22(a0) -; RV32-NEXT: sb zero, 21(a0) -; RV32-NEXT: sb zero, 20(a0) -; RV32-NEXT: sb zero, 19(a0) -; RV32-NEXT: sb zero, 18(a0) -; RV32-NEXT: sb zero, 17(a0) -; RV32-NEXT: sb zero, 16(a0) -; RV32-NEXT: sb zero, 15(a0) -; RV32-NEXT: sb zero, 14(a0) -; RV32-NEXT: sb zero, 13(a0) -; RV32-NEXT: sb zero, 12(a0) -; RV32-NEXT: sb zero, 11(a0) -; RV32-NEXT: sb zero, 10(a0) -; RV32-NEXT: sb zero, 9(a0) -; RV32-NEXT: sb zero, 8(a0) -; RV32-NEXT: sb zero, 7(a0) -; RV32-NEXT: sb zero, 6(a0) -; RV32-NEXT: sb zero, 5(a0) -; RV32-NEXT: sb zero, 4(a0) -; RV32-NEXT: sb zero, 3(a0) -; RV32-NEXT: sb zero, 2(a0) -; RV32-NEXT: sb zero, 1(a0) -; RV32-NEXT: sb zero, 0(a0) +; RV32-NEXT: addi a1, a0, 48 +; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV32-NEXT: vmv.v.i v8, 0 +; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: addi a1, a0, 32 +; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: addi a1, a0, 16 +; RV32-NEXT: vse8.v v8, (a1) +; RV32-NEXT: vse8.v v8, (a0) ; RV32-NEXT: ret ; ; RV64-LABEL: bzero_64: ; RV64: # %bb.0: -; RV64-NEXT: sb zero, 63(a0) -; RV64-NEXT: sb zero, 62(a0) -; RV64-NEXT: sb zero, 61(a0) -; RV64-NEXT: sb zero, 60(a0) -; RV64-NEXT: sb zero, 59(a0) -; RV64-NEXT: sb zero, 58(a0) -; RV64-NEXT: sb zero, 57(a0) -; RV64-NEXT: sb zero, 56(a0) -; RV64-NEXT: sb zero, 55(a0) -; RV64-NEXT: sb zero, 54(a0) -; RV64-NEXT: sb zero, 53(a0) -; RV64-NEXT: sb zero, 52(a0) -; RV64-NEXT: sb zero, 51(a0) -; RV64-NEXT: sb zero, 50(a0) -; RV64-NEXT: sb zero, 49(a0) -; RV64-NEXT: sb zero, 48(a0) -; RV64-NEXT: sb zero, 47(a0) -; RV64-NEXT: sb zero, 46(a0) -; RV64-NEXT: sb zero, 45(a0) -; RV64-NEXT: sb zero, 44(a0) -; RV64-NEXT: sb zero, 43(a0) -; RV64-NEXT: sb zero, 42(a0) -; RV64-NEXT: sb zero, 41(a0) -; RV64-NEXT: sb zero, 40(a0) -; RV64-NEXT: sb zero, 39(a0) -; RV64-NEXT: sb zero, 38(a0) -; RV64-NEXT: sb zero, 37(a0) -; RV64-NEXT: sb zero, 36(a0) -; RV64-NEXT: sb zero, 35(a0) -; RV64-NEXT: sb zero, 34(a0) -; RV64-NEXT: sb zero, 33(a0) -; RV64-NEXT: sb zero, 32(a0) -; RV64-NEXT: sb zero, 31(a0) -; RV64-NEXT: sb zero, 30(a0) -; RV64-NEXT: sb zero, 29(a0) -; RV64-NEXT: sb zero, 28(a0) -; RV64-NEXT: sb zero, 27(a0) -; RV64-NEXT: sb zero, 26(a0) -; RV64-NEXT: sb zero, 25(a0) -; RV64-NEXT: sb zero, 24(a0) -; RV64-NEXT: sb zero, 23(a0) -; RV64-NEXT: sb zero, 22(a0) -; RV64-NEXT: sb zero, 21(a0) -; RV64-NEXT: sb zero, 20(a0) -; RV64-NEXT: sb zero, 19(a0) -; RV64-NEXT: sb zero, 18(a0) -; RV64-NEXT: sb zero, 17(a0) -; RV64-NEXT: sb zero, 16(a0) -; 
RV64-NEXT: sb zero, 15(a0) -; RV64-NEXT: sb zero, 14(a0) -; RV64-NEXT: sb zero, 13(a0) -; RV64-NEXT: sb zero, 12(a0) -; RV64-NEXT: sb zero, 11(a0) -; RV64-NEXT: sb zero, 10(a0) -; RV64-NEXT: sb zero, 9(a0) -; RV64-NEXT: sb zero, 8(a0) -; RV64-NEXT: sb zero, 7(a0) -; RV64-NEXT: sb zero, 6(a0) -; RV64-NEXT: sb zero, 5(a0) -; RV64-NEXT: sb zero, 4(a0) -; RV64-NEXT: sb zero, 3(a0) -; RV64-NEXT: sb zero, 2(a0) -; RV64-NEXT: sb zero, 1(a0) -; RV64-NEXT: sb zero, 0(a0) +; RV64-NEXT: addi a1, a0, 48 +; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma +; RV64-NEXT: vmv.v.i v8, 0 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: addi a1, a0, 32 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: addi a1, a0, 16 +; RV64-NEXT: vse8.v v8, (a1) +; RV64-NEXT: vse8.v v8, (a0) ; RV64-NEXT: ret ; ; RV32-FAST-LABEL: bzero_64: ; RV32-FAST: # %bb.0: -; RV32-FAST-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-FAST-NEXT: addi a1, a0, 48 +; RV32-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-FAST-NEXT: vmv.v.i v8, 0 -; RV32-FAST-NEXT: vse32.v v8, (a0) +; RV32-FAST-NEXT: vse64.v v8, (a1) +; RV32-FAST-NEXT: addi a1, a0, 32 +; RV32-FAST-NEXT: vse64.v v8, (a1) +; RV32-FAST-NEXT: addi a1, a0, 16 +; RV32-FAST-NEXT: vse64.v v8, (a1) +; RV32-FAST-NEXT: vse64.v v8, (a0) ; RV32-FAST-NEXT: ret ; ; RV64-FAST-LABEL: bzero_64: ; RV64-FAST: # %bb.0: -; RV64-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-FAST-NEXT: addi a1, a0, 48 +; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-FAST-NEXT: vmv.v.i v8, 0 +; RV64-FAST-NEXT: vse64.v v8, (a1) +; RV64-FAST-NEXT: addi a1, a0, 32 +; RV64-FAST-NEXT: vse64.v v8, (a1) +; RV64-FAST-NEXT: addi a1, a0, 16 +; RV64-FAST-NEXT: vse64.v v8, (a1) ; RV64-FAST-NEXT: vse64.v v8, (a0) ; RV64-FAST-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr %a, i8 0, i64 64, i1 0) @@ -1143,15 +646,16 @@ define void @aligned_bzero_16(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_16: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: vsetivli zero, 4, e32, m1, ta, ma +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_16: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: sd zero, 8(a0) -; RV64-BOTH-NEXT: sd zero, 0(a0) +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 16 %a, i8 0, i64 16, i1 0) ret void @@ -1160,15 +664,19 @@ define void @aligned_bzero_32(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_32: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_32: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 32 %a, i8 0, i64 32, i1 0) @@ -1178,15 +686,27 @@ define void @aligned_bzero_64(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_64: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; 
RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_64: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 64, i1 0) @@ -1197,16 +717,28 @@ ; RV32-BOTH-LABEL: aligned_bzero_66: ; RV32-BOTH: # %bb.0: ; RV32-BOTH-NEXT: sh zero, 64(a0) -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_66: ; RV64-BOTH: # %bb.0: ; RV64-BOTH-NEXT: sh zero, 64(a0) -; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 66, i1 0) @@ -1216,23 +748,35 @@ define void @aligned_bzero_96(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_96: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: addi a1, a0, 64 -; RV32-BOTH-NEXT: vsetivli zero, 8, e32, m2, ta, ma -; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a1) -; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma +; RV32-BOTH-NEXT: addi a1, a0, 80 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 64 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_96: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: addi a1, a0, 64 -; RV64-BOTH-NEXT: vsetivli zero, 4, e64, m2, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 80 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 ; RV64-BOTH-NEXT: vse64.v v8, (a1) -; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma -; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: addi a1, a0, 64 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse64.v v8, 
(a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 96, i1 0) @@ -1242,16 +786,43 @@ define void @aligned_bzero_128(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_128: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: li a1, 32 -; RV32-BOTH-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-BOTH-NEXT: addi a1, a0, 112 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 96 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 80 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 64 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_128: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 112 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 96 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 80 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 64 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 128, i1 0) @@ -1261,20 +832,75 @@ define void @aligned_bzero_256(ptr %a) nounwind { ; RV32-BOTH-LABEL: aligned_bzero_256: ; RV32-BOTH: # %bb.0: -; RV32-BOTH-NEXT: li a1, 32 -; RV32-BOTH-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; RV32-BOTH-NEXT: addi a1, a0, 240 +; RV32-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV32-BOTH-NEXT: vmv.v.i v8, 0 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 224 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 208 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 192 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 176 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 160 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 144 +; RV32-BOTH-NEXT: vse64.v v8, (a1) ; RV32-BOTH-NEXT: addi a1, a0, 128 -; RV32-BOTH-NEXT: vse32.v v8, (a1) -; RV32-BOTH-NEXT: vse32.v v8, (a0) +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 112 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 96 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 80 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 64 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 48 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 32 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: addi a1, a0, 16 +; RV32-BOTH-NEXT: vse64.v v8, (a1) +; RV32-BOTH-NEXT: vse64.v v8, (a0) ; RV32-BOTH-NEXT: ret ; ; RV64-BOTH-LABEL: aligned_bzero_256: ; RV64-BOTH: # %bb.0: -; RV64-BOTH-NEXT: addi a1, a0, 128 -; 
RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma +; RV64-BOTH-NEXT: addi a1, a0, 240 +; RV64-BOTH-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; RV64-BOTH-NEXT: vmv.v.i v8, 0 ; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 224 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 208 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 192 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 176 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 160 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 144 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 128 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 112 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 96 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 80 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 64 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 48 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 32 +; RV64-BOTH-NEXT: vse64.v v8, (a1) +; RV64-BOTH-NEXT: addi a1, a0, 16 +; RV64-BOTH-NEXT: vse64.v v8, (a1) ; RV64-BOTH-NEXT: vse64.v v8, (a0) ; RV64-BOTH-NEXT: ret tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 256, i1 0) diff --git a/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll b/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll --- a/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll +++ b/llvm/test/CodeGen/RISCV/rvv/rvv-out-arguments.ll @@ -72,22 +72,21 @@ ; CHECK-NEXT: slli a0, a0, 3 ; CHECK-NEXT: sub sp, sp, a0 ; CHECK-NEXT: sw zero, -36(s0) -; CHECK-NEXT: sd zero, -48(s0) -; CHECK-NEXT: sd zero, -56(s0) -; CHECK-NEXT: vsetivli a0, 4, e32, m8, ta, ma -; CHECK-NEXT: sd a0, -64(s0) -; CHECK-NEXT: ld a0, -64(s0) -; CHECK-NEXT: addi a1, s0, -56 -; CHECK-NEXT: vsetvli zero, a0, e32, m8, ta, ma -; CHECK-NEXT: vle32.v v8, (a1) +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vmv.v.i v8, 0 +; CHECK-NEXT: addi a0, s0, -64 +; CHECK-NEXT: vse64.v v8, (a0) +; CHECK-NEXT: vsetivli a1, 4, e32, m8, ta, ma +; CHECK-NEXT: sd a1, -72(s0) +; CHECK-NEXT: ld a1, -72(s0) +; CHECK-NEXT: vsetvli zero, a1, e32, m8, ta, ma +; CHECK-NEXT: vle32.v v8, (a0) ; CHECK-NEXT: csrr s1, vlenb ; CHECK-NEXT: slli s1, s1, 3 ; CHECK-NEXT: sub s1, s0, s1 ; CHECK-NEXT: addi s1, s1, -112 ; CHECK-NEXT: vs8r.v v8, (s1) ; CHECK-NEXT: li a0, 1 -; CHECK-NEXT: sw a0, -68(s0) -; CHECK-NEXT: sw a0, -72(s0) ; CHECK-NEXT: sw a0, -76(s0) ; CHECK-NEXT: sw a0, -80(s0) ; CHECK-NEXT: sw a0, -84(s0) @@ -96,33 +95,35 @@ ; CHECK-NEXT: sw a0, -96(s0) ; CHECK-NEXT: sw a0, -100(s0) ; CHECK-NEXT: sw a0, -104(s0) -; CHECK-NEXT: lw a0, -68(s0) -; CHECK-NEXT: lw a1, -72(s0) +; CHECK-NEXT: sw a0, -108(s0) +; CHECK-NEXT: sw a0, -112(s0) +; CHECK-NEXT: lw a0, -76(s0) +; CHECK-NEXT: lw a1, -80(s0) ; CHECK-NEXT: vl8re32.v v8, (s1) -; CHECK-NEXT: lw a2, -76(s0) -; CHECK-NEXT: lw a3, -80(s0) -; CHECK-NEXT: lw a4, -84(s0) -; CHECK-NEXT: lw a5, -88(s0) -; CHECK-NEXT: lw a6, -92(s0) -; CHECK-NEXT: lw a7, -96(s0) -; CHECK-NEXT: lw t0, -100(s0) -; CHECK-NEXT: lw t1, -104(s0) +; CHECK-NEXT: lw a2, -84(s0) +; CHECK-NEXT: lw a3, -88(s0) +; CHECK-NEXT: lw a4, -92(s0) +; CHECK-NEXT: lw a5, -96(s0) +; CHECK-NEXT: lw a6, -100(s0) +; CHECK-NEXT: lw a7, -104(s0) +; CHECK-NEXT: lw t0, -108(s0) +; CHECK-NEXT: lw t1, -112(s0) ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: sd t1, 8(sp) ; CHECK-NEXT: sd t0, 0(sp) ; CHECK-NEXT: call lots_args ; CHECK-NEXT: addi 
sp, sp, 16 -; CHECK-NEXT: lw a0, -68(s0) -; CHECK-NEXT: lw a1, -72(s0) +; CHECK-NEXT: lw a0, -76(s0) +; CHECK-NEXT: lw a1, -80(s0) ; CHECK-NEXT: vl8re32.v v8, (s1) -; CHECK-NEXT: lw a2, -76(s0) -; CHECK-NEXT: lw a3, -80(s0) -; CHECK-NEXT: lw a4, -84(s0) -; CHECK-NEXT: lw a5, -88(s0) -; CHECK-NEXT: lw a6, -92(s0) -; CHECK-NEXT: lw a7, -96(s0) -; CHECK-NEXT: lw t0, -100(s0) -; CHECK-NEXT: lw t1, -104(s0) +; CHECK-NEXT: lw a2, -84(s0) +; CHECK-NEXT: lw a3, -88(s0) +; CHECK-NEXT: lw a4, -92(s0) +; CHECK-NEXT: lw a5, -96(s0) +; CHECK-NEXT: lw a6, -100(s0) +; CHECK-NEXT: lw a7, -104(s0) +; CHECK-NEXT: lw t0, -108(s0) +; CHECK-NEXT: lw t1, -112(s0) ; CHECK-NEXT: addi sp, sp, -16 ; CHECK-NEXT: sd t1, 8(sp) ; CHECK-NEXT: sd t0, 0(sp) diff --git a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll --- a/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll +++ b/llvm/test/CodeGen/RISCV/rvv/wrong-chain-fixed-load.ll @@ -9,14 +9,14 @@ ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: lui a0, %hi(c) ; CHECK-NEXT: addi a0, a0, %lo(c) -; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma -; CHECK-NEXT: vle64.v v8, (a0) ; CHECK-NEXT: addi a1, a0, 16 -; CHECK-NEXT: vle64.v v9, (a1) -; CHECK-NEXT: addi a1, a0, 8 +; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma +; CHECK-NEXT: vle64.v v8, (a1) +; CHECK-NEXT: addi a1, a0, 24 ; CHECK-NEXT: vse64.v v8, (a1) -; CHECK-NEXT: addi a0, a0, 24 -; CHECK-NEXT: vse64.v v9, (a0) +; CHECK-NEXT: vle64.v v8, (a0) +; CHECK-NEXT: addi a0, a0, 8 +; CHECK-NEXT: vse64.v v8, (a0) ; CHECK-NEXT: ret entry: ; this thing is "__builtin_memmove(&c[1], &c[0], sizeof(c[0]) * 4);"