Index: llvm/include/llvm/CodeGen/TargetLowering.h
===================================================================
--- llvm/include/llvm/CodeGen/TargetLowering.h
+++ llvm/include/llvm/CodeGen/TargetLowering.h
@@ -1820,14 +1820,14 @@
   /// It returns EVT::Other if the type should be determined using generic
   /// target-independent logic.
   virtual EVT
-  getOptimalMemOpType(const MemOp &Op,
+  getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                       const AttributeList & /*FuncAttributes*/) const {
     return MVT::Other;
   }
   /// LLT returning variant.
   virtual LLT
-  getOptimalMemOpLLT(const MemOp &Op,
+  getOptimalMemOpLLT(LLVMContext &Context, const MemOp &Op,
                      const AttributeList & /*FuncAttributes*/) const {
     return LLT();
   }
@@ -3753,8 +3753,9 @@
   /// It returns the types of the sequence of memory ops to perform
   /// memset / memcpy by reference.
   virtual bool
-  findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit,
-                           const MemOp &Op, unsigned DstAS, unsigned SrcAS,
+  findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps,
+                           unsigned Limit, const MemOp &Op, unsigned DstAS,
+                           unsigned SrcAS,
                            const AttributeList &FuncAttributes) const;
   /// Check to see if the specified operand of the specified instruction is a
Index: llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
===================================================================
--- llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -7587,7 +7587,7 @@
 // Returns a list of types to use for memory op lowering in MemOps. A partial
 // port of findOptimalMemOpLowering in TargetLowering.
-static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
+static bool findGISelOptimalMemOpLowering(LLVMContext &Context, std::vector<LLT> &MemOps,
                                           unsigned Limit, const MemOp &Op,
                                           unsigned DstAS, unsigned SrcAS,
                                           const AttributeList &FuncAttributes,
@@ -7595,7 +7595,7 @@
   if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
     return false;
-  LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
+  LLT Ty = TLI.getOptimalMemOpLLT(Context, Op, FuncAttributes);
   if (Ty == LLT()) {
     // Use the largest scalar type whose alignment constraints are satisfied.
@@ -7720,7 +7720,7 @@
   auto ValVRegAndVal = getIConstantVRegValWithLookThrough(Val, MRI);
   bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
-  if (!findGISelOptimalMemOpLowering(MemOps, Limit,
+  if (!findGISelOptimalMemOpLowering(C, MemOps, Limit,
                                      MemOp::Set(KnownLen, DstAlignCanChange,
                                                 Alignment,
                                                 /*IsZeroMemset=*/IsZeroVal,
@@ -7875,7 +7875,7 @@
   MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo();
   MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
-  if (!findGISelOptimalMemOpLowering(
+  if (!findGISelOptimalMemOpLowering(C,
           MemOps, Limit,
           MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                       IsVolatile),
@@ -7983,7 +7983,7 @@
   // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due
   // to a bug in it's findOptimalMemOpLowering implementation. For now do the
   // same thing here.
-  if (!findGISelOptimalMemOpLowering(
+  if (!findGISelOptimalMemOpLowering(C,
          MemOps, Limit,
          MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
                      /*IsVolatile*/ true),
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7264,7 +7264,7 @@
                              /*IsZeroMemset*/ true, isVol)
                 : MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
                               isVol, CopyFromConstant);
-  if (!TLI.findOptimalMemOpLowering(
+  if (!TLI.findOptimalMemOpLowering(*DAG.getContext(),
          MemOps, Limit, Op, DstPtrInfo.getAddrSpace(),
          SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
    return SDValue();
@@ -7351,7 +7351,7 @@
       // thing to do is generate a LoadExt/StoreTrunc pair. These simplify
       // to Load/Store if NVT==VT.
       // FIXME does the case above also need this?
-      EVT NVT = TLI.getTypeToTransformTo(C, VT);
+      EVT NVT = VT.isVector() ? VT : TLI.getTypeToTransformTo(C, VT);
       assert(NVT.bitsGE(VT));
       bool isDereferenceable =
@@ -7456,7 +7456,7 @@
     SrcAlign = Alignment;
   assert(SrcAlign && "SrcAlign must be set");
   unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
-  if (!TLI.findOptimalMemOpLowering(
+  if (!TLI.findOptimalMemOpLowering(*DAG.getContext(),
          MemOps, Limit,
          MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
                      /*IsVolatile*/ true),
@@ -7575,7 +7575,7 @@
   bool IsZeroVal = isNullConstant(Src);
   unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
-  if (!TLI.findOptimalMemOpLowering(
+  if (!TLI.findOptimalMemOpLowering(*DAG.getContext(),
          MemOps, Limit,
          MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
          DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
Index: llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -195,13 +195,13 @@
 }
 bool TargetLowering::findOptimalMemOpLowering(
-    std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
-    unsigned SrcAS, const AttributeList &FuncAttributes) const {
+    LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op,
+    unsigned DstAS, unsigned SrcAS, const AttributeList &FuncAttributes) const {
   if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
       Op.getSrcAlign() < Op.getDstAlign())
     return false;
-  EVT VT = getOptimalMemOpType(Op, FuncAttributes);
+  EVT VT = getOptimalMemOpType(Context, Op, FuncAttributes);
   if (VT == MVT::Other) {
     // Use the largest integer type whose alignment constraints are satisfied.
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -665,10 +665,10 @@
   bool shouldConsiderGEPOffsetSplit() const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
-  LLT getOptimalMemOpLLT(const MemOp &Op,
+  LLT getOptimalMemOpLLT(LLVMContext &Context, const MemOp &Op,
                          const AttributeList &FuncAttributes) const override;
   /// Return true if the addressing mode represented by AM is legal for this
Index: llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -15330,7 +15330,7 @@
 }
 EVT AArch64TargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+    LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
@@ -15360,7 +15360,7 @@
 }
 LLT AArch64TargetLowering::getOptimalMemOpLLT(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+    LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   bool CanImplicitFloat = !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat);
   bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
   bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -329,7 +329,7 @@
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
       unsigned *IsFast = nullptr) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   bool isMemOpUniform(const SDNode *N) const;
Index: llvm/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1633,7 +1633,7 @@
 }
 EVT SITargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   // FIXME: Should account for address space here.
   // The default fallback uses the private pointer size as a guess for a type to
Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -447,7 +447,7 @@
                               MachineMemOperand::Flags Flags,
                               unsigned *Fast) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -19083,7 +19083,7 @@
 EVT ARMTargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   // See if we can use NEON instructions for this...
   if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
       !FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -1188,6 +1188,7 @@
   // get the cost for this memcpy.
   std::vector<EVT> MemOps;
   if (getTLI()->findOptimalMemOpLowering(
+          I->getContext(),
           MemOps, Limit, MOp, DstAddrSpace, SrcAddrSpace,
           F->getAttributes()))
     return MemOps.size() * Factor;
Index: llvm/lib/Target/BPF/BPFISelLowering.h
===================================================================
--- llvm/lib/Target/BPF/BPFISelLowering.h
+++ llvm/lib/Target/BPF/BPFISelLowering.h
@@ -105,7 +105,7 @@
   void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
                           SelectionDAG &DAG) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override {
     return Op.size() >= 8 ? MVT::i64 : MVT::i32;
   }
Index: llvm/lib/Target/Hexagon/HexagonISelLowering.h
===================================================================
--- llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -322,7 +322,7 @@
   /// the immediate into a register.
   bool isLegalICmpImmediate(int64_t Imm) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
Index: llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
===================================================================
--- llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3763,7 +3763,7 @@
 /// does not need to be loaded. It returns EVT::Other if the type should be
 /// determined using generic target-independent logic.
 EVT HexagonTargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   if (Op.size() >= 8 && Op.isAligned(Align(8)))
     return MVT::i64;
   if (Op.size() >= 4 && Op.isAligned(Align(4)))
Index: llvm/lib/Target/Mips/MipsISelLowering.h
===================================================================
--- llvm/lib/Target/Mips/MipsISelLowering.h
+++ llvm/lib/Target/Mips/MipsISelLowering.h
@@ -663,7 +663,7 @@
   bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   /// isFPImmLegal - Returns true if the target can instruction select the
Index: llvm/lib/Target/Mips/MipsISelLowering.cpp
===================================================================
--- llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -4320,7 +4320,7 @@
 }
 EVT MipsTargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   if (Subtarget.hasMips64())
     return MVT::i64;
Index: llvm/lib/Target/PowerPC/PPCISelLowering.h
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -1056,7 +1056,7 @@
   /// It returns EVT::Other if the type should be determined using generic
   /// target-independent logic.
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   /// Is unaligned memory access allowed for the given type, and is it fast
Index: llvm/lib/Target/PowerPC/PPCISelLowering.cpp
===================================================================
--- llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -17077,7 +17077,7 @@
 /// It returns EVT::Other if the type should be determined using generic
 /// target-independent logic.
 EVT PPCTargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
     // We should use Altivec/VSX loads and stores when available. For unaligned
     // addresses, unaligned VSX loads are only fast starting with the P8.
Index: llvm/lib/Target/RISCV/RISCVISelLowering.h
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -694,9 +694,13 @@
       MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
       unsigned *Fast = nullptr) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
+  bool findOptimalMemOpLowering(
+LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
+      unsigned SrcAS, const AttributeList &FuncAttributes) const override;
+
   bool splitValueIntoRegisterParts(
       SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
       unsigned NumParts, MVT PartVT, std::optional<CallingConv::ID> CC)
Index: llvm/lib/Target/RISCV/RISCVISelLowering.cpp
===================================================================
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -16975,18 +16975,12 @@
 }
-EVT RISCVTargetLowering::getOptimalMemOpType(const MemOp &Op,
+EVT RISCVTargetLowering::getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                                              const AttributeList &FuncAttributes) const {
   if (!Subtarget.hasVInstructions())
     return MVT::Other;
-  // Round down to the next smallest LMUL register class. This is the largest
-  // single load/store we can perform. TODO: For non-lmul sized operations
-  // smaller than LMUL8, can we do better by using VL?
-  unsigned VLen = Subtarget.getRealMinVLen()/8;
-  unsigned Size = NextPowerOf2(Op.size() & ~(VLen - 1))/2;
-  Size = std::min(Size, 8*VLen);
-  if (Size == 0)
+  if (Op.size() < 16)
     // TODO: Figure out short memops. For the moment, do the default thing
     // which ends up using scalar sequences.
     return MVT::Other;
@@ -17008,7 +17002,70 @@
         PreferredVT.getStoreSize() > Op.getSrcAlign().value())
       PreferredVT = MVT::i8;
   }
-  return MVT::getVectorVT(PreferredVT, Size/PreferredVT.getStoreSize());
+
+  // Version 2 - VL toggle for size < LMUL8 + overlap beyond
+  if (Op.size() < Subtarget.getRealMinVLen()) {
+    if (Op.size() % PreferredVT.getStoreSize() == 0)
+      return EVT::getVectorVT(Context, PreferredVT, Op.size()/PreferredVT.getStoreSize());
+    return EVT::getVectorVT(Context, MVT::i8, Op.size());
+  }
+  return MVT::getVectorVT(PreferredVT, Subtarget.getRealMinVLen()/PreferredVT.getStoreSize());
+}
+
+bool RISCVTargetLowering::findOptimalMemOpLowering(
+LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
+    unsigned SrcAS, const AttributeList &FuncAttributes) const {
+
+  // Version 0 (Without this change at all) - Round down to LMUL + overlap if needed
+
+#if 1
+  // Version 1 - Full LMUL8 chunks + VL-toggle tail
+  if (Op.size() > 16 && Subtarget.hasVInstructions()) {
+    // Prefer i8 for non-zero memset as it allows us to avoid materializing
+    // a large scalar constant and instead use vmv.v.x/i to do the
+    // broadcast. For everything else, prefer XLenVT to minimize VL and thus
+    // maximize the chance we can encode the size in the vsetvli.
+    MVT PreferredVT =
+        (Op.isMemset() && !Op.isZeroMemset()) ? MVT::i8 : Subtarget.getXLenVT();
+
+    // Do we have sufficient alignment for our preferred VT? If not, revert
+    // to byte aligned accesses.
+    if (PreferredVT != MVT::i8 && !Subtarget.enableUnalignedVectorMem()) {
+      if (Op.isFixedDstAlign() &&
+          PreferredVT.getStoreSize() > Op.getDstAlign().value())
+        PreferredVT = MVT::i8;
+      if (Op.isMemcpy() &&
+          PreferredVT.getStoreSize() > Op.getSrcAlign().value())
+        PreferredVT = MVT::i8;
+    }
+
+    unsigned BytesLeft = Op.size();
+    const unsigned FullChunkSize = Subtarget.getRealMinVLen();
+    const unsigned FullChunkElems = FullChunkSize/PreferredVT.getStoreSize();
+    while (BytesLeft >= FullChunkSize) {
+      MemOps.push_back(EVT::getVectorVT(Context, PreferredVT, FullChunkElems));
+      BytesLeft -= FullChunkSize;
+    }
+    if (BytesLeft != 0) {
+      if (BytesLeft % PreferredVT.getStoreSize() == 0)
+        MemOps.push_back(EVT::getVectorVT(Context, PreferredVT, BytesLeft/PreferredVT.getStoreSize()));
+      else
+        MemOps.push_back(EVT::getVectorVT(Context, MVT::i8, BytesLeft));
+    }
+    return true;
+  }
+#endif
+
+  // Other Variants Possible:
+  // - VL control for size < LMUL8 + LMUL8 chunks + overlap
+  // - LMUL8 chunks + overlap via minimum-LMUL
+  // - LMUL8 chunks + some selection of VL and LMUL overlap by size?
+
+  // Note: We may not want to be using LMUL8 as our chunk size here, but that's a
+  // largely separate task.
+
+  return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op, DstAS,
+                                                  SrcAS, FuncAttributes);
 }
 bool RISCVTargetLowering::splitValueIntoRegisterParts(
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.h
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -461,10 +461,10 @@
                               MachineMemOperand::Flags Flags,
                               unsigned *Fast) const override;
   bool
-  findOptimalMemOpLowering(std::vector<EVT> &MemOps, unsigned Limit,
+  findOptimalMemOpLowering(LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit,
                            const MemOp &Op, unsigned DstAS, unsigned SrcAS,
                            const AttributeList &FuncAttributes) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   bool isTruncateFree(Type *, Type *) const override;
   bool isTruncateFree(EVT, EVT) const override;
Index: llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
===================================================================
--- llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1004,7 +1004,7 @@
 }
 bool SystemZTargetLowering::findOptimalMemOpLowering(
-    std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
+LLVMContext &Context, std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
   const int MVCFastLen = 16;
@@ -1018,11 +1018,11 @@
       return false; // Memset zero: Use XC
   }
-  return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
+  return TargetLowering::findOptimalMemOpLowering(Context, MemOps, Limit, Op, DstAS,
                                                   SrcAS, FuncAttributes);
 }
-EVT SystemZTargetLowering::getOptimalMemOpType(const MemOp &Op,
+EVT SystemZTargetLowering::getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                                                const AttributeList &FuncAttributes) const {
   return Subtarget.hasVector() ? MVT::v2i64 : MVT::Other;
 }
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -1004,7 +1004,7 @@
   uint64_t getByValTypeAlignment(Type *Ty,
                                  const DataLayout &DL) const override;
-  EVT getOptimalMemOpType(const MemOp &Op,
+  EVT getOptimalMemOpType(LLVMContext &Context, const MemOp &Op,
                           const AttributeList &FuncAttributes) const override;
   /// Returns true if it's safe to use load / store of the
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2773,7 +2773,7 @@
 /// For vector ops we check that the overall size isn't larger than our
 /// preferred vector width.
 EVT X86TargetLowering::getOptimalMemOpType(
-    const MemOp &Op, const AttributeList &FuncAttributes) const {
+LLVMContext &Context, const MemOp &Op, const AttributeList &FuncAttributes) const {
   if (!FuncAttributes.hasFnAttr(Attribute::NoImplicitFloat)) {
     if (Op.size() >= 16 &&
         (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
Index: llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
+++ llvm/test/CodeGen/RISCV/rvv/memcpy-inline.ll
@@ -307,49 +307,19 @@
 }
 define void @unaligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-LABEL: unaligned_memcpy31:
-; RV32: # %bb.0: # %entry
-; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: addi a1, a1, 15
-; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 15
-; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: unaligned_memcpy31:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: addi a1, a1, 15
-; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: addi a0, a0, 15
-; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: ret
-;
-; RV32-FAST-LABEL: unaligned_memcpy31:
-; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: addi a1, a1, 15
-; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: addi a0, a0, 15
-; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: ret
+; RV32-BOTH-LABEL: unaligned_memcpy31:
+; RV32-BOTH: # %bb.0: # %entry
+; RV32-BOTH-NEXT: vsetivli zero, 31, e8, m2, ta, ma
+; RV32-BOTH-NEXT: vle8.v v8, (a1)
+; RV32-BOTH-NEXT: vse8.v v8, (a0)
+; RV32-BOTH-NEXT: ret
 ;
-; RV64-FAST-LABEL: unaligned_memcpy31:
-; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: addi a1, a1, 15
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: addi a0, a0, 15
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: ret
+; RV64-BOTH-LABEL: unaligned_memcpy31:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: vsetivli zero, 31, e8, m2, ta, ma
+; RV64-BOTH-NEXT: vle8.v v8, (a1)
+; RV64-BOTH-NEXT: vse8.v v8, (a0)
+; RV64-BOTH-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 31, i1 false)
   ret void
@@ -428,48 +398,32 @@
 define void @unaligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-LABEL: unaligned_memcpy96:
 ; RV32: # %bb.0: # %entry
-; RV32-NEXT: li a2, 64
-; RV32-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: addi a1, a1, 32
+; RV32-NEXT: li a2, 96
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 32
 ; RV32-NEXT: vse8.v v8, (a0)
 ; RV32-NEXT: ret
 ;
 ; RV64-LABEL: unaligned_memcpy96:
 ; RV64: # %bb.0: # %entry
-; RV64-NEXT: li a2, 64
-; RV64-NEXT: vsetvli zero, a2, e8, m4, ta, ma
-; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: addi a1, a1, 32
+; RV64-NEXT: li a2, 96
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: addi a0, a0, 32
 ; RV64-NEXT: vse8.v v8, (a0)
 ; RV64-NEXT: ret
 ;
 ; RV32-FAST-LABEL: unaligned_memcpy96:
 ; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-FAST-NEXT: vsetivli zero, 24, e32, m8, ta, ma
 ; RV32-FAST-NEXT: vle32.v v8, (a1)
 ; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: addi a1, a1, 32
-; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: addi a0, a0, 32
-; RV32-FAST-NEXT: vse32.v v8, (a0)
 ; RV32-FAST-NEXT: ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy96:
 ; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-FAST-NEXT: vsetivli zero, 12, e64, m8, ta, ma
 ; RV64-FAST-NEXT: vle64.v v8, (a1)
 ; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: addi a1, a1, 32
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: addi a0, a0, 32
-; RV64-FAST-NEXT: vse64.v v8, (a0)
 ; RV64-FAST-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 96, i1 false)
@@ -519,9 +473,11 @@
 ; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT: vle8.v v8, (a1)
 ; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: addi a1, a1, 68
+; RV32-NEXT: addi a1, a1, 128
+; RV32-NEXT: li a2, 68
+; RV32-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 68
+; RV32-NEXT: addi a0, a0, 128
 ; RV32-NEXT: vse8.v v8, (a0)
 ; RV32-NEXT: ret
 ;
@@ -531,9 +487,11 @@
 ; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT: vle8.v v8, (a1)
 ; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: addi a1, a1, 68
+; RV64-NEXT: addi a1, a1, 128
+; RV64-NEXT: li a2, 68
+; RV64-NEXT: vsetvli zero, a2, e8, m8, ta, ma
 ; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: addi a0, a0, 68
+; RV64-NEXT: addi a0, a0, 128
 ; RV64-NEXT: vse8.v v8, (a0)
 ; RV64-NEXT: ret
 ;
@@ -543,21 +501,24 @@
 ; RV32-FAST-NEXT: vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-FAST-NEXT: vle32.v v8, (a1)
 ; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: addi a1, a1, 68
+; RV32-FAST-NEXT: addi a1, a1, 128
+; RV32-FAST-NEXT: vsetivli zero, 17, e32, m8, ta, ma
 ; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: addi a0, a0, 68
+; RV32-FAST-NEXT: addi a0, a0, 128
 ; RV32-FAST-NEXT: vse32.v v8, (a0)
 ; RV32-FAST-NEXT: ret
 ;
 ; RV64-FAST-LABEL: unaligned_memcpy196:
 ; RV64-FAST: # %bb.0: # %entry
+; RV64-FAST-NEXT: addi a2, a1, 128
+; RV64-FAST-NEXT: li a3, 68
+; RV64-FAST-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; RV64-FAST-NEXT: vle8.v v8, (a2)
+; RV64-FAST-NEXT: addi a2, a0, 128
+; RV64-FAST-NEXT: vse8.v v8, (a2)
 ; RV64-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-FAST-NEXT: vle64.v v8, (a1)
 ; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: addi a1, a1, 68
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: addi a0, a0, 68
-; RV64-FAST-NEXT: vse64.v v8, (a0)
 ; RV64-FAST-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr %dest, ptr %src, i64 196, i1 false)
@@ -803,51 +764,19 @@
 }
 define void @aligned_memcpy31(ptr nocapture %dest, ptr %src) nounwind {
-; RV32-LABEL: aligned_memcpy31:
-; RV32: # %bb.0: # %entry
-; RV32-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-NEXT: vle32.v v8, (a1)
-; RV32-NEXT: vse32.v v8, (a0)
-; RV32-NEXT: addi a1, a1, 15
-; RV32-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV32-NEXT: vle8.v v8, (a1)
-; RV32-NEXT: addi a0, a0, 15
-; RV32-NEXT: vse8.v v8, (a0)
-; RV32-NEXT: ret
-;
-; RV64-LABEL: aligned_memcpy31:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-NEXT: vle64.v v8, (a1)
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: addi a1, a1, 15
-; RV64-NEXT: vsetivli zero, 16, e8, m1, ta, ma
-; RV64-NEXT: vle8.v v8, (a1)
-; RV64-NEXT: addi a0, a0, 15
-; RV64-NEXT: vse8.v v8, (a0)
-; RV64-NEXT: ret
-;
-; RV32-FAST-LABEL: aligned_memcpy31:
-; RV32-FAST: # %bb.0: # %entry
-; RV32-FAST-NEXT: vsetivli zero, 4, e32, m1, ta, ma
-; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: addi a1, a1, 15
-; RV32-FAST-NEXT: vle32.v v8, (a1)
-; RV32-FAST-NEXT: addi a0, a0, 15
-; RV32-FAST-NEXT: vse32.v v8, (a0)
-; RV32-FAST-NEXT: ret
+; RV32-BOTH-LABEL: aligned_memcpy31:
+; RV32-BOTH: # %bb.0: # %entry
+; RV32-BOTH-NEXT: vsetivli zero, 31, e8, m2, ta, ma
+; RV32-BOTH-NEXT: vle8.v v8, (a1)
+; RV32-BOTH-NEXT: vse8.v v8, (a0)
+; RV32-BOTH-NEXT: ret
 ;
-; RV64-FAST-LABEL: aligned_memcpy31:
-; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: vsetivli zero, 2, e64, m1, ta, ma
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: addi a1, a1, 15
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: addi a0, a0, 15
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: ret
+; RV64-BOTH-LABEL: aligned_memcpy31:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: vsetivli zero, 31, e8, m2, ta, ma
+; RV64-BOTH-NEXT: vle8.v v8, (a1)
+; RV64-BOTH-NEXT: vse8.v v8, (a0)
+; RV64-BOTH-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 31, i1 false)
   ret void
@@ -894,24 +823,16 @@
 define void @aligned_memcpy96(ptr nocapture %dest, ptr %src) nounwind {
 ; RV32-BOTH-LABEL: aligned_memcpy96:
 ; RV32-BOTH: # %bb.0: # %entry
-; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma
-; RV32-BOTH-NEXT: vle32.v v8, (a1)
-; RV32-BOTH-NEXT: vse32.v v8, (a0)
-; RV32-BOTH-NEXT: addi a1, a1, 32
+; RV32-BOTH-NEXT: vsetivli zero, 24, e32, m8, ta, ma
 ; RV32-BOTH-NEXT: vle32.v v8, (a1)
-; RV32-BOTH-NEXT: addi a0, a0, 32
 ; RV32-BOTH-NEXT: vse32.v v8, (a0)
 ; RV32-BOTH-NEXT: ret
 ;
 ; RV64-BOTH-LABEL: aligned_memcpy96:
 ; RV64-BOTH: # %bb.0: # %entry
-; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-BOTH-NEXT: vsetivli zero, 12, e64, m8, ta, ma
 ; RV64-BOTH-NEXT: vle64.v v8, (a1)
 ; RV64-BOTH-NEXT: vse64.v v8, (a0)
-; RV64-BOTH-NEXT: addi a1, a1, 32
-; RV64-BOTH-NEXT: vle64.v v8, (a1)
-; RV64-BOTH-NEXT: addi a0, a0, 32
-; RV64-BOTH-NEXT: vse64.v v8, (a0)
 ; RV64-BOTH-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 96, i1 false)
@@ -945,35 +866,25 @@
 ; RV32-BOTH-NEXT: vsetvli zero, a2, e32, m8, ta, ma
 ; RV32-BOTH-NEXT: vle32.v v8, (a1)
 ; RV32-BOTH-NEXT: vse32.v v8, (a0)
-; RV32-BOTH-NEXT: addi a1, a1, 68
+; RV32-BOTH-NEXT: addi a1, a1, 128
+; RV32-BOTH-NEXT: vsetivli zero, 17, e32, m8, ta, ma
 ; RV32-BOTH-NEXT: vle32.v v8, (a1)
-; RV32-BOTH-NEXT: addi a0, a0, 68
+; RV32-BOTH-NEXT: addi a0, a0, 128
 ; RV32-BOTH-NEXT: vse32.v v8, (a0)
 ; RV32-BOTH-NEXT: ret
 ;
-; RV64-LABEL: aligned_memcpy196:
-; RV64: # %bb.0: # %entry
-; RV64-NEXT: addi a2, a1, 68
-; RV64-NEXT: li a3, 128
-; RV64-NEXT: vsetvli zero, a3, e8, m8, ta, ma
-; RV64-NEXT: vle8.v v8, (a2)
-; RV64-NEXT: addi a2, a0, 68
-; RV64-NEXT: vse8.v v8, (a2)
-; RV64-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-NEXT: vle64.v v8, (a1)
-; RV64-NEXT: vse64.v v8, (a0)
-; RV64-NEXT: ret
-;
-; RV64-FAST-LABEL: aligned_memcpy196:
-; RV64-FAST: # %bb.0: # %entry
-; RV64-FAST-NEXT: vsetivli zero, 16, e64, m8, ta, ma
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: addi a1, a1, 68
-; RV64-FAST-NEXT: vle64.v v8, (a1)
-; RV64-FAST-NEXT: addi a0, a0, 68
-; RV64-FAST-NEXT: vse64.v v8, (a0)
-; RV64-FAST-NEXT: ret
+; RV64-BOTH-LABEL: aligned_memcpy196:
+; RV64-BOTH: # %bb.0: # %entry
+; RV64-BOTH-NEXT: addi a2, a1, 128
+; RV64-BOTH-NEXT: li a3, 68
+; RV64-BOTH-NEXT: vsetvli zero, a3, e8, m8, ta, ma
+; RV64-BOTH-NEXT: vle8.v v8, (a2)
+; RV64-BOTH-NEXT: addi a2, a0, 128
+; RV64-BOTH-NEXT: vse8.v v8, (a2)
+; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
+; RV64-BOTH-NEXT: vle64.v v8, (a1)
+; RV64-BOTH-NEXT: vse64.v v8, (a0)
+; RV64-BOTH-NEXT: ret
 entry:
   tail call void @llvm.memcpy.inline.p0.p0.i64(ptr align 8 %dest, ptr align 8 %src, i64 196, i1 false)
   ret void
Index: llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
===================================================================
--- llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
+++ llvm/test/CodeGen/RISCV/rvv/memset-inline.ll
@@ -648,18 +648,22 @@
 define void @aligned_bzero_66(ptr %a) nounwind {
 ; RV32-BOTH-LABEL: aligned_bzero_66:
 ; RV32-BOTH: # %bb.0:
-; RV32-BOTH-NEXT: sh zero, 64(a0)
-; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-BOTH-NEXT: li a1, 128
+; RV32-BOTH-NEXT: vsetvli zero, a1, e8, m8, ta, ma
 ; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse32.v v8, (a0)
+; RV32-BOTH-NEXT: li a1, 66
+; RV32-BOTH-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV32-BOTH-NEXT: vse8.v v8, (a0)
 ; RV32-BOTH-NEXT: ret
 ;
 ; RV64-BOTH-LABEL: aligned_bzero_66:
 ; RV64-BOTH: # %bb.0:
-; RV64-BOTH-NEXT: sh zero, 64(a0)
-; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-BOTH-NEXT: li a1, 128
+; RV64-BOTH-NEXT: vsetvli zero, a1, e8, m8, ta, ma
 ; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a0)
+; RV64-BOTH-NEXT: li a1, 66
+; RV64-BOTH-NEXT: vsetvli zero, a1, e8, m8, ta, ma
+; RV64-BOTH-NEXT: vse8.v v8, (a0)
 ; RV64-BOTH-NEXT: ret
   tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 66, i1 0)
   ret void
@@ -668,19 +672,18 @@
 define void @aligned_bzero_96(ptr %a) nounwind {
 ; RV32-BOTH-LABEL: aligned_bzero_96:
 ; RV32-BOTH: # %bb.0:
-; RV32-BOTH-NEXT: addi a1, a0, 32
-; RV32-BOTH-NEXT: vsetivli zero, 16, e32, m4, ta, ma
+; RV32-BOTH-NEXT: li a1, 32
+; RV32-BOTH-NEXT: vsetvli zero, a1, e32, m8, ta, ma
 ; RV32-BOTH-NEXT: vmv.v.i v8, 0
-; RV32-BOTH-NEXT: vse32.v v8, (a1)
+; RV32-BOTH-NEXT: vsetivli zero, 24, e32, m8, ta, ma
 ; RV32-BOTH-NEXT: vse32.v v8, (a0)
 ; RV32-BOTH-NEXT: ret
 ;
 ; RV64-BOTH-LABEL: aligned_bzero_96:
 ; RV64-BOTH: # %bb.0:
-; RV64-BOTH-NEXT: addi a1, a0, 32
-; RV64-BOTH-NEXT: vsetivli zero, 8, e64, m4, ta, ma
+; RV64-BOTH-NEXT: vsetivli zero, 16, e64, m8, ta, ma
 ; RV64-BOTH-NEXT: vmv.v.i v8, 0
-; RV64-BOTH-NEXT: vse64.v v8, (a1)
+; RV64-BOTH-NEXT: vsetivli zero, 12, e64, m8, ta, ma
 ; RV64-BOTH-NEXT: vse64.v v8, (a0)
 ; RV64-BOTH-NEXT: ret
   tail call void @llvm.memset.inline.p0.i64(ptr align 64 %a, i8 0, i64 96, i1 0)
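
Note (not part of the patch): the following is a minimal standalone sketch of the chunking arithmetic the "Version 1" path in RISCVTargetLowering::findOptimalMemOpLowering performs, kept here only to make the test deltas easier to read. The function name chunkMemOp and the parameters ChunkBytes/ElemBytes are illustrative, not names from the patch; it assumes VLEN=128 and XLen=64, matching the check lines above.

// Computes the byte sizes of the memory ops the Version 1 strategy would emit:
// as many full VLEN-byte (LMUL8) chunks as fit, then one VL-limited tail.
#include <cstdio>
#include <vector>

static std::vector<unsigned> chunkMemOp(unsigned Size, unsigned ChunkBytes,
                                        unsigned ElemBytes) {
  std::vector<unsigned> Chunks;
  unsigned BytesLeft = Size;
  while (BytesLeft >= ChunkBytes) { // full LMUL8 chunks
    Chunks.push_back(ChunkBytes);
    BytesLeft -= ChunkBytes;
  }
  if (BytesLeft != 0) // VL-limited tail; element type falls back to i8 if the
    Chunks.push_back(BytesLeft); // remainder is not a multiple of ElemBytes
  (void)ElemBytes;
  return Chunks;
}

int main() {
  // For a 196-byte copy with a 128-byte LMUL8 chunk (VLEN=128) and 8-byte
  // elements, this prints "128 68": one m8 vle64/vse64 pair plus a 68-byte e8
  // tail, which is the shape of the updated aligned_memcpy196 checks.
  for (unsigned C : chunkMemOp(196, 128, 8))
    std::printf("%u ", C);
  std::printf("\n");
  return 0;
}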