diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerHelper.h @@ -65,9 +65,24 @@ bool matchCombineBr(MachineInstr &MI); bool tryCombineBr(MachineInstr &MI); + /// Optimize the memcpy family of intrinsics (memcpy, memmove, memset), + /// e.g. calls with a constant length. + bool tryCombineMemCpyFamily(MachineInstr &MI); + /// Try to transform \p MI by using all of the above /// combine functions. Returns true if changed. bool tryCombine(MachineInstr &MI); + +private: + // Memcpy family optimization helpers. + bool optimizeMemcpy(MachineInstr &MI, Register Dst, Register Src, + unsigned KnownLen, unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile); + bool optimizeMemmove(MachineInstr &MI, Register Dst, Register Src, + unsigned KnownLen, unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile); + bool optimizeMemset(MachineInstr &MI, Register Dst, Register Val, + unsigned KnownLen, unsigned DstAlign, bool IsVolatile); }; } // namespace llvm diff --git a/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h --- a/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/CombinerInfo.h @@ -27,9 +27,11 @@ class CombinerInfo { public: CombinerInfo(bool AllowIllegalOps, bool ShouldLegalizeIllegal, - LegalizerInfo *LInfo) + LegalizerInfo *LInfo, bool OptEnabled, bool OptSize, + bool MinSize) : IllegalOpsAllowed(AllowIllegalOps), - LegalizeIllegalOps(ShouldLegalizeIllegal), LInfo(LInfo) { + LegalizeIllegalOps(ShouldLegalizeIllegal), LInfo(LInfo), + EnableOpt(OptEnabled), EnableOptSize(OptSize), EnableMinSize(MinSize) { assert(((AllowIllegalOps || !LegalizeIllegalOps) || LInfo) && "Expecting legalizerInfo when illegalops not allowed"); } @@ -43,6 +45,15 @@ bool LegalizeIllegalOps; // TODO: Make use of this. const LegalizerInfo *LInfo; + /// Whether optimizations should be enabled. This is to distinguish between + /// uses of the combiner unconditionally and only when optimizations are + /// specifically enabled. + bool EnableOpt; + /// Whether we're optimizing for size. + bool EnableOptSize; + /// Whether we're optimizing for minsize (-Oz). + bool EnableMinSize; + /// Attempt to combine instructions using MI as the root. /// /// Use Observer to report the creation, modification, and erasure of diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -1426,6 +1426,14 @@ return false; } + /// LLT handling variant. + virtual bool allowsMisalignedMemoryAccesses( + LLT, unsigned AddrSpace = 0, unsigned Align = 1, + MachineMemOperand::Flags Flags = MachineMemOperand::MONone, + bool * /*Fast*/ = nullptr) const { + return false; + } + /// Return true if the target supports a memory access of this type for the /// given address space and alignment. If the access is allowed, the optional /// final parameter returns if the access is also fast (as defined by the @@ -1463,6 +1471,16 @@ return MVT::Other; } + + /// LLT returning variant.
+ virtual LLT + getOptimalMemOpLLT(uint64_t /*Size*/, unsigned /*DstAlign*/, + unsigned /*SrcAlign*/, bool /*IsMemset*/, + bool /*ZeroMemset*/, bool /*MemcpyStrSrc*/, + const AttributeList & /*FuncAttributes*/) const { + return LLT(); + } + /// Returns true if it's safe to use load / store of the specified type to /// expand memcpy / memset inline. /// diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -10,9 +10,12 @@ #include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h" #include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h" #include "llvm/CodeGen/GlobalISel/Utils.h" +#include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/CodeGen/TargetLowering.h" +#include "llvm/Target/TargetMachine.h" #define DEBUG_TYPE "gi-combiner" @@ -404,6 +407,522 @@ return true; } +static bool shouldLowerMemFuncForSize(const MachineFunction &MF) { + // On Darwin, -Os means optimize for size without hurting performance, so + // only really optimize for size when -Oz (MinSize) is used. + if (MF.getTarget().getTargetTriple().isOSDarwin()) + return MF.getFunction().hasMinSize(); + return MF.getFunction().hasOptSize(); +} + +// Get a rough equivalent of an MVT for a given LLT. +static MVT getMVTForLLT(LLT Ty) { + if (!Ty.isVector()) + return MVT::getIntegerVT(Ty.getSizeInBits()); + + return MVT::getVectorVT( + MVT::getIntegerVT(Ty.getElementType().getSizeInBits()), + Ty.getNumElements()); +} + +// Returns a list of types to use for memory op lowering in MemOps. A partial +// port of findOptimalMemOpLowering in TargetLowering. +static bool findGISelOptimalMemOpLowering( + std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign, + unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + bool AllowOverlap, unsigned DstAS, unsigned SrcAS, + const AttributeList &FuncAttributes, const TargetLowering &TLI) { + // If 'SrcAlign' is zero, that means the memory operation does not need to + // load the value, i.e. memset or memcpy from constant string. Otherwise, + // it's the inferred alignment of the source. 'DstAlign', on the other hand, + // is the specified alignment of the memory operation. If it is zero, that + // means it's possible to change the alignment of the destination. + // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does + // not need to be loaded. + if (!(SrcAlign == 0 || SrcAlign >= DstAlign)) + return false; + + LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset, + ZeroMemset, MemcpyStrSrc, FuncAttributes); + + if (Ty == LLT()) { + // Use the largest scalar type whose alignment constraints are satisfied. + // We only need to check DstAlign here as SrcAlign is always greater or + // equal to DstAlign (or zero). + Ty = LLT::scalar(64); + while (DstAlign && DstAlign < Ty.getSizeInBytes() && + !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign)) + Ty = LLT::scalar(Ty.getSizeInBytes()); + assert(Ty.getSizeInBits() > 0 && "Could not find valid type"); + // FIXME: check for the largest legal type we can load/store to. + } + + unsigned NumMemOps = 0; + while (Size != 0) { + unsigned TySize = Ty.getSizeInBytes(); + while (TySize > Size) { + // For now, only use non-vector loads / stores for the left-over pieces.
+ LLT NewTy = Ty; + // FIXME: check for mem op safety and legality of the types. Not all of + // SDAGisms map cleanly to GISel concepts. + if (NewTy.isVector()) + NewTy = NewTy.getSizeInBits() > 64 ? LLT::scalar(64) : LLT::scalar(32); + unsigned NewTySize = NewTy.getSizeInBytes(); + + NewTy = LLT::scalar(PowerOf2Floor(NewTy.getSizeInBits()-1)); + NewTySize = NewTy.getSizeInBytes(); + assert(NewTySize > 0 && "Could not find appropriate type"); + + // If the new LLT cannot cover all of the remaining bits, then consider + // issuing a (or a pair of) unaligned and overlapping load / store. + bool Fast; + // Need to get a VT equivalent for allowMisalignedMemoryAccesses(). + MVT VT = getMVTForLLT(Ty); + if (NumMemOps && AllowOverlap && NewTySize < Size && + TLI.allowsMisalignedMemoryAccesses( + VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) && + Fast) + TySize = Size; + else { + Ty = NewTy; + TySize = NewTySize; + } + } + + if (++NumMemOps > Limit) + return false; + + MemOps.push_back(Ty); + Size -= TySize; + } + + return true; +} + +static Type *getTypeForLLT(LLT Ty, LLVMContext &C) { + if (Ty.isVector()) + return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()), + Ty.getNumElements()); + return IntegerType::get(C, Ty.getSizeInBits()); +} + +// Get a vectorized representation of the memset value operand, GISel edition. +static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) { + MachineRegisterInfo &MRI = *MIB.getMRI(); + unsigned NumBits = Ty.getScalarSizeInBits(); + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + if (!Ty.isVector() && ValVRegAndVal) { + unsigned KnownVal = ValVRegAndVal->Value; + APInt Scalar = APInt(8, KnownVal); + APInt SplatVal = APInt::getSplat(NumBits, Scalar); + return MIB.buildConstant(Ty, SplatVal).getReg(0); + } + // FIXME: for vector types create a G_BUILD_VECTOR. + if (Ty.isVector()) + return Register(); + + // Extend the byte value to the larger type, and then multiply by a magic + // value 0x010101... in order to replicate it across every byte. + LLT ExtType = Ty.getScalarType(); + auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val); + if (NumBits > 8) { + APInt Magic = APInt::getSplat(NumBits, APInt(8, 0x01)); + auto MagicMI = MIB.buildConstant(ExtType, Magic); + Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0); + } + + assert(ExtType == Ty && "Vector memset value type not supported yet"); + return Val; +} + +bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val, + unsigned KnownLen, unsigned Align, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + + if (KnownLen == 0) { + MI.eraseFromParent(); + return true; + } + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemset(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + + auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI); + bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0; + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ?
0 : Align), 0, + /*IsMemset=*/true, + /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u, + MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + MachineIRBuilder MIB(MI); + // Find the largest store and generate the bit pattern for it. + LLT LargestTy = MemOps[0]; + for (unsigned i = 1; i < MemOps.size(); i++) + if (MemOps[i].getSizeInBits() > LargestTy.getSizeInBits()) + LargestTy = MemOps[i]; + Register MemSetValue = getMemsetValue(Val, LargestTy, MIB); + + if (!MemSetValue) + return false; + + // Generate the stores. + LLT PtrTy = MRI.getType(Dst); + unsigned DstOff = 0; + unsigned Size = KnownLen; + for (unsigned I = 0; I < MemOps.size(); I++) { + LLT Ty = MemOps[I]; + unsigned TySize = Ty.getSizeInBytes(); + if (TySize > Size) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + assert(I == MemOps.size() - 1 && I != 0); + DstOff -= TySize - Size; + } + + // If this store is smaller than the largest store see whether we can get + // the smaller value for free with a truncate. + Register Value = MemSetValue; + if (Ty.getSizeInBits() < LargestTy.getSizeInBits()) { + MVT VT = getMVTForLLT(Ty); + MVT LargestVT = getMVTForLLT(LargestTy); + if (!LargestTy.isVector() && !Ty.isVector() && + TLI.isTruncateFree(LargestVT, VT)) + Value = MIB.buildTrunc(Ty, MemSetValue).getReg(0); + else + Value = getMemsetValue(Val, Ty, MIB); + if (!Value) + return false; + } + + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, DstOff, Ty.getSizeInBytes()); + + Register Ptr = Dst; + if (DstOff != 0) { + auto Offset = + MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), DstOff); + Ptr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + + MIB.buildStore(Value, Ptr, *StoreMMO); + DstOff += Ty.getSizeInBytes(); + Size -= TySize; + } + + MI.eraseFromParent(); + return true; +} + + +bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Align = MinAlign(DstAlign, SrcAlign); + + if (KnownLen == 0) { + MI.eraseFromParent(); + return true; + } + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + // FIXME: infer better src pointer alignment like SelectionDAG does here. + // FIXME: also use the equivalent of isMemSrcFromConstant and alwaysinlining + // if the memcpy is in a tail call position. 
+ + unsigned Limit = TLI.getMaxStoresPerMemcpy(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Align && + DL.exceedsNaturalStackAlignment(NewAlign)) + NewAlign /= 2; + + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + LLVM_DEBUG(dbgs() << "Inlining memcpy: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Now we need to emit a load/store pair for each of the types we've + // collected. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + unsigned Size = KnownLen; + for (auto CopyTy : MemOps) { + // Issuing an unaligned load / store pair that overlaps with the previous + // pair. Adjust the offset accordingly. + if (CopyTy.getSizeInBytes() > Size) + CurrOffset -= CopyTy.getSizeInBytes() - Size; + + // Construct MMOs for the accesses. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr; + Register Offset; + if (CurrOffset == 0) { + LoadPtr = Src; + } else { + Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + .getReg(0); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + auto LdVal = MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO); + + // Create the store.
+ Register StorePtr; + if (CurrOffset == 0) + StorePtr = Dst; + else + StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + MIB.buildStore(LdVal, StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + Size -= CopyTy.getSizeInBytes(); + } + + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst, + Register Src, unsigned KnownLen, + unsigned DstAlign, unsigned SrcAlign, + bool IsVolatile) { + auto &MF = *MI.getParent()->getParent(); + const auto &TLI = *MF.getSubtarget().getTargetLowering(); + auto &DL = MF.getDataLayout(); + LLVMContext &C = MF.getFunction().getContext(); + + bool DstAlignCanChange = false; + MachineFrameInfo &MFI = MF.getFrameInfo(); + bool OptSize = shouldLowerMemFuncForSize(MF); + unsigned Align = MinAlign(DstAlign, SrcAlign); + + if (KnownLen == 0) { + MI.eraseFromParent(); + return true; + } + + MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI); + if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex())) + DstAlignCanChange = true; + + unsigned Limit = TLI.getMaxStoresPerMemmove(OptSize); + std::vector<LLT> MemOps; + + const auto &DstMMO = **MI.memoperands_begin(); + const auto &SrcMMO = **std::next(MI.memoperands_begin()); + MachinePointerInfo DstPtrInfo = DstMMO.getPointerInfo(); + MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo(); + + // FIXME: SelectionDAG always passes false for 'AllowOverlap', apparently due + // to a bug in its findOptimalMemOpLowering implementation. For now do the + // same thing here. + if (!findGISelOptimalMemOpLowering( + MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), SrcAlign, + /*IsMemset=*/false, + /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false, + /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(), + SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI)) + return false; + + if (DstAlignCanChange) { + // Get an estimate of the type from the LLT. + Type *IRTy = getTypeForLLT(MemOps[0], C); + unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy); + + // Don't promote to an alignment that would require dynamic stack + // realignment. + const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo(); + if (!TRI->needsStackRealignment(MF)) + while (NewAlign > Align && + DL.exceedsNaturalStackAlignment(NewAlign)) + NewAlign /= 2; + + if (NewAlign > Align) { + unsigned FI = FIDef->getOperand(1).getIndex(); + // Give the stack frame object a larger alignment if needed. + if (MFI.getObjectAlignment(FI) < NewAlign) + MFI.setObjectAlignment(FI, NewAlign); + Align = NewAlign; + } + } + + LLVM_DEBUG(dbgs() << "Inlining memmove: " << MI << " into loads & stores\n"); + + MachineIRBuilder MIB(MI); + // Memmove requires that we perform the loads first before issuing the stores. + unsigned CurrOffset = 0; + LLT PtrTy = MRI.getType(Src); + SmallVector<Register, 16> LoadVals; + for (auto CopyTy : MemOps) { + // Construct MMO for the load. + auto *LoadMMO = + MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes()); + + // Create the load. + Register LoadPtr; + Register Offset; + if (CurrOffset == 0) { + LoadPtr = Src; + } else { + Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + .getReg(0); + LoadPtr = MIB.buildGEP(PtrTy, Src, Offset).getReg(0); + } + LoadVals.push_back(MIB.buildLoad(CopyTy, LoadPtr, *LoadMMO).getReg(0)); + CurrOffset += CopyTy.getSizeInBytes(); + } + + CurrOffset = 0; + for (unsigned I = 0; I < MemOps.size(); ++I) { + LLT CopyTy = MemOps[I]; + // Now store the values loaded.
+ auto *StoreMMO = + MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes()); + + Register StorePtr; + Register Offset; + if (CurrOffset == 0) { + StorePtr = Dst; + } else { + Offset = MIB.buildConstant(LLT::scalar(PtrTy.getSizeInBits()), CurrOffset) + .getReg(0); + StorePtr = MIB.buildGEP(PtrTy, Dst, Offset).getReg(0); + } + MIB.buildStore(LoadVals[I], StorePtr, *StoreMMO); + CurrOffset += CopyTy.getSizeInBytes(); + } + MI.eraseFromParent(); + return true; +} + +bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI) { + // This combine is fairly complex so it's not written with a separate + // matcher function. + assert(MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS); + Intrinsic::ID ID = (Intrinsic::ID)MI.getIntrinsicID(); + Register Src = 0, Dst = 0, Len = 0, Val = 0; + bool IsVolatile; + unsigned DstAlign = 0, SrcAlign = 0; + auto MMOIt = MI.memoperands_begin(); + const MachineMemOperand *MemOp = *MMOIt; + switch (ID) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + Dst = MI.getOperand(1).getReg(); + Src = MI.getOperand(2).getReg(); + Len = MI.getOperand(3).getReg(); + DstAlign = MemOp->getBaseAlignment(); + assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI"); + MemOp = *(++MMOIt); + SrcAlign = MemOp->getBaseAlignment(); + IsVolatile = MemOp->isVolatile(); + break; + case Intrinsic::memset: + Dst = MI.getOperand(1).getReg(); + Val = MI.getOperand(2).getReg(); + Len = MI.getOperand(3).getReg(); + DstAlign = MemOp->getBaseAlignment(); + IsVolatile = MemOp->isVolatile(); + break; + default: + return false; + } + + // Don't try to optimize volatile. + if (IsVolatile) + return false; + + // See if this is a constant length copy + auto LenVRegAndVal = getConstantVRegValWithLookThrough(Len, MRI); + if (!LenVRegAndVal) + return false; // Leave it to the legalizer to lower it to a libcall. + unsigned KnownLen = LenVRegAndVal->Value; + + if (ID == Intrinsic::memcpy) + return optimizeMemcpy(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memmove) + return optimizeMemmove(MI, Dst, Src, KnownLen, DstAlign, SrcAlign, IsVolatile); + if (ID == Intrinsic::memset) + return optimizeMemset(MI, Dst, Val, KnownLen, DstAlign, IsVolatile); + return false; +} + bool CombinerHelper::tryCombine(MachineInstr &MI) { if (tryCombineCopy(MI)) return true; diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -272,6 +272,10 @@ EVT VT, unsigned AddrSpace = 0, unsigned Align = 1, MachineMemOperand::Flags Flags = MachineMemOperand::MONone, bool *Fast = nullptr) const override; + /// LLT variant. + bool allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast = nullptr) const override; /// Provide custom lowering hooks for some operations. SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override; @@ -358,6 +362,10 @@ bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, const AttributeList &FuncAttributes) const override; + LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign, + bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const override; + /// Return true if the addressing mode represented by AM is legal for this /// target, for a load/store of the specified type. 
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty, diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -1100,6 +1100,32 @@ return true; } +// Same as above but handling LLTs instead. +bool AArch64TargetLowering::allowsMisalignedMemoryAccesses( + LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags, + bool *Fast) const { + if (Subtarget->requiresStrictAlign()) + return false; + + if (Fast) { + // Some CPUs are fine with unaligned stores except for 128-bit ones. + *Fast = !Subtarget->isMisaligned128StoreSlow() || + Ty.getSizeInBytes() != 16 || + // See comments in performSTORECombine() for more details about + // these conditions. + + // Code that uses clang vector extensions can mark that it + // wants unaligned accesses to be treated as fast by + // underspecifying alignment to be 1 or 2. + Align <= 2 || + + // Disregard v2i64. Memcpy lowering produces those and splitting + // them regresses performance on micro-benchmarks and olden/bh. + Ty == LLT::vector(2, 64); + } + return true; +} + FastISel * AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo, const TargetLibraryInfo *libInfo) const { @@ -8739,6 +8765,39 @@ return MVT::Other; } +LLT AArch64TargetLowering::getOptimalMemOpLLT( + uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset, + bool ZeroMemset, bool MemcpyStrSrc, + const AttributeList &FuncAttributes) const { + bool CanImplicitFloat = + !FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat); + bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat; + bool CanUseFP = Subtarget->hasFPARMv8() && CanImplicitFloat; + // Only use AdvSIMD to implement memset of 32-byte and above. It would have + // taken one instruction to materialize the v2i64 zero and one store (with + // restrictive addressing mode). Just do i64 stores. + bool IsSmallMemset = IsMemset && Size < 32; + auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) { + if (memOpAlign(SrcAlign, DstAlign, AlignCheck)) + return true; + bool Fast; + return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone, + &Fast) && + Fast; + }; + + if (CanUseNEON && IsMemset && !IsSmallMemset && + AlignmentIsAcceptable(MVT::v2i64, 16)) + return LLT::vector(2, 64); + if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16)) + return LLT::scalar(128); + if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8)) + return LLT::scalar(64); + if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4)) + return LLT::scalar(32); + return LLT(); +} + // 12-bit optionally shifted immediates are legal for adds. 
bool AArch64TargetLowering::isLegalAddImmediate(int64_t Immed) const { if (Immed == std::numeric_limits<int64_t>::min()) { diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp --- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp +++ b/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp @@ -28,9 +28,9 @@ namespace { class AArch64PreLegalizerCombinerInfo : public CombinerInfo { public: - AArch64PreLegalizerCombinerInfo() + AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize) : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; @@ -51,6 +51,18 @@ case TargetOpcode::G_SEXTLOAD: case TargetOpcode::G_ZEXTLOAD: return Helper.tryCombineExtendingLoads(MI); + case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + switch (MI.getIntrinsicID()) { + case Intrinsic::memcpy: + case Intrinsic::memmove: + case Intrinsic::memset: { + // Try to inline memcpy type calls if optimizations are enabled. + return (EnableOpt && !EnableOptSize) ? Helper.tryCombineMemCpyFamily(MI) + : false; + } + default: + break; + } } return false; @@ -89,7 +101,11 @@ MachineFunctionProperties::Property::FailedISel)) return false; auto *TPC = &getAnalysis<TargetPassConfig>(); - AArch64PreLegalizerCombinerInfo PCInfo; + const Function &F = MF.getFunction(); + bool EnableOpt = + MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F); + AArch64PreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(), + F.hasMinSize()); Combiner C(PCInfo, TPC); return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr); } diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp --- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp +++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp @@ -27,7 +27,8 @@ public: MipsPreLegalizerCombinerInfo() : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false, - /*LegalizerInfo*/ nullptr) {} + /*LegalizerInfo*/ nullptr, /*EnableOpt*/ false, + /*EnableOptSize*/ false, /*EnableMinSize*/ false) {} virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI, MachineIRBuilder &B) const override; }; diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memcpy.mir @@ -0,0 +1,177 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" + target triple = "arm64-apple-darwin" + + define void @test_memcpy1(i32* nocapture %dst, i32* nocapture readonly %src, i64 %len) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 %len, i1 false) + ret void + } + + declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i1 immarg) #1 + + define void @test_memcpy2_const(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void
@llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 72, i1 false) + ret void + } + + define void @test_memcpy3_const_arrays_unaligned(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 143, i1 false) + ret void + } + + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { argmemonly nounwind } + +... +--- +name: test_memcpy1 +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $x0, $x1, $x2 + + ; CHECK-LABEL: name: test_memcpy1 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = COPY $x2 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... 
+--- +name: test_memcpy2_const +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_memcpy2_const + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4) + ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4) + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: G_STORE [[LOAD1]](s128), [[GEP1]](p0) :: (store 16 into %ir.0 + 16, align 4) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p0) :: (load 16 from %ir.1 + 32, align 4) + ; CHECK: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[LOAD2]](s128), [[GEP3]](p0) :: (store 16 into %ir.0 + 32, align 4) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:_(s128) = G_LOAD [[GEP4]](p0) :: (load 16 from %ir.1 + 48, align 4) + ; CHECK: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) + ; CHECK: G_STORE [[LOAD3]](s128), [[GEP5]](p0) :: (store 16 into %ir.0 + 48, align 4) + ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C3]](s64) + ; CHECK: [[LOAD4:%[0-9]+]]:_(s64) = G_LOAD [[GEP6]](p0) :: (load 8 from %ir.1 + 64, align 4) + ; CHECK: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CHECK: G_STORE [[LOAD4]](s64), [[GEP7]](p0) :: (store 8 into %ir.0 + 64, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 72 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... 
+--- +name: test_memcpy3_const_arrays_unaligned +alignment: 2 +tracksRegLiveness: true +registers: + - { id: 0, class: _ } + - { id: 1, class: _ } + - { id: 2, class: _ } +machineFunctionInfo: {} +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_memcpy3_const_arrays_unaligned + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4) + ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4) + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C]](s64) + ; CHECK: G_STORE [[LOAD1]](s128), [[GEP1]](p0) :: (store 16 into %ir.0 + 16, align 4) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP2]](p0) :: (load 16 from %ir.1 + 32, align 4) + ; CHECK: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[LOAD2]](s128), [[GEP3]](p0) :: (store 16 into %ir.0 + 32, align 4) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:_(s128) = G_LOAD [[GEP4]](p0) :: (load 16 from %ir.1 + 48, align 4) + ; CHECK: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) + ; CHECK: G_STORE [[LOAD3]](s128), [[GEP5]](p0) :: (store 16 into %ir.0 + 48, align 4) + ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 64 + ; CHECK: [[GEP6:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C3]](s64) + ; CHECK: [[LOAD4:%[0-9]+]]:_(s128) = G_LOAD [[GEP6]](p0) :: (load 16 from %ir.1 + 64, align 4) + ; CHECK: [[GEP7:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CHECK: G_STORE [[LOAD4]](s128), [[GEP7]](p0) :: (store 16 into %ir.0 + 64, align 4) + ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 80 + ; CHECK: [[GEP8:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C4]](s64) + ; CHECK: [[LOAD5:%[0-9]+]]:_(s128) = G_LOAD [[GEP8]](p0) :: (load 16 from %ir.1 + 80, align 4) + ; CHECK: [[GEP9:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) + ; CHECK: G_STORE [[LOAD5]](s128), [[GEP9]](p0) :: (store 16 into %ir.0 + 80, align 4) + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 96 + ; CHECK: [[GEP10:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C5]](s64) + ; CHECK: [[LOAD6:%[0-9]+]]:_(s128) = G_LOAD [[GEP10]](p0) :: (load 16 from %ir.1 + 96, align 4) + ; CHECK: [[GEP11:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) + ; CHECK: G_STORE [[LOAD6]](s128), [[GEP11]](p0) :: (store 16 into %ir.0 + 96, align 4) + ; CHECK: [[C6:%[0-9]+]]:_(s64) = G_CONSTANT i64 112 + ; CHECK: [[GEP12:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C6]](s64) + ; CHECK: [[LOAD7:%[0-9]+]]:_(s128) = G_LOAD [[GEP12]](p0) :: (load 16 from %ir.1 + 112, align 4) + ; CHECK: [[GEP13:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C6]](s64) + ; CHECK: G_STORE [[LOAD7]](s128), [[GEP13]](p0) :: (store 16 into %ir.0 + 112, align 4) + ; CHECK: [[C7:%[0-9]+]]:_(s64) = G_CONSTANT i64 127 + ; CHECK: [[GEP14:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C7]](s64) + ; CHECK: [[LOAD8:%[0-9]+]]:_(s128) = G_LOAD [[GEP14]](p0) :: (load 16 from %ir.1 + 127, align 4) + ; CHECK: [[GEP15:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C7]](s64) + ; CHECK: G_STORE [[LOAD8]](s128), [[GEP15]](p0) :: (store 16 into %ir.0 + 127, align 4) + ; CHECK: 
RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 143 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memcpy), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memmove.mir @@ -0,0 +1,162 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define void @test_memmove1(i32* nocapture %dst, i32* nocapture readonly %src, i64 %len) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 %len, i1 false) + ret void + } + + declare void @llvm.memmove.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i1 immarg) #1 + + define void @test_memmove2_const(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 48, i1 false) + ret void + } + + define void @test_memmove3_const_toolarge(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 96, i1 false) + ret void + } + + define void @test_memmove4_const_unaligned(i32* nocapture %dst, i32* nocapture readonly %src) local_unnamed_addr #0 { + entry: + %0 = bitcast i32* %dst to i8* + %1 = bitcast i32* %src to i8* + tail call void @llvm.memmove.p0i8.p0i8.i64(i8* align 4 %0, i8* align 4 %1, i64 52, i1 false) + ret void + } + + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { argmemonly nounwind } + +... +--- +name: test_memmove1 +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0, $x1, $x2 + + ; CHECK-LABEL: name: test_memmove1 + ; CHECK: liveins: $x0, $x1, $x2 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s64) = COPY $x2 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), [[COPY]](p0), [[COPY1]](p0), [[COPY2]](s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = COPY $x2 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... 
+--- +name: test_memmove2_const +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_memmove2_const + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP1]](p0) :: (load 16 from %ir.1 + 32, align 4) + ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) + ; CHECK: G_STORE [[LOAD1]](s128), [[GEP2]](p0) :: (store 16 into %ir.0 + 16, align 4) + ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CHECK: G_STORE [[LOAD2]](s128), [[GEP3]](p0) :: (store 16 into %ir.0 + 32, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 48 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... +--- +name: test_memmove3_const_toolarge +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_memmove3_const_toolarge + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 96 + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), [[COPY]](p0), [[COPY1]](p0), [[C]](s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 96 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... 
+--- +name: test_memmove4_const_unaligned +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0, $x1 + + ; CHECK-LABEL: name: test_memmove4_const_unaligned + ; CHECK: liveins: $x0, $x1 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(p0) = COPY $x1 + ; CHECK: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[COPY1]](p0) :: (load 16 from %ir.1, align 4) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C]](s64) + ; CHECK: [[LOAD1:%[0-9]+]]:_(s128) = G_LOAD [[GEP]](p0) :: (load 16 from %ir.1 + 16, align 4) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C1]](s64) + ; CHECK: [[LOAD2:%[0-9]+]]:_(s128) = G_LOAD [[GEP1]](p0) :: (load 16 from %ir.1 + 32, align 4) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP2:%[0-9]+]]:_(p0) = G_GEP [[COPY1]], [[C2]](s64) + ; CHECK: [[LOAD3:%[0-9]+]]:_(s32) = G_LOAD [[GEP2]](p0) :: (load 4 from %ir.1 + 48) + ; CHECK: G_STORE [[LOAD]](s128), [[COPY]](p0) :: (store 16 into %ir.0, align 4) + ; CHECK: [[C3:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP3:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C3]](s64) + ; CHECK: G_STORE [[LOAD1]](s128), [[GEP3]](p0) :: (store 16 into %ir.0 + 16, align 4) + ; CHECK: [[C4:%[0-9]+]]:_(s64) = G_CONSTANT i64 32 + ; CHECK: [[GEP4:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C4]](s64) + ; CHECK: G_STORE [[LOAD2]](s128), [[GEP4]](p0) :: (store 16 into %ir.0 + 32, align 4) + ; CHECK: [[C5:%[0-9]+]]:_(s64) = G_CONSTANT i64 48 + ; CHECK: [[GEP5:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C5]](s64) + ; CHECK: G_STORE [[LOAD3]](s32), [[GEP5]](p0) :: (store 4 into %ir.0 + 48) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(p0) = COPY $x1 + %2:_(s64) = G_CONSTANT i64 52 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memmove), %0(p0), %1(p0), %2(s64) :: (store 1 into %ir.0, align 4), (load 1 from %ir.1, align 4) + RET_ReallyLR + +... 
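Note on the memset tests that follow: the large immediates in their CHECK lines are the byte-splat values produced by getMemsetValue() above, which zero-extends the value operand and multiplies it by 0x0101010101010101 so the byte repeats in every lane of the 64-bit store. A minimal standalone C++ sketch of that arithmetic (illustration only, not part of the patch; the helper name splatByte is made up):

#include <cassert>
#include <cstdint>

// Replicate a single memset byte across a 64-bit store value, mirroring the
// G_ZEXT + G_MUL-by-magic sequence that getMemsetValue() emits.
static uint64_t splatByte(uint8_t Byte) {
  return uint64_t(Byte) * 0x0101010101010101ULL;
}

int main() {
  assert(splatByte(0x01) == 72340172838076673ULL);   // the magic multiplier in test_ms2_const
  assert(splatByte(0x40) == 4629771061636907072ULL); // the constant in test_ms3_const_both
  return 0;
}
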
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/GlobalISel/inline-memset.mir @@ -0,0 +1,148 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=aarch64 -run-pass=aarch64-prelegalizer-combiner -verify-machineinstrs %s -o - | FileCheck %s +--- | + target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" + target triple = "aarch64" + + define void @test_ms1(i8* nocapture %dst, i32 %c, i32 %len) local_unnamed_addr #0 { + entry: + %0 = trunc i32 %c to i8 + %conv = zext i32 %len to i64 + tail call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %0, i64 %conv, i1 false) + ret void + } + + declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i1 immarg) #1 + + define void @test_ms2_const(i8* nocapture %dst, i32 %c) local_unnamed_addr #0 { + entry: + %0 = trunc i32 %c to i8 + tail call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 %0, i64 16, i1 false) + ret void + } + + define void @test_ms3_const_both(i8* nocapture %dst) local_unnamed_addr #0 { + entry: + tail call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 64, i64 16, i1 false) + ret void + } + + define void @test_ms4_const_both_unaligned(i8* nocapture %dst) local_unnamed_addr #0 { + entry: + tail call void @llvm.memset.p0i8.i64(i8* align 1 %dst, i8 64, i64 18, i1 false) + ret void + } + + declare void @llvm.stackprotector(i8*, i8**) #2 + + attributes #0 = { nounwind ssp uwtable "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cyclone" "target-features"="+aes,+crypto,+fp-armv8,+neon,+sha2,+zcm,+zcz" "unsafe-fp-math"="false" "use-soft-float"="false" } + attributes #1 = { argmemonly nounwind } + +... +--- +name: test_ms1 +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w1, $w2, $x0 + + ; CHECK-LABEL: name: test_ms1 + ; CHECK: liveins: $w1, $w2, $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[COPY2:%[0-9]+]]:_(s32) = COPY $w2 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[COPY2]](s32) + ; CHECK: G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), [[COPY]](p0), [[TRUNC]](s8), [[ZEXT]](s64) :: (store 1 into %ir.dst) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(s32) = COPY $w1 + %2:_(s32) = COPY $w2 + %3:_(s8) = G_TRUNC %1(s32) + %4:_(s64) = G_ZEXT %2(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %3(s8), %4(s64) :: (store 1 into %ir.dst) + RET_ReallyLR + +... 
+--- +name: test_ms2_const +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $w1, $x0 + + ; CHECK-LABEL: name: test_ms2_const + ; CHECK: liveins: $w1, $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY $w1 + ; CHECK: [[TRUNC:%[0-9]+]]:_(s8) = G_TRUNC [[COPY1]](s32) + ; CHECK: [[ZEXT:%[0-9]+]]:_(s64) = G_ZEXT [[TRUNC]](s8) + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 72340172838076673 + ; CHECK: [[MUL:%[0-9]+]]:_(s64) = G_MUL [[ZEXT]], [[C]] + ; CHECK: G_STORE [[MUL]](s64), [[COPY]](p0) :: (store 8 into %ir.dst, align 1) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[MUL]](s64), [[GEP]](p0) :: (store 8 into %ir.dst + 8, align 1) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(s32) = COPY $w1 + %3:_(s64) = G_CONSTANT i64 16 + %2:_(s8) = G_TRUNC %1(s32) + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %2(s8), %3(s64) :: (store 1 into %ir.dst) + RET_ReallyLR + +... +--- +name: test_ms3_const_both +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: test_ms3_const_both + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072 + ; CHECK: G_STORE [[C]](s64), [[COPY]](p0) :: (store 8 into %ir.dst, align 1) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[C]](s64), [[GEP]](p0) :: (store 8 into %ir.dst + 8, align 1) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(s8) = G_CONSTANT i8 64 + %2:_(s64) = G_CONSTANT i64 16 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %1(s8), %2(s64) :: (store 1 into %ir.dst) + RET_ReallyLR + +... +--- +name: test_ms4_const_both_unaligned +alignment: 2 +tracksRegLiveness: true +body: | + bb.1.entry: + liveins: $x0 + + ; CHECK-LABEL: name: test_ms4_const_both_unaligned + ; CHECK: liveins: $x0 + ; CHECK: [[COPY:%[0-9]+]]:_(p0) = COPY $x0 + ; CHECK: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 4629771061636907072 + ; CHECK: G_STORE [[C]](s64), [[COPY]](p0) :: (store 8 into %ir.dst, align 1) + ; CHECK: [[C1:%[0-9]+]]:_(s64) = G_CONSTANT i64 8 + ; CHECK: [[GEP:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C1]](s64) + ; CHECK: G_STORE [[C]](s64), [[GEP]](p0) :: (store 8 into %ir.dst + 8, align 1) + ; CHECK: [[TRUNC:%[0-9]+]]:_(s16) = G_TRUNC [[C]](s64) + ; CHECK: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 16 + ; CHECK: [[GEP1:%[0-9]+]]:_(p0) = G_GEP [[COPY]], [[C2]](s64) + ; CHECK: G_STORE [[TRUNC]](s16), [[GEP1]](p0) :: (store 2 into %ir.dst + 16, align 1) + ; CHECK: RET_ReallyLR + %0:_(p0) = COPY $x0 + %1:_(s8) = G_CONSTANT i8 64 + %2:_(s64) = G_CONSTANT i64 18 + G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.memset), %0(p0), %1(s8), %2(s64) :: (store 1 into %ir.dst) + RET_ReallyLR + +...
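
A note on the overlapping tail seen in test_memcpy3_const_arrays_unaligned: when the bytes left over are smaller than the chosen access type and overlap is allowed, the combiner slides the final access back so it overlaps the previous one instead of emitting narrower leftover accesses, which is why a 143-byte copy ends with 16-byte accesses at offset 127. A standalone C++ sketch of that offset calculation (illustration only, not the combiner's actual code; variable names are made up):

#include <cstdio>
#include <vector>

int main() {
  // A 143-byte copy split into 16-byte chunks, overlapping the tail the way
  // the memcpy combine does when unaligned overlapping accesses are allowed.
  unsigned Len = 143, Chunk = 16, Offset = 0, Remaining = Len;
  std::vector<unsigned> Offsets;
  while (Remaining != 0) {
    if (Chunk > Remaining)
      Offset -= Chunk - Remaining; // slide back so the last chunk overlaps
    Offsets.push_back(Offset);
    Offset += Chunk;
    Remaining -= (Chunk > Remaining) ? Remaining : Chunk;
  }
  for (unsigned O : Offsets)
    std::printf("%u ", O); // prints: 0 16 32 48 64 80 96 112 127
  std::printf("\n");
  return 0;
}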