Index: lib/Target/X86/X86SelectionDAGInfo.cpp
===================================================================
--- lib/Target/X86/X86SelectionDAGInfo.cpp
+++ lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -44,6 +44,24 @@
   return false;
 }
 
+namespace {
+
+// Represents a cover of a buffer of Size bytes with Count() blocks of type AVT
+// (of size UBytes() bytes), as well as how many bytes remain (BytesLeft() is
+// always smaller than the block size).
+struct RepMovsRepeats {
+  RepMovsRepeats(uint64_t Size) : Size(Size) {}
+
+  uint64_t Count() const { return Size / UBytes(); }
+  uint64_t BytesLeft() const { return Size % UBytes(); }
+  uint64_t UBytes() const { return AVT.getSizeInBits() / 8; }
+
+  const uint64_t Size;
+  MVT AVT = MVT::i8;
+};
+
+} // namespace
+
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst,
     SDValue Val, SDValue Size, unsigned Align, bool isVolatile,
@@ -66,8 +84,13 @@
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if ((Align & 3) != 0 || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+  // On recent CPUs with Enhanced REP MOVSB/STOSB (ERMSB), it is always more
+  // efficient to use REP STOSB for large constant sizes.
+  if (!ConstantSize ||
+      (!Subtarget.hasERMSB() &&
+       ((Align & 3) != 0 ||
+        ConstantSize->getZExtValue() >
+            Subtarget.getMaxInlineSizeThreshold()))) {
     // Check to see if there is a specialized entry-point for memory zeroing.
     ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
 
@@ -100,71 +123,65 @@
     return SDValue();
   }
 
-  uint64_t SizeVal = ConstantSize->getZExtValue();
   SDValue InFlag;
-  EVT AVT;
-  SDValue Count;
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
-  unsigned BytesLeft = 0;
-  if (ValC) {
-    unsigned ValReg;
+  RepMovsRepeats Repeats(ConstantSize->getZExtValue());
+  if (ValC && !Subtarget.hasERMSB()) {
+    unsigned ValReg = X86::AL;
     uint64_t Val = ValC->getZExtValue() & 255;
 
     // If the value is a constant, then we can potentially use larger sets.
     switch (Align & 3) {
     case 2:   // WORD aligned
-      AVT = MVT::i16;
+      Repeats.AVT = MVT::i16;
       ValReg = X86::AX;
       Val = (Val << 8) | Val;
       break;
     case 0:   // DWORD aligned
-      AVT = MVT::i32;
+      Repeats.AVT = MVT::i32;
       ValReg = X86::EAX;
       Val = (Val << 8)  | Val;
       Val = (Val << 16) | Val;
       if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) {  // QWORD aligned
-        AVT = MVT::i64;
+        Repeats.AVT = MVT::i64;
         ValReg = X86::RAX;
         Val = (Val << 32) | Val;
       }
       break;
     default:  // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
-      Count = DAG.getIntPtrConstant(SizeVal, dl);
       break;
     }
 
-    if (AVT.bitsGT(MVT::i8)) {
-      unsigned UBytes = AVT.getSizeInBits() / 8;
-      Count = DAG.getIntPtrConstant(SizeVal / UBytes, dl);
-      BytesLeft = SizeVal % UBytes;
+    if (Repeats.BytesLeft() > 0 &&
+        DAG.getMachineFunction().getFunction()->optForMinSize()) {
+      // When aggressively optimizing for size, avoid generating the code to
+      // handle BytesLeft.
+      Repeats.AVT = MVT::i8;
+      ValReg = X86::AL;
+      Val = ValC->getZExtValue() & 255;
    }
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InFlag);
-    InFlag = Chain.getValue(1);
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg,
+                             DAG.getConstant(Val, dl, Repeats.AVT), InFlag);
   } else {
-    AVT = MVT::i8;
-    Count = DAG.getIntPtrConstant(SizeVal, dl);
     Chain = DAG.getCopyToReg(Chain, dl, X86::AL, Val, InFlag);
-    InFlag = Chain.getValue(1);
   }
+  InFlag = Chain.getValue(1);
 
   Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
-                           Count, InFlag);
+                           DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
   InFlag = Chain.getValue(1);
   Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
                            Dst, InFlag);
   InFlag = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
   Chain = DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops);
 
-  if (BytesLeft) {
+  if (Repeats.BytesLeft()) {
     // Handle the last 1 - 7 bytes.
-    unsigned Offset = SizeVal - BytesLeft;
+    unsigned Offset = Repeats.Size - Repeats.BytesLeft();
     EVT AddrVT = Dst.getValueType();
     EVT SizeVT = Size.getValueType();
 
@@ -172,7 +189,7 @@
                            DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                                        DAG.getConstant(Offset, dl, AddrVT)),
                            Val,
-                           DAG.getConstant(BytesLeft, dl, SizeVT),
+                           DAG.getConstant(Repeats.BytesLeft(), dl, SizeVT),
                            Align, isVolatile, false,
                            DstPtrInfo.getWithOffset(Offset));
   }
@@ -181,24 +198,6 @@
   return Chain;
 }
 
-namespace {
-
-// Represents a cover of a buffer of SizeVal bytes with blocks of size
-// AVT, as well as how many bytes remain (BytesLeft is always smaller than
-// the block size).
-struct RepMovsRepeats {
-  RepMovsRepeats(const uint64_t SizeVal, const MVT& AVT) {
-    const unsigned UBytes = AVT.getSizeInBits() / 8;
-    Count = SizeVal / UBytes;
-    BytesLeft = SizeVal % UBytes;
-  }
-
-  unsigned Count;
-  unsigned BytesLeft;
-};
-
-} // namespace
-
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
     SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
@@ -210,8 +209,8 @@
       DAG.getMachineFunction().getSubtarget<X86Subtarget>();
   if (!ConstantSize)
     return SDValue();
-  uint64_t SizeVal = ConstantSize->getZExtValue();
-  if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
+  RepMovsRepeats Repeats(ConstantSize->getZExtValue());
+  if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold())
     return SDValue();
 
   /// If not DWORD aligned, it is more efficient to call the library. However
@@ -232,35 +231,31 @@
   if (isBaseRegConflictPossible(DAG, ClobberSet))
     return SDValue();
 
-  MVT AVT;
-  if (Subtarget.hasERMSB())
-    // If the target has enhanced REPMOVSB, then it's at least as fast to use
-    // REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to handle
-    // BytesLeft.
-    AVT = MVT::i8;
-  else if (Align & 1)
-    AVT = MVT::i8;
-  else if (Align & 2)
-    AVT = MVT::i16;
-  else if (Align & 4)
-    // DWORD aligned
-    AVT = MVT::i32;
-  else
-    // QWORD aligned
-    AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
-
-  RepMovsRepeats Repeats(SizeVal, AVT);
-  if (Repeats.BytesLeft > 0 &&
-      DAG.getMachineFunction().getFunction()->optForMinSize()) {
-    // When agressively optimizing for size, avoid generating the code to handle
-    // BytesLeft.
-    AVT = MVT::i8;
-    Repeats = RepMovsRepeats(SizeVal, AVT);
+  // If the target has Enhanced REP MOVSB (ERMSB), then it's at least as fast
+  // to use REP MOVSB instead of REP MOVS{W,D,Q}, and it avoids having to
+  // handle BytesLeft.
+  if (!Subtarget.hasERMSB() && !(Align & 1)) {
+    if (Align & 2)
+      // WORD aligned
+      Repeats.AVT = MVT::i16;
+    else if (Align & 4)
+      // DWORD aligned
+      Repeats.AVT = MVT::i32;
+    else
+      // QWORD aligned
+      Repeats.AVT = Subtarget.is64Bit() ? MVT::i64 : MVT::i32;
+
+    if (Repeats.BytesLeft() > 0 &&
+        DAG.getMachineFunction().getFunction()->optForMinSize()) {
+      // When aggressively optimizing for size, avoid generating the code to
+      // handle BytesLeft.
+      Repeats.AVT = MVT::i8;
+    }
   }
 
   SDValue InFlag;
   Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX,
-                           DAG.getIntPtrConstant(Repeats.Count, dl), InFlag);
+                           DAG.getIntPtrConstant(Repeats.Count(), dl), InFlag);
   InFlag = Chain.getValue(1);
   Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI,
                            Dst, InFlag);
@@ -270,14 +265,14 @@
   InFlag = Chain.getValue(1);
 
   SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue);
-  SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag };
+  SDValue Ops[] = { Chain, DAG.getValueType(Repeats.AVT), InFlag };
   SDValue RepMovs = DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops);
 
   SmallVector<SDValue, 2> Results;
   Results.push_back(RepMovs);
-  if (Repeats.BytesLeft) {
+  if (Repeats.BytesLeft()) {
     // Handle the last 1 - 7 bytes.
-    unsigned Offset = SizeVal - Repeats.BytesLeft;
+    unsigned Offset = Repeats.Size - Repeats.BytesLeft();
     EVT DstVT = Dst.getValueType();
     EVT SrcVT = Src.getValueType();
     EVT SizeVT = Size.getValueType();
@@ -288,7 +283,7 @@
                                    DAG.getNode(ISD::ADD, dl, SrcVT, Src,
                                                DAG.getConstant(Offset, dl,
                                                                SrcVT)),
-                                   DAG.getConstant(Repeats.BytesLeft, dl,
+                                   DAG.getConstant(Repeats.BytesLeft(), dl,
                                                    SizeVT),
                                    Align, isVolatile, AlwaysInline, false,
                                    DstPtrInfo.getWithOffset(Offset),
Index: test/CodeGen/X86/memset-large.ll
===================================================================
--- /dev/null
+++ test/CodeGen/X86/memset-large.ll
@@ -0,0 +1,22 @@
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32
+; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST
+; FIXME: The documentation states that ivybridge has ermsb, but this is not
+; enabled right now since I could not confirm it by testing.
+; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST
+
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1) nounwind
+
+define void @test1(i8* %dst) {
+; ALL-LABEL: test1:
+; NOFAST: memset
+; NOFAST32: memset
+; FAST: rep;stosb
+  call void @llvm.memset.p0i8.i64(i8* %dst, i8 42, i64 4095, i32 0, i1 false)
+  ret void
+}
+
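
Note on the RepMovsRepeats arithmetic: the struct covers a buffer of Size
bytes with Count() full blocks of UBytes() bytes each, leaving
BytesLeft() < UBytes() trailing bytes for the fixup store/copy. Below is a
minimal standalone sketch of the same arithmetic; it is plain C++ with no LLVM
dependencies, and the BlockCover name and the 4095-byte example (matching
test1 above) are illustrative only, not part of the patch.

  #include <cassert>
  #include <cstdint>

  // Mirrors RepMovsRepeats, but takes the block size in bytes directly
  // instead of deriving it from an MVT.
  struct BlockCover {
    BlockCover(uint64_t Size, uint64_t BlockBytes)
        : Size(Size), BlockBytes(BlockBytes) {}

    uint64_t Count() const { return Size / BlockBytes; }     // full blocks
    uint64_t BytesLeft() const { return Size % BlockBytes; } // trailing fixup

    const uint64_t Size;
    const uint64_t BlockBytes;
  };

  int main() {
    // The 4095-byte memset from test1 with QWORD (i64) blocks: REP STOSQ
    // repeats 511 times and 7 bytes remain for the fixup code.
    BlockCover Quad(4095, 8);
    assert(Quad.Count() == 511 && Quad.BytesLeft() == 7);

    // With byte (i8) blocks, as selected on ERMSB targets (and at minsize),
    // REP STOSB covers the whole buffer and no fixup is needed.
    BlockCover Byte(4095, 1);
    assert(Byte.Count() == 4095 && Byte.BytesLeft() == 0);
    return 0;
  }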