Index: lib/Target/X86/X86.td =================================================================== --- lib/Target/X86/X86.td +++ lib/Target/X86/X86.td @@ -280,6 +280,16 @@ "fast-shld-rotate", "HasFastSHLDRotate", "true", "SHLD can be used as a faster rotate">; +// Nehalem and newer processors have high quality REP MOVS and STOS (aka +// "string operations"). See "REP String Enhancement" in the Intel Software +// Development Manual. This feature indicates that these are reasonable +// lowerings for string operations when calling the library function would have +// too high a cost. +def FeatureFastRepStrOps + : SubtargetFeature< + "fast-repstr", "HasFastRepStrOps", "true", + "REP MOVS/STOS are fast">; + // Ivy Bridge and newer processors have enhanced REP MOVSB and STOSB (aka // "string operations"). See "REP String Enhancement" in the Intel Software // Development Manual. This feature essentially means that REP MOVSB will copy @@ -288,7 +298,7 @@ def FeatureERMSB : SubtargetFeature< "ermsb", "HasERMSB", "true", - "REP MOVS/STOS are fast">; + "REP MOVSB/STOSB are as fast as S/D/Q variants", [FeatureFastRepStrOps]>; //===----------------------------------------------------------------------===// // X86 processors supported.
@@ -470,7 +480,8 @@ FeatureCMPXCHG16B, FeatureSlowBTMem, FeaturePOPCNT, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFastRepStrOps ]>; def : NehalemProc<"nehalem">; def : NehalemProc<"corei7">; @@ -487,7 +498,8 @@ FeaturePOPCNT, FeatureAES, FeaturePCLMUL, - FeatureLAHFSAHF + FeatureLAHFSAHF, + FeatureFastRepStrOps ]>; def : WestmereProc<"westmere">; @@ -518,7 +530,8 @@ FeatureLAHFSAHF, FeatureSlow3OpsLEA, FeatureFastScalarFSQRT, - FeatureFastSHLDRotate + FeatureFastSHLDRotate, + FeatureFastRepStrOps ]>; class SandyBridgeProc : ProcModel; Index: lib/Target/X86/X86SelectionDAGInfo.cpp =================================================================== --- lib/Target/X86/X86SelectionDAGInfo.cpp +++ lib/Target/X86/X86SelectionDAGInfo.cpp @@ -62,10 +62,88 @@ } // namespace +static ConstantSDNode *getNonOpaqueConstantIntN(SDValue V, int Bits) { + auto *C = dyn_cast<ConstantSDNode>(V); + if (!C) + return nullptr; + + return (C->isOpaque() || !C->getAPIntValue().isIntN(Bits)) ? nullptr : C; +} + +static std::pair<SDValue, MVT> getUnscaledSizeAndVT(SelectionDAG &DAG, + const SDLoc &DL, + SDValue Size, + unsigned Align) { + SDValue UnscaledSize = Size; + + // Look through any zero extend. + while (UnscaledSize.getOpcode() == ISD::ZERO_EXTEND) + UnscaledSize = UnscaledSize.getOperand(0); + + // Unless the size is a shift left, we can't remove any scaling applied to it. + if (UnscaledSize.getOpcode() != ISD::SHL) + return {Size, MVT::i8}; + + // We also need the shift to be by a constant that fits in 6 bits (< 64). + auto ShiftC = getNonOpaqueConstantIntN(UnscaledSize.getOperand(1), 6); + if (!ShiftC) + return {Size, MVT::i8}; + + // We only want to strip off as much of the size scaling as is allowed + // given the alignment. If this ends up being zero, nothing to do. + uint64_t Shift = Log2_64(MinAlign(1ULL << ShiftC->getZExtValue(), Align)); + if (Shift == 0) + return {Size, MVT::i8}; + + // We also can't meaningfully scale past a shift of three because + // that's 8-bytes, our largest repeating store size.
+ Shift = std::min<uint64_t>(Shift, 3); + + // Table of value types based on shift amount. + MVT VTs[] = {MVT::i16, MVT::i32, MVT::i64}; + + // We just shift the size right and let the constant folder simplify + // the two shifts. + return { + DAG.getZExtOrTrunc( + DAG.getNode(ISD::SRL, DL, UnscaledSize.getValueType(), UnscaledSize, + DAG.getConstant(Shift, DL, Size.getValueType())), + DL, DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout())), + VTs[Shift - 1]}; +} + +static std::pair<SDValue, unsigned> getWidenedValueAndReg(SelectionDAG &DAG, + const SDLoc &DL, + ConstantSDNode *ValC, + MVT WideVT) { + uint64_t RawVal = ValC->getZExtValue() & 255; + + switch (WideVT.SimpleTy) { + default: + llvm_unreachable("Only expect i16, i32, and i64 MVTs here!"); + + case MVT::i64: + RawVal |= (RawVal << 8) | (RawVal << 16) | (RawVal << 24); + RawVal |= (RawVal << 32); + return {DAG.getConstant(RawVal, DL, MVT::i64), X86::RAX}; + + case MVT::i32: + RawVal |= (RawVal << 8) | (RawVal << 16) | (RawVal << 24); + return {DAG.getConstant(RawVal, DL, MVT::i32), X86::EAX}; + + case MVT::i16: + RawVal |= (RawVal << 8); + return {DAG.getConstant(RawVal, DL, MVT::i16), X86::AX}; + } +} + SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset( SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val, SDValue Size, unsigned Align, bool isVolatile, MachinePointerInfo DstPtrInfo) const { + if (isVolatile) + return SDValue(); + ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); @@ -81,12 +159,42 @@ if (DstPtrInfo.getAddrSpace() >= 256) return SDValue(); - // If not DWORD aligned or size is more than the threshold, call the library. - // The libc version is likely to be faster for these cases. It can use the - // address value and run time information about the CPU. + // If we don't have a tuned inline expansion due to the size or alignment, + // fall back on a generic lowering.
if ((Align & 3) != 0 || !ConstantSize || ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) { - // Check to see if there is a specialized entry-point for memory zeroing. + // When we have a fast REP+STOS CPU and either have ERMSB or PIC overhead for + // a library call, bypass the library call entirely. FIXME: the ERMSB case is + // meant to need 16-byte alignment, but (Align & 4) == 0 only tests bit 2. + if (Subtarget.hasFastRepStrOps() && + (Subtarget.isPositionIndependent() || + (Subtarget.hasERMSB() && (Align & 4) == 0))) { + MVT AVT = MVT::i8; + unsigned ValRegister = X86::AL; + if (ConstantSDNode *ValC = getNonOpaqueConstantIntN(Val, 64)) { + std::tie(Size, AVT) = getUnscaledSizeAndVT(DAG, dl, Size, Align); + if (AVT != MVT::i8) + std::tie(Val, ValRegister) = getWidenedValueAndReg(DAG, dl, ValC, AVT); + } + + Chain = DAG.getCopyToReg(Chain, dl, ValRegister, Val, SDValue()); + SDValue InFlag = Chain.getValue(1); + + Chain = DAG.getCopyToReg( + Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, Size, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg( + Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, Dst, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = {Chain, DAG.getValueType(AVT), InFlag}; + return DAG.getNode(X86ISD::REP_STOS, dl, Tys, Ops); + } + + // Otherwise use the libc version so it can use the address value and run + // time information about the CPU. Also check to see if there is + // a specialized entry-point for memory zeroing. ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val); if (const char *bzeroEntry = ValC && @@ -203,16 +311,43 @@ SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src, SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline, MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const { - // This requires the copy size to be a constant, preferably - // within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size); const X86Subtarget &Subtarget = DAG.getMachineFunction().getSubtarget<X86Subtarget>(); - if (!ConstantSize) + // Most options require the copy size to be a constant, preferably + // within a subtarget-specific limit. + if (!ConstantSize || + (!AlwaysInline && + ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold())) { + // When we have a fast REP+MOVS CPU and either have ERMSB or PIC overhead for + // a library call, bypass the library call entirely unless a segment-relative + // address space is used. FIXME: (Align & 4) == 0 isn't 16-byte alignment. + if (Subtarget.hasFastRepStrOps() && DstPtrInfo.getAddrSpace() < 256 && + SrcPtrInfo.getAddrSpace() < 256 && (Subtarget.isPositionIndependent() || + (Subtarget.hasERMSB() && (Align & 4) == 0))) { + EVT AVT; + std::tie(Size, AVT) = getUnscaledSizeAndVT(DAG, dl, Size, Align); + + SDValue InFlag; + Chain = DAG.getCopyToReg( + Chain, dl, Subtarget.is64Bit() ? X86::RCX : X86::ECX, Size, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RDI : X86::EDI, + Dst, InFlag); + InFlag = Chain.getValue(1); + Chain = DAG.getCopyToReg(Chain, dl, Subtarget.is64Bit() ? X86::RSI : X86::ESI, + Src, InFlag); + InFlag = Chain.getValue(1); + + SDVTList Tys = DAG.getVTList(MVT::Other, MVT::Glue); + SDValue Ops[] = { Chain, DAG.getValueType(AVT), InFlag }; + return DAG.getNode(X86ISD::REP_MOVS, dl, Tys, Ops); + } + + // Otherwise give up and let the library call be emitted. return SDValue(); + } RepMovsRepeats Repeats(ConstantSize->getZExtValue()); - if (!AlwaysInline && Repeats.Size > Subtarget.getMaxInlineSizeThreshold()) - return SDValue(); /// If not DWORD aligned, it is more efficient to call the library. However /// if calling the library is not allowed (AlwaysInline), then soldier on as Index: lib/Target/X86/X86Subtarget.h =================================================================== --- lib/Target/X86/X86Subtarget.h +++ lib/Target/X86/X86Subtarget.h @@ -235,6 +235,9 @@ /// True if SHLD based rotate is fast.
bool HasFastSHLDRotate; + /// True if REP MOVS/STOS are fast enough to use in place of a library call. + bool HasFastRepStrOps; + /// True if the processor has enhanced REP MOVSB/STOSB. bool HasERMSB; @@ -487,6 +490,7 @@ bool hasFastVectorFSQRT() const { return HasFastVectorFSQRT; } bool hasFastLZCNT() const { return HasFastLZCNT; } bool hasFastSHLDRotate() const { return HasFastSHLDRotate; } + bool hasFastRepStrOps() const { return HasFastRepStrOps; } bool hasERMSB() const { return HasERMSB; } bool hasSlowDivide32() const { return HasSlowDivide32; } bool hasSlowDivide64() const { return HasSlowDivide64; } Index: lib/Target/X86/X86Subtarget.cpp =================================================================== --- lib/Target/X86/X86Subtarget.cpp +++ lib/Target/X86/X86Subtarget.cpp @@ -327,6 +327,7 @@ HasFastVectorFSQRT = false; HasFastLZCNT = false; HasFastSHLDRotate = false; + HasFastRepStrOps = false; HasERMSB = false; HasSlowDivide32 = false; HasSlowDivide64 = false; Index: test/CodeGen/X86/mem_lowering.ll =================================================================== --- /dev/null +++ test/CodeGen/X86/mem_lowering.ll @@ -0,0 +1,2001 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mcpu=pentium4 | FileCheck %s --check-prefixes=P4,P4-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=pentium4 | FileCheck %s --check-prefixes=P4,P4-PIC + +; RUN: llc < %s -mcpu=nehalem | FileCheck %s --check-prefixes=NHM,NHM-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=nehalem | FileCheck %s --check-prefixes=NHM,NHM-PIC + +; RUN: llc < %s -mcpu=westmere | FileCheck %s --check-prefixes=WSM,WSM-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=westmere | FileCheck %s --check-prefixes=WSM,WSM-PIC + +; RUN: llc < %s -mcpu=sandybridge | FileCheck %s --check-prefixes=SNB,SNB-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=sandybridge | FileCheck %s --check-prefixes=SNB,SNB-PIC + +; RUN: llc < %s -mcpu=ivybridge | FileCheck %s
--check-prefixes=IVB,IVB-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=ivybridge | FileCheck %s --check-prefixes=IVB,IVB-PIC + +; RUN: llc < %s -mcpu=haswell | FileCheck %s --check-prefixes=HSW,HSW-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=haswell | FileCheck %s --check-prefixes=HSW,HSW-PIC + +; RUN: llc < %s -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=SKX,SKX-NOPIC +; RUN: llc < %s -relocation-model=pic -mcpu=skylake-avx512 | FileCheck %s --check-prefixes=SKX,SKX-PIC + +target triple = "x86_64-unknown-linux-gnu" + +define void @zero1(i8* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: zero1: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB0_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: xorl %esi, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB0_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: zero1: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB0_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: xorl %esi, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB0_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: zero1: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB0_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: xorl %esi, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB0_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: zero1: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; 
NHM-PIC-NEXT: jle .LBB0_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: xorl %eax, %eax +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosb +; NHM-PIC-NEXT: .LBB0_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: zero1: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB0_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: xorl %esi, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB0_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: zero1: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB0_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: xorl %eax, %eax +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosb +; WSM-PIC-NEXT: .LBB0_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: zero1: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB0_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: xorl %esi, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB0_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: zero1: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB0_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: xorl %eax, %eax +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosb +; SNB-PIC-NEXT: .LBB0_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: zero1: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB0_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: xorl %eax, %eax +; IVB-NEXT: movq %rsi, %rcx +; 
IVB-NEXT: rep;stosb +; IVB-NEXT: .LBB0_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: zero1: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB0_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: xorl %eax, %eax +; HSW-NEXT: movq %rsi, %rcx +; HSW-NEXT: rep;stosb +; HSW-NEXT: .LBB0_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: zero1: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB0_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: xorl %eax, %eax +; SKX-NEXT: movq %rsi, %rcx +; SKX-NEXT: rep;stosb +; SKX-NEXT: .LBB0_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 0, i64 %size, i32 1, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @zero2(i16* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: zero2: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB1_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: addq %rax, %rax +; P4-NOPIC-NEXT: xorl %esi, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB1_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: zero2: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB1_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: addq %rax, %rax +; P4-PIC-NEXT: xorl %esi, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB1_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: zero2: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, 
%rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB1_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: addq %rax, %rax +; NHM-NOPIC-NEXT: xorl %esi, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB1_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: zero2: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB1_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; NHM-PIC-NEXT: xorl %eax, %eax +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosw +; NHM-PIC-NEXT: .LBB1_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: zero2: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB1_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: addq %rax, %rax +; WSM-NOPIC-NEXT: xorl %esi, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB1_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: zero2: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB1_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: xorl %eax, %eax +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosw +; WSM-PIC-NEXT: .LBB1_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: zero2: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB1_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax 
+; SNB-NOPIC-NEXT: addq %rax, %rax +; SNB-NOPIC-NEXT: xorl %esi, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB1_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: zero2: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB1_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: xorl %eax, %eax +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosw +; SNB-PIC-NEXT: .LBB1_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: zero2: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB1_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rsi +; IVB-NEXT: xorl %eax, %eax +; IVB-NEXT: movq %rsi, %rcx +; IVB-NEXT: rep;stosw +; IVB-NEXT: .LBB1_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: zero2: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB1_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $63, %al +; HSW-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-NEXT: xorl %eax, %eax +; HSW-NEXT: rep;stosw +; HSW-NEXT: .LBB1_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: zero2: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB1_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $63, %al +; SKX-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-NEXT: xorl %eax, %eax +; SKX-NEXT: rep;stosw +; SKX-NEXT: .LBB1_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i16* %ptr to i8* + %0 = shl i64 %size, 1 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 0, i64 %0, i32 2, i1 false) + br label 
%for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @zero4(i32* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: zero4: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB2_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $2, %rax +; P4-NOPIC-NEXT: xorl %esi, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: zero4: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB2_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $2, %rax +; P4-PIC-NEXT: xorl %esi, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: zero4: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB2_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: shlq $2, %rax +; NHM-NOPIC-NEXT: xorl %esi, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: zero4: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB2_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; NHM-PIC-NEXT: xorl %eax, %eax +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosl +; NHM-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: zero4: +; WSM-NOPIC: # 
BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB2_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $2, %rax +; WSM-NOPIC-NEXT: xorl %esi, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: zero4: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB2_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: xorl %eax, %eax +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosl +; WSM-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: zero4: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB2_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $2, %rax +; SNB-NOPIC-NEXT: xorl %esi, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: zero4: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB2_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: xorl %eax, %eax +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosl +; SNB-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-NOPIC-LABEL: zero4: +; IVB-NOPIC: # BB#0: # %entry +; IVB-NOPIC-NEXT: movq %rsi, %rax +; IVB-NOPIC-NEXT: testq %rax, %rax +; IVB-NOPIC-NEXT: jle .LBB2_2 +; IVB-NOPIC-NEXT: # BB#1: # 
%for.body.preheader +; IVB-NOPIC-NEXT: pushq %rax +; IVB-NOPIC-NEXT: shlq $2, %rax +; IVB-NOPIC-NEXT: xorl %esi, %esi +; IVB-NOPIC-NEXT: movq %rax, %rdx +; IVB-NOPIC-NEXT: callq memset +; IVB-NOPIC-NEXT: addq $8, %rsp +; IVB-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; IVB-NOPIC-NEXT: retq +; +; IVB-PIC-LABEL: zero4: +; IVB-PIC: # BB#0: # %entry +; IVB-PIC-NEXT: testq %rsi, %rsi +; IVB-PIC-NEXT: jle .LBB2_2 +; IVB-PIC-NEXT: # BB#1: # %for.body.preheader +; IVB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; IVB-PIC-NEXT: andq %rax, %rsi +; IVB-PIC-NEXT: xorl %eax, %eax +; IVB-PIC-NEXT: movq %rsi, %rcx +; IVB-PIC-NEXT: rep;stosl +; IVB-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; IVB-PIC-NEXT: retq +; +; HSW-NOPIC-LABEL: zero4: +; HSW-NOPIC: # BB#0: # %entry +; HSW-NOPIC-NEXT: movq %rsi, %rax +; HSW-NOPIC-NEXT: testq %rax, %rax +; HSW-NOPIC-NEXT: jle .LBB2_2 +; HSW-NOPIC-NEXT: # BB#1: # %for.body.preheader +; HSW-NOPIC-NEXT: pushq %rax +; HSW-NOPIC-NEXT: shlq $2, %rax +; HSW-NOPIC-NEXT: xorl %esi, %esi +; HSW-NOPIC-NEXT: movq %rax, %rdx +; HSW-NOPIC-NEXT: callq memset +; HSW-NOPIC-NEXT: addq $8, %rsp +; HSW-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; HSW-NOPIC-NEXT: retq +; +; HSW-PIC-LABEL: zero4: +; HSW-PIC: # BB#0: # %entry +; HSW-PIC-NEXT: testq %rsi, %rsi +; HSW-PIC-NEXT: jle .LBB2_2 +; HSW-PIC-NEXT: # BB#1: # %for.body.preheader +; HSW-PIC-NEXT: movb $62, %al +; HSW-PIC-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-PIC-NEXT: xorl %eax, %eax +; HSW-PIC-NEXT: rep;stosl +; HSW-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; HSW-PIC-NEXT: retq +; +; SKX-NOPIC-LABEL: zero4: +; SKX-NOPIC: # BB#0: # %entry +; SKX-NOPIC-NEXT: movq %rsi, %rax +; SKX-NOPIC-NEXT: testq %rax, %rax +; SKX-NOPIC-NEXT: jle .LBB2_2 +; SKX-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SKX-NOPIC-NEXT: pushq %rax +; SKX-NOPIC-NEXT: shlq $2, %rax +; SKX-NOPIC-NEXT: xorl %esi, %esi +; SKX-NOPIC-NEXT: movq %rax, %rdx +; SKX-NOPIC-NEXT: callq memset +; SKX-NOPIC-NEXT: addq $8, %rsp +; 
SKX-NOPIC-NEXT: .LBB2_2: # %for.cond.cleanup +; SKX-NOPIC-NEXT: retq +; +; SKX-PIC-LABEL: zero4: +; SKX-PIC: # BB#0: # %entry +; SKX-PIC-NEXT: testq %rsi, %rsi +; SKX-PIC-NEXT: jle .LBB2_2 +; SKX-PIC-NEXT: # BB#1: # %for.body.preheader +; SKX-PIC-NEXT: movb $62, %al +; SKX-PIC-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-PIC-NEXT: xorl %eax, %eax +; SKX-PIC-NEXT: rep;stosl +; SKX-PIC-NEXT: .LBB2_2: # %for.cond.cleanup +; SKX-PIC-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i32* %ptr to i8* + %0 = shl i64 %size, 2 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 0, i64 %0, i32 4, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @zero8(i64* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: zero8: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB3_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $3, %rax +; P4-NOPIC-NEXT: xorl %esi, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB3_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: zero8: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB3_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $3, %rax +; P4-PIC-NEXT: xorl %esi, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB3_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: zero8: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB3_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: 
shlq $3, %rax +; NHM-NOPIC-NEXT: xorl %esi, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB3_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: zero8: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB3_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; NHM-PIC-NEXT: xorl %eax, %eax +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosq +; NHM-PIC-NEXT: .LBB3_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: zero8: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB3_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $3, %rax +; WSM-NOPIC-NEXT: xorl %esi, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB3_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: zero8: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB3_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: xorl %eax, %eax +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosq +; WSM-PIC-NEXT: .LBB3_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: zero8: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB3_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $3, %rax +; SNB-NOPIC-NEXT: xorl %esi, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; 
SNB-NOPIC-NEXT: .LBB3_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: zero8: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB3_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: xorl %eax, %eax +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosq +; SNB-PIC-NEXT: .LBB3_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: zero8: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB3_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rsi +; IVB-NEXT: xorl %eax, %eax +; IVB-NEXT: movq %rsi, %rcx +; IVB-NEXT: rep;stosq +; IVB-NEXT: .LBB3_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: zero8: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB3_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $61, %al +; HSW-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-NEXT: xorl %eax, %eax +; HSW-NEXT: rep;stosq +; HSW-NEXT: .LBB3_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: zero8: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB3_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $61, %al +; SKX-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-NEXT: xorl %eax, %eax +; SKX-NEXT: rep;stosq +; SKX-NEXT: .LBB3_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i64* %ptr to i8* + %0 = shl i64 %size, 3 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 0, i64 %0, i32 8, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @set1(i8* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: set1: +; P4-NOPIC: # BB#0: # %entry +; 
P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB4_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: movl $15, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB4_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: set1: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB4_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: movl $15, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB4_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: set1: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB4_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: movl $15, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB4_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: set1: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB4_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movb $15, %al +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosb +; NHM-PIC-NEXT: .LBB4_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: set1: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB4_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: movl $15, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB4_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: 
retq +; +; WSM-PIC-LABEL: set1: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB4_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movb $15, %al +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosb +; WSM-PIC-NEXT: .LBB4_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: set1: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB4_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: movl $15, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB4_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: set1: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB4_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movb $15, %al +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosb +; SNB-PIC-NEXT: .LBB4_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: set1: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB4_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movb $15, %al +; IVB-NEXT: movq %rsi, %rcx +; IVB-NEXT: rep;stosb +; IVB-NEXT: .LBB4_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: set1: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB4_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $15, %al +; HSW-NEXT: movq %rsi, %rcx +; HSW-NEXT: rep;stosb +; HSW-NEXT: .LBB4_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: set1: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB4_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $15, %al +; SKX-NEXT: movq %rsi, %rcx +; SKX-NEXT: rep;stosb +; SKX-NEXT: .LBB4_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + 
br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + call void @llvm.memset.p0i8.i64(i8* %ptr, i8 15, i64 %size, i32 1, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @set2(i16* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: set2: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB5_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: addq %rax, %rax +; P4-NOPIC-NEXT: movl $15, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB5_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: set2: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB5_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: addq %rax, %rax +; P4-PIC-NEXT: movl $15, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB5_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: set2: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB5_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: addq %rax, %rax +; NHM-NOPIC-NEXT: movl $15, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB5_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: set2: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB5_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; NHM-PIC-NEXT: movw $3855, %ax # 
imm = 0xF0F +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosw +; NHM-PIC-NEXT: .LBB5_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: set2: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB5_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: addq %rax, %rax +; WSM-NOPIC-NEXT: movl $15, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB5_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: set2: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB5_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: movw $3855, %ax # imm = 0xF0F +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosw +; WSM-PIC-NEXT: .LBB5_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: set2: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB5_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: addq %rax, %rax +; SNB-NOPIC-NEXT: movl $15, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB5_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: set2: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB5_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: movw $3855, %ax # imm = 0xF0F +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosw +; SNB-PIC-NEXT: .LBB5_2: # %for.cond.cleanup +; 
SNB-PIC-NEXT: retq +; +; IVB-LABEL: set2: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB5_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rsi +; IVB-NEXT: movw $3855, %ax # imm = 0xF0F +; IVB-NEXT: movq %rsi, %rcx +; IVB-NEXT: rep;stosw +; IVB-NEXT: .LBB5_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: set2: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB5_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $63, %al +; HSW-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-NEXT: movw $3855, %ax # imm = 0xF0F +; HSW-NEXT: rep;stosw +; HSW-NEXT: .LBB5_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: set2: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB5_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $63, %al +; SKX-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-NEXT: movw $3855, %ax # imm = 0xF0F +; SKX-NEXT: rep;stosw +; SKX-NEXT: .LBB5_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i16* %ptr to i8* + %0 = shl i64 %size, 1 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 15, i64 %0, i32 2, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @set4(i32* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: set4: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB6_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $2, %rax +; P4-NOPIC-NEXT: movl $15, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: set4: +; P4-PIC: # BB#0: # 
%entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB6_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $2, %rax +; P4-PIC-NEXT: movl $15, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: set4: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB6_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: shlq $2, %rax +; NHM-NOPIC-NEXT: movl $15, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: set4: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB6_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; NHM-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosl +; NHM-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: set4: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB6_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $2, %rax +; WSM-NOPIC-NEXT: movl $15, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: set4: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB6_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; 
WSM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosl +; WSM-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: set4: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB6_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $2, %rax +; SNB-NOPIC-NEXT: movl $15, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: set4: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB6_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosl +; SNB-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-NOPIC-LABEL: set4: +; IVB-NOPIC: # BB#0: # %entry +; IVB-NOPIC-NEXT: movq %rsi, %rax +; IVB-NOPIC-NEXT: testq %rax, %rax +; IVB-NOPIC-NEXT: jle .LBB6_2 +; IVB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; IVB-NOPIC-NEXT: pushq %rax +; IVB-NOPIC-NEXT: shlq $2, %rax +; IVB-NOPIC-NEXT: movl $15, %esi +; IVB-NOPIC-NEXT: movq %rax, %rdx +; IVB-NOPIC-NEXT: callq memset +; IVB-NOPIC-NEXT: addq $8, %rsp +; IVB-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; IVB-NOPIC-NEXT: retq +; +; IVB-PIC-LABEL: set4: +; IVB-PIC: # BB#0: # %entry +; IVB-PIC-NEXT: testq %rsi, %rsi +; IVB-PIC-NEXT: jle .LBB6_2 +; IVB-PIC-NEXT: # BB#1: # %for.body.preheader +; IVB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; IVB-PIC-NEXT: andq %rax, %rsi +; 
IVB-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; IVB-PIC-NEXT: movq %rsi, %rcx +; IVB-PIC-NEXT: rep;stosl +; IVB-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; IVB-PIC-NEXT: retq +; +; HSW-NOPIC-LABEL: set4: +; HSW-NOPIC: # BB#0: # %entry +; HSW-NOPIC-NEXT: movq %rsi, %rax +; HSW-NOPIC-NEXT: testq %rax, %rax +; HSW-NOPIC-NEXT: jle .LBB6_2 +; HSW-NOPIC-NEXT: # BB#1: # %for.body.preheader +; HSW-NOPIC-NEXT: pushq %rax +; HSW-NOPIC-NEXT: shlq $2, %rax +; HSW-NOPIC-NEXT: movl $15, %esi +; HSW-NOPIC-NEXT: movq %rax, %rdx +; HSW-NOPIC-NEXT: callq memset +; HSW-NOPIC-NEXT: addq $8, %rsp +; HSW-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; HSW-NOPIC-NEXT: retq +; +; HSW-PIC-LABEL: set4: +; HSW-PIC: # BB#0: # %entry +; HSW-PIC-NEXT: testq %rsi, %rsi +; HSW-PIC-NEXT: jle .LBB6_2 +; HSW-PIC-NEXT: # BB#1: # %for.body.preheader +; HSW-PIC-NEXT: movb $62, %al +; HSW-PIC-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; HSW-PIC-NEXT: rep;stosl +; HSW-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; HSW-PIC-NEXT: retq +; +; SKX-NOPIC-LABEL: set4: +; SKX-NOPIC: # BB#0: # %entry +; SKX-NOPIC-NEXT: movq %rsi, %rax +; SKX-NOPIC-NEXT: testq %rax, %rax +; SKX-NOPIC-NEXT: jle .LBB6_2 +; SKX-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SKX-NOPIC-NEXT: pushq %rax +; SKX-NOPIC-NEXT: shlq $2, %rax +; SKX-NOPIC-NEXT: movl $15, %esi +; SKX-NOPIC-NEXT: movq %rax, %rdx +; SKX-NOPIC-NEXT: callq memset +; SKX-NOPIC-NEXT: addq $8, %rsp +; SKX-NOPIC-NEXT: .LBB6_2: # %for.cond.cleanup +; SKX-NOPIC-NEXT: retq +; +; SKX-PIC-LABEL: set4: +; SKX-PIC: # BB#0: # %entry +; SKX-PIC-NEXT: testq %rsi, %rsi +; SKX-PIC-NEXT: jle .LBB6_2 +; SKX-PIC-NEXT: # BB#1: # %for.body.preheader +; SKX-PIC-NEXT: movb $62, %al +; SKX-PIC-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-PIC-NEXT: movl $252645135, %eax # imm = 0xF0F0F0F +; SKX-PIC-NEXT: rep;stosl +; SKX-PIC-NEXT: .LBB6_2: # %for.cond.cleanup +; SKX-PIC-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label 
%for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i32* %ptr to i8* + %0 = shl i64 %size, 2 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 15, i64 %0, i32 4, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @set8(i64* nocapture %ptr, i64 %size) nounwind { +; P4-NOPIC-LABEL: set8: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: movq %rsi, %rax +; P4-NOPIC-NEXT: testq %rax, %rax +; P4-NOPIC-NEXT: jle .LBB7_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $3, %rax +; P4-NOPIC-NEXT: movl $15, %esi +; P4-NOPIC-NEXT: movq %rax, %rdx +; P4-NOPIC-NEXT: callq memset +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB7_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: set8: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: movq %rsi, %rax +; P4-PIC-NEXT: testq %rax, %rax +; P4-PIC-NEXT: jle .LBB7_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $3, %rax +; P4-PIC-NEXT: movl $15, %esi +; P4-PIC-NEXT: movq %rax, %rdx +; P4-PIC-NEXT: callq memset@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB7_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: set8: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: movq %rsi, %rax +; NHM-NOPIC-NEXT: testq %rax, %rax +; NHM-NOPIC-NEXT: jle .LBB7_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: shlq $3, %rax +; NHM-NOPIC-NEXT: movl $15, %esi +; NHM-NOPIC-NEXT: movq %rax, %rdx +; NHM-NOPIC-NEXT: callq memset +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB7_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: set8: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rsi, %rsi +; NHM-PIC-NEXT: jle .LBB7_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rsi +; 
NHM-PIC-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; NHM-PIC-NEXT: movq %rsi, %rcx +; NHM-PIC-NEXT: rep;stosq +; NHM-PIC-NEXT: .LBB7_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: set8: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: movq %rsi, %rax +; WSM-NOPIC-NEXT: testq %rax, %rax +; WSM-NOPIC-NEXT: jle .LBB7_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $3, %rax +; WSM-NOPIC-NEXT: movl $15, %esi +; WSM-NOPIC-NEXT: movq %rax, %rdx +; WSM-NOPIC-NEXT: callq memset +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB7_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: set8: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rsi, %rsi +; WSM-PIC-NEXT: jle .LBB7_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rsi +; WSM-PIC-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; WSM-PIC-NEXT: movq %rsi, %rcx +; WSM-PIC-NEXT: rep;stosq +; WSM-PIC-NEXT: .LBB7_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: set8: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: movq %rsi, %rax +; SNB-NOPIC-NEXT: testq %rax, %rax +; SNB-NOPIC-NEXT: jle .LBB7_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $3, %rax +; SNB-NOPIC-NEXT: movl $15, %esi +; SNB-NOPIC-NEXT: movq %rax, %rdx +; SNB-NOPIC-NEXT: callq memset +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB7_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: set8: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rsi, %rsi +; SNB-PIC-NEXT: jle .LBB7_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rsi +; SNB-PIC-NEXT: movabsq $1085102592571150095, %rax # imm = 
0xF0F0F0F0F0F0F0F +; SNB-PIC-NEXT: movq %rsi, %rcx +; SNB-PIC-NEXT: rep;stosq +; SNB-PIC-NEXT: .LBB7_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: set8: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rsi, %rsi +; IVB-NEXT: jle .LBB7_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rsi +; IVB-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; IVB-NEXT: movq %rsi, %rcx +; IVB-NEXT: rep;stosq +; IVB-NEXT: .LBB7_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: set8: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rsi, %rsi +; HSW-NEXT: jle .LBB7_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $61, %al +; HSW-NEXT: bzhiq %rax, %rsi, %rcx +; HSW-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; HSW-NEXT: rep;stosq +; HSW-NEXT: .LBB7_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: set8: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rsi, %rsi +; SKX-NEXT: jle .LBB7_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $61, %al +; SKX-NEXT: bzhiq %rax, %rsi, %rcx +; SKX-NEXT: movabsq $1085102592571150095, %rax # imm = 0xF0F0F0F0F0F0F0F +; SKX-NEXT: rep;stosq +; SKX-NEXT: .LBB7_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp5 = icmp sgt i64 %size, 0 + br i1 %cmp5, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %ptr8 = bitcast i64* %ptr to i8* + %0 = shl i64 %size, 3 + call void @llvm.memset.p0i8.i64(i8* %ptr8, i8 15, i64 %0, i32 8, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @copy1(i8* noalias nocapture %dst, i8* noalias nocapture readonly %src, i64 %size) nounwind { +; P4-NOPIC-LABEL: copy1: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: testq %rdx, %rdx +; P4-NOPIC-NEXT: jle .LBB8_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: callq memcpy +; P4-NOPIC-NEXT: 
addq $8, %rsp +; P4-NOPIC-NEXT: .LBB8_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: copy1: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: testq %rdx, %rdx +; P4-PIC-NEXT: jle .LBB8_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: callq memcpy@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB8_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: copy1: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: testq %rdx, %rdx +; NHM-NOPIC-NEXT: jle .LBB8_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: callq memcpy +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB8_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: copy1: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rdx, %rdx +; NHM-PIC-NEXT: jle .LBB8_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movq %rdx, %rcx +; NHM-PIC-NEXT: rep;movsb +; NHM-PIC-NEXT: .LBB8_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: copy1: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: testq %rdx, %rdx +; WSM-NOPIC-NEXT: jle .LBB8_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: callq memcpy +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB8_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: copy1: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rdx, %rdx +; WSM-PIC-NEXT: jle .LBB8_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movq %rdx, %rcx +; WSM-PIC-NEXT: rep;movsb +; WSM-PIC-NEXT: .LBB8_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: copy1: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: testq %rdx, %rdx +; SNB-NOPIC-NEXT: jle .LBB8_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: callq memcpy +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB8_2: # %for.cond.cleanup 
+; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: copy1: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rdx, %rdx +; SNB-PIC-NEXT: jle .LBB8_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movq %rdx, %rcx +; SNB-PIC-NEXT: rep;movsb +; SNB-PIC-NEXT: .LBB8_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: copy1: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rdx, %rdx +; IVB-NEXT: jle .LBB8_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movq %rdx, %rcx +; IVB-NEXT: rep;movsb +; IVB-NEXT: .LBB8_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: copy1: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rdx, %rdx +; HSW-NEXT: jle .LBB8_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movq %rdx, %rcx +; HSW-NEXT: rep;movsb +; HSW-NEXT: .LBB8_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: copy1: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rdx, %rdx +; SKX-NEXT: jle .LBB8_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movq %rdx, %rcx +; SKX-NEXT: rep;movsb +; SKX-NEXT: .LBB8_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp8 = icmp sgt i64 %size, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst, i8* %src, i64 %size, i32 1, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @copy2(i16* noalias nocapture %dst, i16* noalias nocapture readonly %src, i64 %size) nounwind { +; P4-NOPIC-LABEL: copy2: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: testq %rdx, %rdx +; P4-NOPIC-NEXT: jle .LBB9_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: addq %rdx, %rdx +; P4-NOPIC-NEXT: callq memcpy +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB9_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: copy2: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: testq %rdx, %rdx +; P4-PIC-NEXT: jle .LBB9_2 +; P4-PIC-NEXT: # BB#1: # 
%for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: addq %rdx, %rdx +; P4-PIC-NEXT: callq memcpy@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB9_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: copy2: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: testq %rdx, %rdx +; NHM-NOPIC-NEXT: jle .LBB9_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: addq %rdx, %rdx +; NHM-NOPIC-NEXT: callq memcpy +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB9_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: copy2: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rdx, %rdx +; NHM-PIC-NEXT: jle .LBB9_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rdx +; NHM-PIC-NEXT: movq %rdx, %rcx +; NHM-PIC-NEXT: rep;movsw +; NHM-PIC-NEXT: .LBB9_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: copy2: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: testq %rdx, %rdx +; WSM-NOPIC-NEXT: jle .LBB9_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: addq %rdx, %rdx +; WSM-NOPIC-NEXT: callq memcpy +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB9_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: copy2: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rdx, %rdx +; WSM-PIC-NEXT: jle .LBB9_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rdx +; WSM-PIC-NEXT: movq %rdx, %rcx +; WSM-PIC-NEXT: rep;movsw +; WSM-PIC-NEXT: .LBB9_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: copy2: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: testq %rdx, %rdx +; SNB-NOPIC-NEXT: jle .LBB9_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; 
SNB-NOPIC-NEXT: addq %rdx, %rdx +; SNB-NOPIC-NEXT: callq memcpy +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB9_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: copy2: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rdx, %rdx +; SNB-PIC-NEXT: jle .LBB9_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rdx +; SNB-PIC-NEXT: movq %rdx, %rcx +; SNB-PIC-NEXT: rep;movsw +; SNB-PIC-NEXT: .LBB9_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: copy2: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rdx, %rdx +; IVB-NEXT: jle .LBB9_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $9223372036854775807, %rax # imm = 0x7FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rdx +; IVB-NEXT: movq %rdx, %rcx +; IVB-NEXT: rep;movsw +; IVB-NEXT: .LBB9_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: copy2: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rdx, %rdx +; HSW-NEXT: jle .LBB9_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $63, %al +; HSW-NEXT: bzhiq %rax, %rdx, %rcx +; HSW-NEXT: rep;movsw +; HSW-NEXT: .LBB9_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: copy2: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rdx, %rdx +; SKX-NEXT: jle .LBB9_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $63, %al +; SKX-NEXT: bzhiq %rax, %rdx, %rcx +; SKX-NEXT: rep;movsw +; SKX-NEXT: .LBB9_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp8 = icmp sgt i64 %size, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %src12 = bitcast i16* %src to i8* + %dst11 = bitcast i16* %dst to i8* + %0 = shl i64 %size, 1 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst11, i8* %src12, i64 %0, i32 2, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @copy4(i32* noalias nocapture %dst, i32* noalias nocapture readonly %src, i64 
%size) nounwind { +; P4-NOPIC-LABEL: copy4: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: testq %rdx, %rdx +; P4-NOPIC-NEXT: jle .LBB10_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $2, %rdx +; P4-NOPIC-NEXT: callq memcpy +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: copy4: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: testq %rdx, %rdx +; P4-PIC-NEXT: jle .LBB10_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $2, %rdx +; P4-PIC-NEXT: callq memcpy@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: copy4: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: testq %rdx, %rdx +; NHM-NOPIC-NEXT: jle .LBB10_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: shlq $2, %rdx +; NHM-NOPIC-NEXT: callq memcpy +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: copy4: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rdx, %rdx +; NHM-PIC-NEXT: jle .LBB10_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rdx +; NHM-PIC-NEXT: movq %rdx, %rcx +; NHM-PIC-NEXT: rep;movsl +; NHM-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: copy4: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: testq %rdx, %rdx +; WSM-NOPIC-NEXT: jle .LBB10_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $2, %rdx +; WSM-NOPIC-NEXT: callq memcpy +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: copy4: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rdx, %rdx +; 
WSM-PIC-NEXT: jle .LBB10_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rdx +; WSM-PIC-NEXT: movq %rdx, %rcx +; WSM-PIC-NEXT: rep;movsl +; WSM-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: copy4: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: testq %rdx, %rdx +; SNB-NOPIC-NEXT: jle .LBB10_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $2, %rdx +; SNB-NOPIC-NEXT: callq memcpy +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: copy4: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rdx, %rdx +; SNB-PIC-NEXT: jle .LBB10_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rdx +; SNB-PIC-NEXT: movq %rdx, %rcx +; SNB-PIC-NEXT: rep;movsl +; SNB-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-NOPIC-LABEL: copy4: +; IVB-NOPIC: # BB#0: # %entry +; IVB-NOPIC-NEXT: testq %rdx, %rdx +; IVB-NOPIC-NEXT: jle .LBB10_2 +; IVB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; IVB-NOPIC-NEXT: pushq %rax +; IVB-NOPIC-NEXT: shlq $2, %rdx +; IVB-NOPIC-NEXT: callq memcpy +; IVB-NOPIC-NEXT: addq $8, %rsp +; IVB-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; IVB-NOPIC-NEXT: retq +; +; IVB-PIC-LABEL: copy4: +; IVB-PIC: # BB#0: # %entry +; IVB-PIC-NEXT: testq %rdx, %rdx +; IVB-PIC-NEXT: jle .LBB10_2 +; IVB-PIC-NEXT: # BB#1: # %for.body.preheader +; IVB-PIC-NEXT: movabsq $4611686018427387903, %rax # imm = 0x3FFFFFFFFFFFFFFF +; IVB-PIC-NEXT: andq %rax, %rdx +; IVB-PIC-NEXT: movq %rdx, %rcx +; IVB-PIC-NEXT: rep;movsl +; IVB-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; IVB-PIC-NEXT: retq +; +; HSW-NOPIC-LABEL: copy4: +; HSW-NOPIC: # BB#0: # %entry +; HSW-NOPIC-NEXT: testq %rdx, %rdx +; 
HSW-NOPIC-NEXT: jle .LBB10_2 +; HSW-NOPIC-NEXT: # BB#1: # %for.body.preheader +; HSW-NOPIC-NEXT: pushq %rax +; HSW-NOPIC-NEXT: shlq $2, %rdx +; HSW-NOPIC-NEXT: callq memcpy +; HSW-NOPIC-NEXT: addq $8, %rsp +; HSW-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; HSW-NOPIC-NEXT: retq +; +; HSW-PIC-LABEL: copy4: +; HSW-PIC: # BB#0: # %entry +; HSW-PIC-NEXT: testq %rdx, %rdx +; HSW-PIC-NEXT: jle .LBB10_2 +; HSW-PIC-NEXT: # BB#1: # %for.body.preheader +; HSW-PIC-NEXT: movb $62, %al +; HSW-PIC-NEXT: bzhiq %rax, %rdx, %rcx +; HSW-PIC-NEXT: rep;movsl +; HSW-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; HSW-PIC-NEXT: retq +; +; SKX-NOPIC-LABEL: copy4: +; SKX-NOPIC: # BB#0: # %entry +; SKX-NOPIC-NEXT: testq %rdx, %rdx +; SKX-NOPIC-NEXT: jle .LBB10_2 +; SKX-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SKX-NOPIC-NEXT: pushq %rax +; SKX-NOPIC-NEXT: shlq $2, %rdx +; SKX-NOPIC-NEXT: callq memcpy +; SKX-NOPIC-NEXT: addq $8, %rsp +; SKX-NOPIC-NEXT: .LBB10_2: # %for.cond.cleanup +; SKX-NOPIC-NEXT: retq +; +; SKX-PIC-LABEL: copy4: +; SKX-PIC: # BB#0: # %entry +; SKX-PIC-NEXT: testq %rdx, %rdx +; SKX-PIC-NEXT: jle .LBB10_2 +; SKX-PIC-NEXT: # BB#1: # %for.body.preheader +; SKX-PIC-NEXT: movb $62, %al +; SKX-PIC-NEXT: bzhiq %rax, %rdx, %rcx +; SKX-PIC-NEXT: rep;movsl +; SKX-PIC-NEXT: .LBB10_2: # %for.cond.cleanup +; SKX-PIC-NEXT: retq +entry: + %cmp8 = icmp sgt i64 %size, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %src12 = bitcast i32* %src to i8* + %dst11 = bitcast i32* %dst to i8* + %0 = shl i64 %size, 2 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst11, i8* %src12, i64 %0, i32 4, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +define void @copy8(i64* noalias nocapture %dst, i64* noalias nocapture readonly %src, i64 %size) nounwind { +; P4-NOPIC-LABEL: copy8: +; P4-NOPIC: # BB#0: # %entry +; P4-NOPIC-NEXT: testq %rdx, %rdx +; P4-NOPIC-NEXT: jle .LBB11_2 +; P4-NOPIC-NEXT: # BB#1: # %for.body.preheader +; 
P4-NOPIC-NEXT: pushq %rax +; P4-NOPIC-NEXT: shlq $3, %rdx +; P4-NOPIC-NEXT: callq memcpy +; P4-NOPIC-NEXT: addq $8, %rsp +; P4-NOPIC-NEXT: .LBB11_2: # %for.cond.cleanup +; P4-NOPIC-NEXT: retq +; +; P4-PIC-LABEL: copy8: +; P4-PIC: # BB#0: # %entry +; P4-PIC-NEXT: testq %rdx, %rdx +; P4-PIC-NEXT: jle .LBB11_2 +; P4-PIC-NEXT: # BB#1: # %for.body.preheader +; P4-PIC-NEXT: pushq %rax +; P4-PIC-NEXT: shlq $3, %rdx +; P4-PIC-NEXT: callq memcpy@PLT +; P4-PIC-NEXT: addq $8, %rsp +; P4-PIC-NEXT: .LBB11_2: # %for.cond.cleanup +; P4-PIC-NEXT: retq +; +; NHM-NOPIC-LABEL: copy8: +; NHM-NOPIC: # BB#0: # %entry +; NHM-NOPIC-NEXT: testq %rdx, %rdx +; NHM-NOPIC-NEXT: jle .LBB11_2 +; NHM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; NHM-NOPIC-NEXT: pushq %rax +; NHM-NOPIC-NEXT: shlq $3, %rdx +; NHM-NOPIC-NEXT: callq memcpy +; NHM-NOPIC-NEXT: addq $8, %rsp +; NHM-NOPIC-NEXT: .LBB11_2: # %for.cond.cleanup +; NHM-NOPIC-NEXT: retq +; +; NHM-PIC-LABEL: copy8: +; NHM-PIC: # BB#0: # %entry +; NHM-PIC-NEXT: testq %rdx, %rdx +; NHM-PIC-NEXT: jle .LBB11_2 +; NHM-PIC-NEXT: # BB#1: # %for.body.preheader +; NHM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; NHM-PIC-NEXT: andq %rax, %rdx +; NHM-PIC-NEXT: movq %rdx, %rcx +; NHM-PIC-NEXT: rep;movsq +; NHM-PIC-NEXT: .LBB11_2: # %for.cond.cleanup +; NHM-PIC-NEXT: retq +; +; WSM-NOPIC-LABEL: copy8: +; WSM-NOPIC: # BB#0: # %entry +; WSM-NOPIC-NEXT: testq %rdx, %rdx +; WSM-NOPIC-NEXT: jle .LBB11_2 +; WSM-NOPIC-NEXT: # BB#1: # %for.body.preheader +; WSM-NOPIC-NEXT: pushq %rax +; WSM-NOPIC-NEXT: shlq $3, %rdx +; WSM-NOPIC-NEXT: callq memcpy +; WSM-NOPIC-NEXT: addq $8, %rsp +; WSM-NOPIC-NEXT: .LBB11_2: # %for.cond.cleanup +; WSM-NOPIC-NEXT: retq +; +; WSM-PIC-LABEL: copy8: +; WSM-PIC: # BB#0: # %entry +; WSM-PIC-NEXT: testq %rdx, %rdx +; WSM-PIC-NEXT: jle .LBB11_2 +; WSM-PIC-NEXT: # BB#1: # %for.body.preheader +; WSM-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; WSM-PIC-NEXT: andq %rax, %rdx +; 
WSM-PIC-NEXT: movq %rdx, %rcx +; WSM-PIC-NEXT: rep;movsq +; WSM-PIC-NEXT: .LBB11_2: # %for.cond.cleanup +; WSM-PIC-NEXT: retq +; +; SNB-NOPIC-LABEL: copy8: +; SNB-NOPIC: # BB#0: # %entry +; SNB-NOPIC-NEXT: testq %rdx, %rdx +; SNB-NOPIC-NEXT: jle .LBB11_2 +; SNB-NOPIC-NEXT: # BB#1: # %for.body.preheader +; SNB-NOPIC-NEXT: pushq %rax +; SNB-NOPIC-NEXT: shlq $3, %rdx +; SNB-NOPIC-NEXT: callq memcpy +; SNB-NOPIC-NEXT: addq $8, %rsp +; SNB-NOPIC-NEXT: .LBB11_2: # %for.cond.cleanup +; SNB-NOPIC-NEXT: retq +; +; SNB-PIC-LABEL: copy8: +; SNB-PIC: # BB#0: # %entry +; SNB-PIC-NEXT: testq %rdx, %rdx +; SNB-PIC-NEXT: jle .LBB11_2 +; SNB-PIC-NEXT: # BB#1: # %for.body.preheader +; SNB-PIC-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; SNB-PIC-NEXT: andq %rax, %rdx +; SNB-PIC-NEXT: movq %rdx, %rcx +; SNB-PIC-NEXT: rep;movsq +; SNB-PIC-NEXT: .LBB11_2: # %for.cond.cleanup +; SNB-PIC-NEXT: retq +; +; IVB-LABEL: copy8: +; IVB: # BB#0: # %entry +; IVB-NEXT: testq %rdx, %rdx +; IVB-NEXT: jle .LBB11_2 +; IVB-NEXT: # BB#1: # %for.body.preheader +; IVB-NEXT: movabsq $2305843009213693951, %rax # imm = 0x1FFFFFFFFFFFFFFF +; IVB-NEXT: andq %rax, %rdx +; IVB-NEXT: movq %rdx, %rcx +; IVB-NEXT: rep;movsq +; IVB-NEXT: .LBB11_2: # %for.cond.cleanup +; IVB-NEXT: retq +; +; HSW-LABEL: copy8: +; HSW: # BB#0: # %entry +; HSW-NEXT: testq %rdx, %rdx +; HSW-NEXT: jle .LBB11_2 +; HSW-NEXT: # BB#1: # %for.body.preheader +; HSW-NEXT: movb $61, %al +; HSW-NEXT: bzhiq %rax, %rdx, %rcx +; HSW-NEXT: rep;movsq +; HSW-NEXT: .LBB11_2: # %for.cond.cleanup +; HSW-NEXT: retq +; +; SKX-LABEL: copy8: +; SKX: # BB#0: # %entry +; SKX-NEXT: testq %rdx, %rdx +; SKX-NEXT: jle .LBB11_2 +; SKX-NEXT: # BB#1: # %for.body.preheader +; SKX-NEXT: movb $61, %al +; SKX-NEXT: bzhiq %rax, %rdx, %rcx +; SKX-NEXT: rep;movsq +; SKX-NEXT: .LBB11_2: # %for.cond.cleanup +; SKX-NEXT: retq +entry: + %cmp8 = icmp sgt i64 %size, 0 + br i1 %cmp8, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + 
%src12 = bitcast i64* %src to i8* + %dst11 = bitcast i64* %dst to i8* + %0 = shl i64 %size, 3 + call void @llvm.memcpy.p0i8.p0i8.i64(i8* %dst11, i8* %src12, i64 %0, i32 8, i1 false) + br label %for.cond.cleanup + +for.cond.cleanup: + ret void +} + +declare void @llvm.memset.p0i8.i64(i8* nocapture writeonly, i8, i64, i32, i1) + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture writeonly, i8* nocapture readonly, i64, i32, i1) Index: test/CodeGen/X86/memcpy-struct-by-value.ll =================================================================== --- test/CodeGen/X86/memcpy-struct-by-value.ll +++ test/CodeGen/X86/memcpy-struct-by-value.ll @@ -3,11 +3,9 @@ ; RUN: llc -mtriple=i686-linux-gnu -mattr=-ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST32 ; RUN: llc -mtriple=i686-linux-gnu -mattr=+ermsb < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=generic < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST +; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=haswell < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST ; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=skylake < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=FAST -; FIXME: The documentation states that ivybridge has ermsb, but this is not -; enabled right now since I could not confirm by testing. -; RUN: llc -mtriple=x86_64-linux-gnu -mcpu=ivybridge < %s -o - | FileCheck %s --check-prefix=ALL --check-prefix=NOFAST %struct.large = type { [4096 x i8] }