Index: include/llvm/Target/TargetLowering.h =================================================================== --- include/llvm/Target/TargetLowering.h +++ include/llvm/Target/TargetLowering.h @@ -690,6 +690,25 @@ return OptSize ? MaxStoresPerMemmoveOptSize : MaxStoresPerMemmove; } + /// \brief Determine if constant value used in llvm.memset should be placed + /// into register. + /// + /// \param OptSize If True the code generation is optimized for code + /// size, otherwise - for performance. + /// \param MemOps Type of each store operation. + /// \return True if moving memset value into a register is profitable. + /// + /// If llvm.memset is replaced by a series of store operations and the memset + /// value is constant, on some platforms the store operations may be + /// implemented as moves of immediate operand to memory, or the memset value + /// may be stored in a register and then used in moves from register to + /// memory. This function determines which way is more profitable. + virtual bool putMemsetImmediateToRegister(bool OptSize, + std::vector<EVT> &MemOps) const { + // By default use previous behavior. + return false; + } + /// \brief Determine if the target supports unaligned memory accesses. /// /// This function returns true if the target allows unaligned memory accesses. Index: lib/CodeGen/SelectionDAG/SelectionDAG.cpp =================================================================== --- lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/DebugInfo.h" #include "llvm/IR/CallingConv.h" #include "llvm/IR/Constants.h" @@ -3811,6 +3812,24 @@ &OutChains[0], OutChains.size()); } +/// \brief Lower the call to 'memset' intrinsic function into a series of store +/// operations. 
+/// +/// \param DAG Selection DAG where lowered code is placed. +/// \param dl Link to corresponding IR location. +/// \param Chain Control flow dependency. +/// \param Dst Pointer to destination memory location. +/// \param Src Value of byte to write into the memory. +/// \param Size Number of bytes to write. +/// \param Align Alignment of the destination in bytes. +/// \param isVol True if destination is volatile. +/// \param DstPtrInfo IR information on the memory pointer. +/// \returns New head in the control flow, if lowering was successful, empty +/// SDValue otherwise. +/// +/// The function tries to replace 'llvm.memset' intrinsic with several store +/// operations and value calculation code. This is usually profitable for small +/// memory size. static SDValue getMemsetStores(SelectionDAG &DAG, SDLoc dl, SDValue Chain, SDValue Dst, SDValue Src, uint64_t Size, @@ -3861,6 +3880,21 @@ LargestVT = MemOps[i]; SDValue MemSetValue = getMemsetValue(Src, LargestVT, DAG, dl); + // If profitable, put constant memset value in a register. + SDValue LongestValueInReg; + unsigned ValueReg = 0; + if (isa<ConstantSDNode>(MemSetValue) && + TLI.isTypeLegal(LargestVT) && + TLI.putMemsetImmediateToRegister(OptSize, MemOps)) { + const TargetLowering *TL = DAG.getTarget().getTargetLowering(); + const TargetRegisterClass* TRC = TL->getRegClassFor(LargestVT.getSimpleVT()); + ValueReg = MF.getRegInfo().createVirtualRegister(TRC); + Chain = DAG.getCopyToReg(Chain, dl, ValueReg, MemSetValue); + LongestValueInReg = DAG.getRegister(ValueReg, LargestVT); + } else + LongestValueInReg = MemSetValue; + SDValue RegInitialization = Chain; + for (unsigned i = 0; i < NumMemOps; i++) { EVT VT = MemOps[i]; unsigned VTSize = VT.getSizeInBits() / 8; @@ -3873,11 +3907,14 @@ // If this store is smaller than the largest store see whether we can get // the smaller value for free with a truncate. 
- SDValue Value = MemSetValue; + SDValue Value = LongestValueInReg; if (VT.bitsLT(LargestVT)) { if (!LargestVT.isVector() && !VT.isVector() && - TLI.isTruncateFree(LargestVT, VT)) - Value = DAG.getNode(ISD::TRUNCATE, dl, VT, MemSetValue); + TLI.isTruncateFree(LargestVT, VT)) { + if (ValueReg != 0) + Value = DAG.getCopyFromReg(RegInitialization, dl, ValueReg, LargestVT); + Value = DAG.getNode(ISD::TRUNCATE, dl, VT, Value); + } else Value = getMemsetValue(Src, VT, DAG, dl); } Index: lib/Target/X86/X86ISelLowering.h =================================================================== --- lib/Target/X86/X86ISelLowering.h +++ lib/Target/X86/X86ISelLowering.h @@ -572,6 +572,9 @@ /// legal as the hook is used before type legalization. virtual bool isSafeMemOpType(MVT VT) const; + virtual bool putMemsetImmediateToRegister(bool OptSize, + std::vector<EVT> &MemOps) const; + /// allowsUnalignedMemoryAccesses - Returns true if the target allows /// unaligned memory accesses. of the specified type. Returns whether it /// is "fast" by reference in the second argument. 
Index: lib/Target/X86/X86ISelLowering.cpp =================================================================== --- lib/Target/X86/X86ISelLowering.cpp +++ lib/Target/X86/X86ISelLowering.cpp @@ -1639,6 +1639,11 @@ return true; } +bool X86TargetLowering::putMemsetImmediateToRegister(bool OptSize, + std::vector<EVT> &MemOps) const { + return MemOps.size() > 1; +} + bool X86TargetLowering::allowsUnalignedMemoryAccesses(EVT VT, bool *Fast) const { if (Fast) Index: test/CodeGen/X86/memset-sse-stack-realignment.ll =================================================================== --- test/CodeGen/X86/memset-sse-stack-realignment.ll +++ test/CodeGen/X86/memset-sse-stack-realignment.ll @@ -16,7 +16,7 @@ ; NOSSE-LABEL: test1: ; NOSSE-NOT: and -; NOSSE: movl $0 +; NOSSE: movl %e{{[a-d]}}x, ; SSE1-LABEL: test1: ; SSE1: andl $-16 @@ -49,7 +49,7 @@ ; NOSSE-LABEL: test2: ; NOSSE-NOT: and -; NOSSE: movl $0 +; NOSSE: movl %e{{[a-d]}}x, ; SSE1-LABEL: test2: ; SSE1: andl $-16 Index: test/CodeGen/X86/memset.ll =================================================================== --- test/CodeGen/X86/memset.ll +++ test/CodeGen/X86/memset.ll @@ -11,16 +11,14 @@ %tmp110117 = bitcast [8 x %struct.x]* %up_mvd to i8* ; <i8*> [#uses=1] call void @llvm.memset.p0i8.i64(i8* %tmp110117, i8 0, i64 32, i32 8, i1 false) -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86: movl $0, -; X86-NOT: movl $0, -; X86: ret +; X86: movl [[REGISTER:%e[a-d]x]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], +; X86: movl [[REGISTER]], ; XMM: xorps %xmm{{[0-9]+}}, [[Z:%xmm[0-9]+]] ; XMM: movaps [[Z]], @@ -45,11 +43,10 @@ ; Ensure that alignment of '0' in an @llvm.memset intrinsic results in ; unaligned loads and stores. 
; XMM: PR15348 -; XMM: movb $0, -; XMM: movl $0, -; XMM: movl $0, -; XMM: movl $0, -; XMM: movl $0, +; XMM: movl [[REGISTER:%e[a-d]x]], +; XMM: movl [[REGISTER]], +; XMM: movl [[REGISTER]], +; XMM: movl [[REGISTER]], call void @llvm.memset.p0i8.i64(i8* %a, i8 0, i64 17, i32 0, i1 false) ret void } Index: test/CodeGen/X86/tlv-1.ll =================================================================== --- test/CodeGen/X86/tlv-1.ll +++ test/CodeGen/X86/tlv-1.ll @@ -11,8 +11,8 @@ unreachable ; CHECK: movq _c@TLVP(%rip), %rdi ; CHECK-NEXT: callq *(%rdi) - ; CHECK-NEXT: movl $0, 56(%rax) - ; CHECK-NEXT: movq $0, 48(%rax) + ; CHECK-NEXT: movl %e[[REG:[a-d]x]], 56(%rax) + ; CHECK-NEXT: movq %r[[REG]], 48(%rax) } ; rdar://10291355