Index: llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -34,6 +34,12 @@
   SelectionDAGTargetInfo(const SelectionDAGTargetInfo &) = delete;
   SelectionDAGTargetInfo &operator=(const SelectionDAGTargetInfo &) = delete;
   virtual ~SelectionDAGTargetInfo();
+  /// Return true if target-specific code for memset will be better than the
+  /// generic approach.
+  virtual bool shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                             SDValue Size) const {
+    return false;
+  }
 
   /// Emit target-specific code that performs a memcpy.
   /// This can be used by targets to provide code sequences for cases
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7043,7 +7043,8 @@
   // Check to see if we should lower the memset to stores first.
   // For cases within the target-specified limits, this is the best choice.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  if (ConstantSize) {
+  if (ConstantSize &&
+      !(TSI && TSI->shouldEmitTargetCodeForMemset(*this, Size))) {
     // Memset with size zero? Just return the original chain.
     if (ConstantSize->isZero())
       return Chain;
Index: llvm/lib/Target/X86/X86SelectionDAGInfo.h
===================================================================
--- llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -26,6 +26,9 @@
 public:
   explicit X86SelectionDAGInfo() = default;
 
+  bool shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                     SDValue Size) const override;
+
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
Index: llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -28,6 +28,21 @@
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+bool X86SelectionDAGInfo::shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                                        SDValue Size) const {
+  auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  if (!ConstantSize)
+    return false;
+
+  // Memsets of up to 8 bytes can be replaced by simpler constructions such as
+  // andq $0, (%rdi), so leave them to the generic lowering.
+  if (ConstantSize->getZExtValue() <= 8)
+    return false;
+
+  return DAG.getMachineFunction().getFunction().hasMinSize();
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -63,11 +78,15 @@
   if (DstPtrInfo.getAddrSpace() >= 256)
     return SDValue();
 
+  bool HasMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
+
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+  if (!ConstantSize ||
+      ((Alignment < Align(4) ||
+        ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) &&
+       !HasMinSize)) {
     // Check to see if there is a specialized entry-point for memory zeroing.
     ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
 
@@ -104,35 +123,37 @@
   uint64_t SizeVal = ConstantSize->getZExtValue();
   SDValue InFlag;
-  EVT AVT;
+  EVT AVT = MVT::i8;
   SDValue Count;
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
   unsigned BytesLeft = 0;
   if (ValC) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
+    unsigned ValReg = X86::AL;
+    uint64_t MemsetVal = ValC->getZExtValue() & 255;
 
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment > Align(2)) {
+    if (HasMinSize && SizeVal % 2 != 0) {
+      // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+    } else if (Alignment > Align(2)) { // If the value is a constant, then we
+                                       // can potentially use larger sets.
       // DWORD aligned
       AVT = MVT::i32;
       ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
+      MemsetVal = (MemsetVal << 8) | MemsetVal;
+      MemsetVal = (MemsetVal << 16) | MemsetVal;
       if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
         AVT = MVT::i64;
         ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
+        MemsetVal = (MemsetVal << 32) | MemsetVal;
       }
     } else if (Alignment == Align(2)) {
       // WORD aligned
       AVT = MVT::i16;
       ValReg = X86::AX;
-      Val = (Val << 8) | Val;
+      MemsetVal = (MemsetVal << 8) | MemsetVal;
     } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
       Count = DAG.getIntPtrConstant(SizeVal, dl);
     }
@@ -142,8 +163,8 @@
       BytesLeft = SizeVal % UBytes;
     }
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InFlag);
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg,
+                             DAG.getConstant(MemsetVal, dl, AVT), InFlag);
     InFlag = Chain.getValue(1);
   } else {
     AVT = MVT::i8;
Index: llvm/test/CodeGen/X86/memset-minsize.ll
===================================================================
--- llvm/test/CodeGen/X86/memset-minsize.ll
+++ llvm/test/CodeGen/X86/memset-minsize.ll
@@ -29,11 +29,9 @@
 define void @medium_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: medium_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $512, %edx # imm = 0x200
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $128, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -44,11 +42,9 @@
 define void @large_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: large_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $4096, %edx # imm = 0x1000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $1024, %ecx # imm = 0x400
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -59,11 +55,9 @@
 define void @huge_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: huge_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $8192, %edx # imm = 0x2000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $2048, %ecx # imm = 0x800
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -74,11 +68,9 @@
 define void @odd_length_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: odd_length_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $255, %edx
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $255, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -89,11 +81,9 @@
 define void @align_1_memset_to_rep_stos(i8* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_1_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $256, %ecx # imm = 0x100
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0i8.i32(i8* align 1 %ptr, i8 0, i32 256, i1 false)
@@ -103,11 +93,9 @@
 define void @align_2_memset_to_rep_stos(i16* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_2_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $128, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosw %ax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i16* %ptr to i8*
@@ -118,11 +106,10 @@
 define void @align_4_memset_to_rep_stos(i16* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_4_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    pushq $64
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i16* %ptr to i8*
@@ -133,11 +120,10 @@
 define void @align_8_memset_to_rep_stos(i64* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_8_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    pushq $64
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i64* %ptr to i8*