Index: llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
===================================================================
--- llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -34,6 +34,12 @@
   SelectionDAGTargetInfo(const SelectionDAGTargetInfo &) = delete;
   SelectionDAGTargetInfo &operator=(const SelectionDAGTargetInfo &) = delete;
   virtual ~SelectionDAGTargetInfo();
+  /// Return true if target-specific code for memset will be better than the
+  /// generic approach.
+  virtual bool shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                             SDValue Size) const {
+    return false;
+  }
 
   /// Emit target-specific code that performs a memcpy.
   /// This can be used by targets to provide code sequences for cases
Index: llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
===================================================================
--- llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -7043,7 +7043,8 @@
   // Check to see if we should lower the memset to stores first.
   // For cases within the target-specified limits, this is the best choice.
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
-  if (ConstantSize) {
+  if (ConstantSize &&
+      !(TSI && TSI->shouldEmitTargetCodeForMemset(*this, Size))) {
     // Memset with size zero? Just return the original chain.
     if (ConstantSize->isZero())
       return Chain;
Index: llvm/lib/Target/X86/X86SelectionDAGInfo.h
===================================================================
--- llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -26,6 +26,9 @@
 public:
   explicit X86SelectionDAGInfo() = default;
 
+  bool shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                     SDValue Size) const override;
+
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
Index: llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
===================================================================
--- llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -28,6 +28,21 @@
     UseFSRMForMemcpy("x86-use-fsrm-for-memcpy", cl::Hidden, cl::init(false),
                      cl::desc("Use fast short rep mov in memcpy lowering"));
 
+bool X86SelectionDAGInfo::shouldEmitTargetCodeForMemset(SelectionDAG &DAG,
+                                                        SDValue Size) const {
+  auto *ConstantSize = dyn_cast<ConstantSDNode>(Size);
+
+  if (!ConstantSize)
+    return false;
+
+  // Memsets of up to 8 bytes can be replaced by simpler constructions such as
+  // andq $0, (%rdi), so leave them to the generic lowering.
+  if (ConstantSize->getZExtValue() <= 8)
+    return false;
+
+  return DAG.getMachineFunction().getFunction().hasMinSize();
+}
+
 bool X86SelectionDAGInfo::isBaseRegConflictPossible(
     SelectionDAG &DAG, ArrayRef<MCPhysReg> ClobberSet) const {
   // We cannot use TRI->hasBasePointer() until *after* we select all basic
@@ -63,11 +78,15 @@
   if (DstPtrInfo.getAddrSpace() >= 256)
     return SDValue();
 
+  bool HasMinSize = DAG.getMachineFunction().getFunction().hasMinSize();
+
   // If not DWORD aligned or size is more than the threshold, call the library.
   // The libc version is likely to be faster for these cases. It can use the
   // address value and run time information about the CPU.
-  if (Alignment < Align(4) || !ConstantSize ||
-      ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
+  if (!ConstantSize ||
+      ((Alignment < Align(4) ||
+        ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) &&
+       !HasMinSize)) {
     // Check to see if there is a specialized entry-point for memory zeroing.
     ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
 
@@ -104,35 +123,37 @@
   uint64_t SizeVal = ConstantSize->getZExtValue();
   SDValue InFlag;
-  EVT AVT;
+  EVT AVT = MVT::i8;
   SDValue Count;
   ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
   unsigned BytesLeft = 0;
   if (ValC) {
-    unsigned ValReg;
-    uint64_t Val = ValC->getZExtValue() & 255;
+    unsigned ValReg = X86::AL;
+    uint64_t MemsetVal = ValC->getZExtValue() & 255;
 
-    // If the value is a constant, then we can potentially use larger sets.
-    if (Alignment > Align(2)) {
+    if (HasMinSize && SizeVal % 2 != 0) {
+      // Byte aligned
+      AVT = MVT::i8;
+      ValReg = X86::AL;
+      Count = DAG.getIntPtrConstant(SizeVal, dl);
+    } else if (Alignment > Align(2)) { // If the value is a constant, then we
+                                       // can potentially use larger sets.
       // DWORD aligned
       AVT = MVT::i32;
       ValReg = X86::EAX;
-      Val = (Val << 8) | Val;
-      Val = (Val << 16) | Val;
+      MemsetVal = (MemsetVal << 8) | MemsetVal;
+      MemsetVal = (MemsetVal << 16) | MemsetVal;
       if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
         AVT = MVT::i64;
         ValReg = X86::RAX;
-        Val = (Val << 32) | Val;
+        MemsetVal = (MemsetVal << 32) | MemsetVal;
       }
     } else if (Alignment == Align(2)) {
       // WORD aligned
       AVT = MVT::i16;
       ValReg = X86::AX;
-      Val = (Val << 8) | Val;
+      MemsetVal = (MemsetVal << 8) | MemsetVal;
     } else {
-      // Byte aligned
-      AVT = MVT::i8;
-      ValReg = X86::AL;
       Count = DAG.getIntPtrConstant(SizeVal, dl);
     }
@@ -142,8 +163,8 @@
       BytesLeft = SizeVal % UBytes;
     }
 
-    Chain = DAG.getCopyToReg(Chain, dl, ValReg, DAG.getConstant(Val, dl, AVT),
-                             InFlag);
+    Chain = DAG.getCopyToReg(Chain, dl, ValReg,
+                             DAG.getConstant(MemsetVal, dl, AVT), InFlag);
     InFlag = Chain.getValue(1);
   } else {
     AVT = MVT::i8;
Index: llvm/test/CodeGen/X86/memset-minsize.ll
===================================================================
--- llvm/test/CodeGen/X86/memset-minsize.ll
+++ llvm/test/CodeGen/X86/memset-minsize.ll
@@ -29,11 +29,9 @@
 define void @medium_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: medium_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $512, %edx # imm = 0x200
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $128, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -44,11 +42,9 @@
 define void @large_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: large_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $4096, %edx # imm = 0x1000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $1024, %ecx # imm = 0x400
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -59,11 +55,9 @@
 define void @huge_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: huge_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $8192, %edx # imm = 0x2000
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $2048, %ecx # imm = 0x800
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -74,11 +68,9 @@
 define void @odd_length_memset_to_rep_stos(i32* %ptr) minsize nounwind {
 ; CHECK-LABEL: odd_length_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $255, %edx
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $255, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i32* %ptr to i8*
@@ -89,11 +81,9 @@
 define void @align_1_memset_to_rep_stos(i8* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_1_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $256, %ecx # imm = 0x100
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosb %al, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   call void @llvm.memset.p0i8.i32(i8* align 1 %ptr, i8 0, i32 256, i1 false)
@@ -103,11 +93,9 @@
 define void @align_2_memset_to_rep_stos(i16* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_2_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    movl $128, %ecx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosw %ax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i16* %ptr to i8*
@@ -118,11 +106,10 @@
 define void @align_4_memset_to_rep_stos(i16* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_4_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    pushq $64
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i16* %ptr to i8*
@@ -133,11 +120,10 @@
 define void @align_8_memset_to_rep_stos(i64* %ptr) minsize nounwind {
 ; CHECK-LABEL: align_8_memset_to_rep_stos:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    pushq %rax
-; CHECK-NEXT:    movl $256, %edx # imm = 0x100
-; CHECK-NEXT:    xorl %esi, %esi
-; CHECK-NEXT:    callq memset@PLT
-; CHECK-NEXT:    popq %rax
+; CHECK-NEXT:    pushq $64
+; CHECK-NEXT:    popq %rcx
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    rep;stosl %eax, %es:(%rdi)
 ; CHECK-NEXT:    retq
 entry:
   %0 = bitcast i64* %ptr to i8*