diff --git a/clang/docs/LanguageExtensions.rst b/clang/docs/LanguageExtensions.rst
--- a/clang/docs/LanguageExtensions.rst
+++ b/clang/docs/LanguageExtensions.rst
@@ -3218,6 +3218,26 @@
 Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
+Guaranteed inlined memset
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+.. code-block:: c
+
+  void __builtin_memset_inline(void *dst, int value, size_t size);
+
+
+``__builtin_memset_inline`` has been designed as a building block for efficient
+``memset`` implementations. It is identical to ``__builtin_memset`` but also
+guarantees not to call any external functions. See LLVM IR `llvm.memset.inline
+<https://llvm.org/docs/LangRef.html#llvm-memset-inline-intrinsic>`_ intrinsic
+for more information.
+
+This is useful to implement a custom version of ``memset``, to implement a
+``libc`` ``memset``, or to work around the absence of a ``libc``.
+
+Note that the ``size`` argument must be a compile-time constant.
+
+Note that this intrinsic cannot yet be called in a ``constexpr`` context.
 
 Atomic Min/Max builtins with memory ordering
 --------------------------------------------
diff --git a/clang/include/clang/Basic/Builtins.def b/clang/include/clang/Basic/Builtins.def
--- a/clang/include/clang/Basic/Builtins.def
+++ b/clang/include/clang/Basic/Builtins.def
@@ -559,6 +559,7 @@
 BUILTIN(__builtin_memmove, "v*v*vC*z", "nF")
 BUILTIN(__builtin_mempcpy, "v*v*vC*z", "nF")
 BUILTIN(__builtin_memset, "v*v*iz", "nF")
+BUILTIN(__builtin_memset_inline, "vv*iIz", "n")
 BUILTIN(__builtin_printf, "icC*.", "Fp:0:")
 BUILTIN(__builtin_stpcpy, "c*c*cC*", "nF")
 BUILTIN(__builtin_stpncpy, "c*c*cC*z", "nF")
diff --git a/clang/lib/CodeGen/CGBuilder.h b/clang/lib/CodeGen/CGBuilder.h
--- a/clang/lib/CodeGen/CGBuilder.h
+++ b/clang/lib/CodeGen/CGBuilder.h
@@ -344,6 +344,14 @@
                          Dest.getAlignment().getAsAlign(), IsVolatile);
   }
 
+  using CGBuilderBaseTy::CreateMemSetInline;
+  llvm::CallInst *CreateMemSetInline(Address Dest, llvm::Value *Value,
+                                     uint64_t Size) {
+    return CreateMemSetInline(Dest.getPointer(),
+                              Dest.getAlignment().getAsAlign(), Value,
+                              getInt64(Size));
+  }
+
   using CGBuilderBaseTy::CreatePreserveStructAccessIndex;
   Address CreatePreserveStructAccessIndex(Address Addr, unsigned Index,
                                           unsigned FieldIndex,
diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -3508,6 +3508,17 @@
     Builder.CreateMemSet(Dest, ByteVal, SizeVal, false);
     return RValue::get(Dest.getPointer());
   }
+  case Builtin::BI__builtin_memset_inline: {
+    Address Dest = EmitPointerWithAlignment(E->getArg(0));
+    Value *ByteVal =
+        Builder.CreateTrunc(EmitScalarExpr(E->getArg(1)), Builder.getInt8Ty());
+    uint64_t Size =
+        E->getArg(2)->EvaluateKnownConstInt(getContext()).getZExtValue();
+    EmitNonNullArgCheck(RValue::get(Dest.getPointer()), E->getArg(0)->getType(),
+                        E->getArg(0)->getExprLoc(), FD, 0);
+    Builder.CreateMemSetInline(Dest, ByteVal, Size);
+    return RValue::get(nullptr);
+  }
   case Builtin::BI__builtin___memset_chk: {
     // fold __builtin_memset_chk(x, y, cst1, cst2) to memset iff cst1<=cst2.
     Expr::EvalResult SizeResult, DstSizeResult;
diff --git a/clang/lib/Sema/SemaChecking.cpp b/clang/lib/Sema/SemaChecking.cpp
--- a/clang/lib/Sema/SemaChecking.cpp
+++ b/clang/lib/Sema/SemaChecking.cpp
@@ -2277,6 +2277,17 @@
     }
     break;
   }
+  case Builtin::BI__builtin_memset_inline: {
+    clang::Expr *SizeOp = TheCall->getArg(2);
+    // We warn about filling a null pointer when `size` is greater than 0.
+    // When `size` is value-dependent we cannot evaluate its value, so we
+    // bail out.
+    if (SizeOp->isValueDependent())
+      break;
+    if (!SizeOp->EvaluateKnownConstInt(Context).isZero())
+      CheckNonNullArgument(*this, TheCall->getArg(0), TheCall->getExprLoc());
+    break;
+  }
 #define BUILTIN(ID, TYPE, ATTRS)
 #define ATOMIC_BUILTIN(ID, TYPE, ATTRS) \
   case Builtin::BI##ID: \
diff --git a/clang/test/CodeGen/builtins-memset-inline.c b/clang/test/CodeGen/builtins-memset-inline.c
new file mode 100644
--- /dev/null
+++ b/clang/test/CodeGen/builtins-memset-inline.c
@@ -0,0 +1,21 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: x86-registered-target
+// RUN: %clang_cc1 -no-opaque-pointers -triple x86_64-unknown-linux -emit-llvm %s -o - | FileCheck %s
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_0(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_0(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 0, i1 false)
+  __builtin_memset_inline(dst, value, 0);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_1(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_1(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 1, i1 false)
+  __builtin_memset_inline(dst, value, 1);
+}
+
+// CHECK-LABEL: define{{.*}} void @test_memset_inline_4(i8* noundef %dst, i8 noundef signext %value)
+void test_memset_inline_4(void *dst, char value) {
+  // CHECK: call void @llvm.memset.inline.p0i8.i64(i8* align 1 %0, i8 %2, i64 4, i1 false)
+  __builtin_memset_inline(dst, value, 4);
+}
diff --git a/clang/test/Sema/builtins-memset-inline.cpp b/clang/test/Sema/builtins-memset-inline.cpp
new file mode 100644
--- /dev/null
+++ b/clang/test/Sema/builtins-memset-inline.cpp
@@ -0,0 +1,40 @@
+// RUN: %clang_cc1 -fsyntax-only -verify %s
+
+#define NULL ((char *)0)
+
+#if __has_builtin(__builtin_memset_inline)
+#warning defined as expected
+// expected-warning@-1 {{defined as expected}}
+#endif
+
+void test_memset_inline_invalid_arg_types() {
+  __builtin_memset_inline(1, 2, 3); // expected-error {{cannot initialize a parameter of type 'void *' with an rvalue of type 'int'}}
+}
+
+void test_memset_inline_null_dst(void *ptr) {
+  __builtin_memset_inline(NULL, 1, 4); // expected-warning {{null passed to a callee that requires a non-null argument}}
+}
+
+void test_memset_inline_null_buffer_is_ok_if_size_is_zero(void *ptr, char value) {
+  __builtin_memset_inline(NULL, value, /*size */ 0);
+}
+
+void test_memset_inline_non_constant_size(void *dst, char value, unsigned size) {
+  __builtin_memset_inline(dst, value, size); // expected-error {{argument to '__builtin_memset_inline' must be a constant integer}}
+}
+
+template <unsigned size>
+void test_memset_inline_template(void *dst, char value) {
+  // We do not try to evaluate size in non-instantiated templates.
+  __builtin_memset_inline(dst, value, size);
+}
+
+void test_memset_inline_implicit_conversion(void *ptr, char value) {
+  char a[5];
+  __builtin_memset_inline(a, value, 5);
+}
+
+void test_memset_inline_num_args(void *dst, char value) {
+  __builtin_memset_inline();                    // expected-error {{too few arguments to function call}}
+  __builtin_memset_inline(dst, value, 4, NULL); // expected-error {{too many arguments to function call}}
+}
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
--- a/llvm/docs/LangRef.rst
+++ b/llvm/docs/LangRef.rst
@@ -13867,6 +13867,71 @@
 If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
 behavior is undefined.
 
+.. _int_memset_inline:
+
+'``llvm.memset.inline``' Intrinsic
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+Syntax:
+"""""""
+
+This is an overloaded intrinsic. You can use ``llvm.memset.inline`` on any
+integer bit width and for different address spaces. Not all targets
+support all bit widths however.
+
+::
+
+      declare void @llvm.memset.inline.p0i8.i32(i8* <dest>, i8 <val>,
+                                                i32 <len>,
+                                                i1 <isvolatile>)
+      declare void @llvm.memset.inline.p0i8.i64(i8* <dest>, i8 <val>,
+                                                i64 <len>,
+                                                i1 <isvolatile>)
+
+Overview:
+"""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill a block of memory with a
+particular byte value and guarantee that no external functions are called.
+
+Note that, unlike the standard libc function, the ``llvm.memset.inline.*``
+intrinsics do not return a value, take an extra ``isvolatile`` argument, and
+allow the pointer to be in a specified address space.
+
+Arguments:
+""""""""""
+
+The first argument is a pointer to the destination to fill, the second
+is the byte value with which to fill it, the third argument is a constant
+integer specifying the number of bytes to fill, and the fourth
+is a boolean indicating a volatile access.
+
+The :ref:`align <attr_align>` parameter attribute can be provided
+for the first argument.
+
+If the ``isvolatile`` parameter is ``true``, the ``llvm.memset.inline`` call is
+a :ref:`volatile operation <volatile>`. The detailed access behavior is not
+very cleanly specified and it is unwise to depend on it.
+
+Semantics:
+""""""""""
+
+The '``llvm.memset.inline.*``' intrinsics fill "len" bytes of memory starting
+at the destination location. If the argument is known to be
+aligned to some boundary, this can be specified as an attribute on
+the argument.
+
+``len`` must be a constant expression.
+If ``<len>`` is 0, it is a no-op modulo the behavior of attributes attached to
+the arguments.
+If ``<len>`` is not a well-defined value, the behavior is undefined.
+If ``<len>`` is not zero, ``<dest>`` should be well-defined, otherwise the
+behavior is undefined.
+
+The behavior of '``llvm.memset.inline.*``' is equivalent to the behavior of
+'``llvm.memset.*``', but the generated code is guaranteed not to call any
+external functions.
+
 '``llvm.sqrt.*``' Intrinsic
 ^^^^^^^^^^^^^^^^^^^^^^^^^^^
diff --git a/llvm/include/llvm/CodeGen/SelectionDAG.h b/llvm/include/llvm/CodeGen/SelectionDAG.h
--- a/llvm/include/llvm/CodeGen/SelectionDAG.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAG.h
@@ -1052,7 +1052,8 @@
                     const AAMDNodes &AAInfo = AAMDNodes());
 
   SDValue getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst, SDValue Src,
-                    SDValue Size, Align Alignment, bool isVol, bool isTailCall,
+                    SDValue Size, Align Alignment, bool isVol,
+                    bool AlwaysInline, bool isTailCall,
                     MachinePointerInfo DstPtrInfo,
                     const AAMDNodes &AAInfo = AAMDNodes());
 
diff --git a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
--- a/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
+++ b/llvm/include/llvm/CodeGen/SelectionDAGTargetInfo.h
@@ -76,11 +76,13 @@
   /// that don't fit the target's parameters for simple stores and can be more
   /// efficient than using a library call. This function can return a null
   /// SDValue if the target declines to use custom code and a different
-  /// lowering strategy should be used.
+  /// lowering strategy should be used. Note that if AlwaysInline is true the
+  /// function has to return a valid SDValue.
   virtual SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                           SDValue Chain, SDValue Op1,
                                           SDValue Op2, SDValue Op3,
                                           Align Alignment, bool isVolatile,
+                                          bool AlwaysInline,
                                           MachinePointerInfo DstPtrInfo) const {
     return SDValue();
   }
diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -3539,6 +3539,7 @@
   /// Determines the optimal series of memory ops to replace the memset / memcpy.
   /// Return true if the number of memory ops is below the threshold (Limit).
+  /// Note that this is always the case when Limit is ~0.
   /// It returns the types of the sequence of memory ops to perform
   /// memset / memcpy by reference.
   virtual bool
diff --git a/llvm/include/llvm/IR/IRBuilder.h b/llvm/include/llvm/IR/IRBuilder.h
--- a/llvm/include/llvm/IR/IRBuilder.h
+++ b/llvm/include/llvm/IR/IRBuilder.h
@@ -578,6 +578,12 @@
                          MDNode *ScopeTag = nullptr,
                          MDNode *NoAliasTag = nullptr);
 
+  CallInst *CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, Value *Val,
+                               Value *Size, bool IsVolatile = false,
+                               MDNode *TBAATag = nullptr,
+                               MDNode *ScopeTag = nullptr,
+                               MDNode *NoAliasTag = nullptr);
+
   /// Create and insert an element unordered-atomic memset of the region of
   /// memory starting at the given pointer to the given value.
   ///
diff --git a/llvm/include/llvm/IR/IntrinsicInst.h b/llvm/include/llvm/IR/IntrinsicInst.h
--- a/llvm/include/llvm/IR/IntrinsicInst.h
+++ b/llvm/include/llvm/IR/IntrinsicInst.h
@@ -973,6 +973,7 @@
     case Intrinsic::memcpy:
     case Intrinsic::memmove:
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memcpy_inline:
       return true;
     default:
@@ -984,12 +985,33 @@
   }
 };
 
-/// This class wraps the llvm.memset intrinsic.
+/// This class wraps the llvm.memset and llvm.memset.inline intrinsics.
 class MemSetInst : public MemSetBase<MemIntrinsic> {
 public:
   // Methods for support type inquiry through isa, cast, and dyn_cast:
   static bool classof(const IntrinsicInst *I) {
-    return I->getIntrinsicID() == Intrinsic::memset;
+    switch (I->getIntrinsicID()) {
+    case Intrinsic::memset:
+    case Intrinsic::memset_inline:
+      return true;
+    default:
+      return false;
+    }
+  }
+  static bool classof(const Value *V) {
+    return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
+  }
+};
+
+/// This class wraps the llvm.memset.inline intrinsic.
+class MemSetInlineInst : public MemSetInst {
+public:
+  ConstantInt *getLength() const {
+    return cast<ConstantInt>(MemSetInst::getLength());
+  }
+  // Methods for support type inquiry through isa, cast, and dyn_cast:
+  static bool classof(const IntrinsicInst *I) {
+    return I->getIntrinsicID() == Intrinsic::memset_inline;
   }
   static bool classof(const Value *V) {
     return isa<IntrinsicInst>(V) && classof(cast<IntrinsicInst>(V));
@@ -1074,6 +1096,7 @@
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
   case Intrinsic::memset:
+  case Intrinsic::memset_inline:
   case Intrinsic::memcpy_element_unordered_atomic:
   case Intrinsic::memmove_element_unordered_atomic:
   case Intrinsic::memset_element_unordered_atomic:
@@ -1095,6 +1118,7 @@
   static bool classof(const IntrinsicInst *I) {
     switch (I->getIntrinsicID()) {
     case Intrinsic::memset:
+    case Intrinsic::memset_inline:
     case Intrinsic::memset_element_unordered_atomic:
       return true;
     default:
diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td
--- a/llvm/include/llvm/IR/Intrinsics.td
+++ b/llvm/include/llvm/IR/Intrinsics.td
@@ -651,6 +651,17 @@
       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>, ImmArg<ArgIndex<3>>]>;
 
+// Memset version that is guaranteed to be inlined.
+// In particular this means that the generated code is not allowed to call any
+// external function.
+// The third argument (specifying the size) must be a constant.
+def int_memset_inline
+    : Intrinsic<[],
+      [llvm_anyptr_ty, llvm_i8_ty, llvm_anyint_ty, llvm_i1_ty],
+      [IntrWriteMem, IntrArgMemOnly, IntrWillReturn, IntrNoFree,
+       NoCapture<ArgIndex<0>>, WriteOnly<ArgIndex<0>>,
+       ImmArg<ArgIndex<2>>, ImmArg<ArgIndex<3>>]>;
+
 // FIXME: Add version of these floating point intrinsics which allow non-default
 // rounding modes and FP exception handling.
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -335,6 +335,12 @@
                          MSI->getDestAlign(), nullptr, MemRef::Write);
     break;
   }
+  case Intrinsic::memset_inline: {
+    MemSetInlineInst *MSII = cast<MemSetInlineInst>(&I);
+    visitMemoryReference(I, MemoryLocation::getForDest(MSII),
+                         MSII->getDestAlign(), nullptr, MemRef::Write);
+    break;
+  }
 
   case Intrinsic::vastart:
     Check(I.getParent()->getParent()->isVarArg(),
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -6987,17 +6987,18 @@
 /// \param Size Number of bytes to write.
 /// \param Alignment Alignment of the destination in bytes.
 /// \param isVol True if destination is volatile.
+/// \param AlwaysInline Makes sure no function call is generated.
 /// \param DstPtrInfo IR information on the memory pointer.
 /// \returns New head in the control flow, if lowering was successful, empty
 /// SDValue otherwise.
 ///
 /// The function tries to replace 'llvm.memset' intrinsic with several store
 /// operations and value calculation code. This is usually profitable for small
-/// memory size.
+/// memory size or when the semantics require inlining.
 static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
                                SDValue Chain, SDValue Dst, SDValue Src,
                                uint64_t Size, Align Alignment, bool isVol,
-                               MachinePointerInfo DstPtrInfo,
+                               bool AlwaysInline, MachinePointerInfo DstPtrInfo,
                                const AAMDNodes &AAInfo) {
   // Turn a memset of undef to nop.
   // FIXME: We need to honor volatile even if Src is undef.
@@ -7017,8 +7018,10 @@
     DstAlignCanChange = true;
   bool IsZeroVal =
       isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isZero();
+  unsigned Limit = AlwaysInline ? ~0 : TLI.getMaxStoresPerMemset(OptSize);
+
   if (!TLI.findOptimalMemOpLowering(
-          MemOps, TLI.getMaxStoresPerMemset(OptSize),
+          MemOps, Limit,
           MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
           DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
     return SDValue();
@@ -7314,7 +7317,7 @@
 SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
                                 SDValue Src, SDValue Size, Align Alignment,
-                                bool isVol, bool isTailCall,
+                                bool isVol, bool AlwaysInline, bool isTailCall,
                                 MachinePointerInfo DstPtrInfo,
                                 const AAMDNodes &AAInfo) {
   // Check to see if we should lower the memset to stores first.
@@ -7327,7 +7330,7 @@
     SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
                                      ConstantSize->getZExtValue(), Alignment,
-                                     isVol, DstPtrInfo, AAInfo);
+                                     isVol, false, DstPtrInfo, AAInfo);
 
     if (Result.getNode())
       return Result;
@@ -7337,11 +7340,23 @@
   // code. If the target chooses to do this, this is the next best.
   if (TSI) {
     SDValue Result = TSI->EmitTargetCodeForMemset(
-        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo);
+        *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline, DstPtrInfo);
     if (Result.getNode())
       return Result;
   }
 
+  // If we really need inline code and the target declined to provide it,
+  // use a (potentially long) sequence of loads and stores.
+  if (AlwaysInline) {
+    assert(ConstantSize && "AlwaysInline requires a constant size!");
+    SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+                                     ConstantSize->getZExtValue(), Alignment,
+                                     isVol, true, DstPtrInfo, AAInfo);
+    assert(Result &&
+           "getMemsetStores must return a valid sequence when AlwaysInline");
+    return Result;
+  }
+
   checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
 
   // Emit a library call.
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -5909,10 +5909,28 @@
     bool isVol = MSI.isVolatile();
     bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
     SDValue Root = isVol ? getRoot() : getMemoryRoot();
-    SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
+    SDValue MS = DAG.getMemset(
+        Root, sdl, Op1, Op2, Op3, Alignment, isVol, /* AlwaysInline */ false,
+        isTC, MachinePointerInfo(I.getArgOperand(0)), I.getAAMetadata());
+    updateDAGForMaybeTailCall(MS);
+    return;
+  }
+  case Intrinsic::memset_inline: {
+    const auto &MSII = cast<MemSetInlineInst>(I);
+    SDValue Dst = getValue(I.getArgOperand(0));
+    SDValue Value = getValue(I.getArgOperand(1));
+    SDValue Size = getValue(I.getArgOperand(2));
+    assert(isa<ConstantSDNode>(Size) && "memset_inline needs constant size");
+    // @llvm.memset defines 0 and 1 to both mean no alignment.
+    Align DstAlign = MSII.getDestAlign().valueOrOne();
+    bool isVol = MSII.isVolatile();
+    bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+    SDValue Root = isVol ? getRoot() : getMemoryRoot();
+    SDValue MC = DAG.getMemset(Root, sdl, Dst, Value, Size, DstAlign, isVol,
+                               /* AlwaysInline */ true, isTC,
                                MachinePointerInfo(I.getArgOperand(0)),
                                I.getAAMetadata());
-    updateDAGForMaybeTailCall(MS);
+    updateDAGForMaybeTailCall(MC);
     return;
   }
   case Intrinsic::memmove: {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -196,7 +196,8 @@
 bool TargetLowering::findOptimalMemOpLowering(
     std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
-  if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
+  if (Limit != ~unsigned(0) && Op.isMemcpyWithFixedDstAlign() &&
+      Op.getSrcAlign() < Op.getDstAlign())
     return false;
 
   EVT VT = getOptimalMemOpType(Op, FuncAttributes);
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -164,6 +164,35 @@
   return CI;
 }
 
+CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign,
+                                            Value *Val, Value *Size,
+                                            bool IsVolatile, MDNode *TBAATag,
+                                            MDNode *ScopeTag,
+                                            MDNode *NoAliasTag) {
+  Dst = getCastedInt8PtrValue(Dst);
+  Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)};
+  Type *Tys[] = {Dst->getType(), Size->getType()};
+  Module *M = BB->getParent()->getParent();
+  Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memset_inline, Tys);
+
+  CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+  if (DstAlign)
+    cast<MemSetInlineInst>(CI)->setDestAlignment(*DstAlign);
+
+  // Set the TBAA info if present.
+  if (TBAATag)
+    CI->setMetadata(LLVMContext::MD_tbaa, TBAATag);
+
+  if (ScopeTag)
+    CI->setMetadata(LLVMContext::MD_alias_scope, ScopeTag);
+
+  if (NoAliasTag)
+    CI->setMetadata(LLVMContext::MD_noalias, NoAliasTag);
+
+  return CI;
+}
+
 CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
     Value *Ptr, Value *Val, Value *Size, Align Alignment, uint32_t ElementSize,
     MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) {
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -4917,7 +4917,8 @@
   case Intrinsic::memcpy:
   case Intrinsic::memcpy_inline:
   case Intrinsic::memmove:
-  case Intrinsic::memset: {
+  case Intrinsic::memset:
+  case Intrinsic::memset_inline: {
     const auto *MI = cast<MemIntrinsic>(&Call);
     auto IsValidAlignment = [&](unsigned Alignment) -> bool {
       return Alignment == 0 || isPowerOf2_32(Alignment);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -34,7 +34,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
   SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl,
                                    SDValue Chain,
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -91,7 +91,7 @@
 SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const AArch64Subtarget &STI =
       DAG.getMachineFunction().getSubtarget<AArch64Subtarget>();
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -55,6 +55,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool isVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -296,7 +296,7 @@
 SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   const ARMSubtarget &Subtarget =
@@ -314,6 +314,9 @@
                        DAG.getZExtOrTrunc(Size, dl, MVT::i32));
   }
 
-  return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
-                                Alignment.value(), RTLIB::MEMSET);
+  if (!AlwaysInline)
+    return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+                                  Alignment.value(), RTLIB::MEMSET);
+
+  return SDValue();
 }
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -1000,13 +1000,15 @@
     unsigned SrcAS, const AttributeList &FuncAttributes) const {
   const int MVCFastLen = 16;
 
-  // Don't expand Op into scalar loads/stores in these cases:
-  if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
-    return false; // Small memcpy: Use MVC
-  if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
-    return false; // Small memset (first byte with STC/MVI): Use MVC
-  if (Op.isZeroMemset())
-    return false; // Memset zero: Use XC
+  if (Limit != ~unsigned(0)) {
+    // Don't expand Op into scalar loads/stores in these cases:
+    if (Op.isMemcpy() && Op.allowOverlap() && Op.size() <= MVCFastLen)
+      return false; // Small memcpy: Use MVC
+    if (Op.isMemset() && Op.size() - 1 <= MVCFastLen)
+      return false; // Small memset (first byte with STC/MVI): Use MVC
+    if (Op.isZeroMemset())
+      return false; // Memset zero: Use XC
+  }
 
   return TargetLowering::findOptimalMemOpLowering(MemOps, Limit, Op, DstAS,
                                                   SrcAS, FuncAttributes);
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -31,7 +31,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Dst, SDValue Byte,
                                   SDValue Size, Align Alignment,
-                                  bool IsVolatile,
+                                  bool IsVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   std::pair<SDValue, SDValue>
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -88,7 +88,7 @@
 SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
     SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
-    MachinePointerInfo DstPtrInfo) const {
+    bool AlwaysInline, MachinePointerInfo DstPtrInfo) const {
   EVT PtrVT = Dst.getValueType();
 
   if (IsVolatile)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -37,6 +37,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
                                   SDValue Chain, SDValue Op1, SDValue Op2,
                                   SDValue Op3, Align Alignment, bool IsVolatile,
+                                  bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 };
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -44,7 +44,7 @@
 SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool IsVolatile,
+    SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
   if (!ST.hasBulkMemory())
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -29,7 +29,7 @@
   SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
                                   SDValue Chain, SDValue Dst, SDValue Src,
                                   SDValue Size, Align Alignment,
-                                  bool isVolatile,
+                                  bool isVolatile, bool AlwaysInline,
                                   MachinePointerInfo DstPtrInfo) const override;
 
   SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -46,7 +46,7 @@
 SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
     SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
-    SDValue Size, Align Alignment, bool isVolatile,
+    SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
     MachinePointerInfo DstPtrInfo) const {
   ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
   const X86Subtarget &Subtarget =
@@ -143,7 +143,8 @@
         DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
                     DAG.getConstant(Offset, dl, AddrVT)),
         Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
-        isVolatile, false, DstPtrInfo.getWithOffset(Offset));
+        isVolatile, AlwaysInline,
+        /* isTailCall */ false, DstPtrInfo.getWithOffset(Offset));
   }
 
   // TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
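For reference, here is a minimal standalone sketch of how the ``CreateMemSetInline`` helper added to ``IRBuilder`` above can be used to emit the new intrinsic from C++. It assumes this patch is applied; the function name ``fill16`` and the fixed 16-byte size are illustrative assumptions, not part of the patch.

.. code-block:: cpp

  // Sketch only: emit @llvm.memset.inline via the CreateMemSetInline helper
  // introduced in this patch. "fill16" is a hypothetical function name.
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("memset-inline-demo", Ctx);
    IRBuilder<> B(Ctx);

    // Build: define void @fill16(i8* %dst, i8 %value)
    auto *FnTy = FunctionType::get(
        B.getVoidTy(), {B.getInt8PtrTy(), B.getInt8Ty()}, /*isVarArg=*/false);
    Function *F =
        Function::Create(FnTy, Function::ExternalLinkage, "fill16", &M);
    B.SetInsertPoint(BasicBlock::Create(Ctx, "entry", F));

    // The size operand must be a constant integer; with a fixed 16 bytes the
    // backend is required to lower this to stores rather than a memset call.
    B.CreateMemSetInline(F->getArg(0), MaybeAlign(1), F->getArg(1),
                         B.getInt64(16), /*IsVolatile=*/false);
    B.CreateRetVoid();

    M.print(outs(), nullptr); // Shows a call to the llvm.memset.inline intrinsic.
    return 0;
  }

The frontend change in ``CGBuiltin.cpp`` above does essentially the same thing for ``__builtin_memset_inline``, after evaluating the size argument as a known constant.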
diff --git a/llvm/test/CodeGen/AArch64/memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/memset-inline.ll @@ -0,0 +1,296 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=-neon | FileCheck %s --check-prefixes=ALL,GPR +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu -mattr=neon | FileCheck %s --check-prefixes=ALL,NEON + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; ///////////////////////////////////////////////////////////////////////////// + +define void @memset_1(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_1: +; ALL: // %bb.0: +; ALL-NEXT: strb w1, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0) + ret void +} + +define void @memset_2(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_2: +; ALL: // %bb.0: +; ALL-NEXT: bfi w1, w1, #8, #24 +; ALL-NEXT: strh w1, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0) + ret void +} + +define void @memset_4(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_4: +; ALL: // %bb.0: +; ALL-NEXT: mov w8, #16843009 +; ALL-NEXT: and w9, w1, #0xff +; ALL-NEXT: mul w8, w9, w8 +; ALL-NEXT: str w8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0) + ret void +} + +define void @memset_8(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_8: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: str x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @memset_16(i8* %a, i8 %value) nounwind { +; ALL-LABEL: memset_16: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: stp x8, x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @memset_32(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_32: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: memset_32: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @memset_64(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_64: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #48] +; GPR-NEXT: stp x8, x8, [x0, #32] +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: memset_64: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0) + ret void +} + +; 
///////////////////////////////////////////////////////////////////////////// + +define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind { +; ALL-LABEL: aligned_memset_16: +; ALL: // %bb.0: +; ALL-NEXT: // kill: def $w1 killed $w1 def $x1 +; ALL-NEXT: mov x8, #72340172838076673 +; ALL-NEXT: and x9, x1, #0xff +; ALL-NEXT: mul x8, x9, x8 +; ALL-NEXT: stp x8, x8, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind { +; GPR-LABEL: aligned_memset_32: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_memset_32: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind { +; GPR-LABEL: aligned_memset_64: +; GPR: // %bb.0: +; GPR-NEXT: // kill: def $w1 killed $w1 def $x1 +; GPR-NEXT: mov x8, #72340172838076673 +; GPR-NEXT: and x9, x1, #0xff +; GPR-NEXT: mul x8, x9, x8 +; GPR-NEXT: stp x8, x8, [x0, #48] +; GPR-NEXT: stp x8, x8, [x0, #32] +; GPR-NEXT: stp x8, x8, [x0, #16] +; GPR-NEXT: stp x8, x8, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_memset_64: +; NEON: // %bb.0: +; NEON-NEXT: dup v0.16b, w1 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @bzero_1(i8* %a) nounwind { +; ALL-LABEL: bzero_1: +; ALL: // %bb.0: +; ALL-NEXT: strb wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0) + ret void +} + +define void @bzero_2(i8* %a) nounwind { +; ALL-LABEL: bzero_2: +; ALL: // %bb.0: +; ALL-NEXT: strh wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0) + ret void +} + +define void @bzero_4(i8* %a) nounwind { +; ALL-LABEL: bzero_4: +; ALL: // %bb.0: +; ALL-NEXT: str wzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0) + ret void +} + +define void @bzero_8(i8* %a) nounwind { +; ALL-LABEL: bzero_8: +; ALL: // %bb.0: +; ALL-NEXT: str xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0) + ret void +} + +define void @bzero_16(i8* %a) nounwind { +; ALL-LABEL: bzero_16: +; ALL: // %bb.0: +; ALL-NEXT: stp xzr, xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0) + ret void +} + +define void @bzero_32(i8* %a) nounwind { +; GPR-LABEL: bzero_32: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI15_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI15_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: bzero_32: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0) + ret void +} + +define void @bzero_64(i8* %a) nounwind { +; GPR-LABEL: bzero_64: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI16_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI16_0] +; 
GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: stp q0, q0, [x0, #32] +; GPR-NEXT: ret +; +; NEON-LABEL: bzero_64: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_bzero_16(i8* %a) nounwind { +; ALL-LABEL: aligned_bzero_16: +; ALL: // %bb.0: +; ALL-NEXT: stp xzr, xzr, [x0] +; ALL-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0) + ret void +} + +define void @aligned_bzero_32(i8* %a) nounwind { +; GPR-LABEL: aligned_bzero_32: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI18_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI18_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_bzero_32: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0) + ret void +} + +define void @aligned_bzero_64(i8* %a) nounwind { +; GPR-LABEL: aligned_bzero_64: +; GPR: // %bb.0: +; GPR-NEXT: adrp x8, .LCPI19_0 +; GPR-NEXT: ldr q0, [x8, :lo12:.LCPI19_0] +; GPR-NEXT: stp q0, q0, [x0] +; GPR-NEXT: stp q0, q0, [x0, #32] +; GPR-NEXT: ret +; +; NEON-LABEL: aligned_bzero_64: +; NEON: // %bb.0: +; NEON-NEXT: movi v0.2d, #0000000000000000 +; NEON-NEXT: stp q0, q0, [x0] +; NEON-NEXT: stp q0, q0, [x0, #32] +; NEON-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/memset-vs-memset-inline.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=aarch64-unknown-linux-gnu | FileCheck %s + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +define void @test1(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: test1: +; CHECK: // %bb.0: +; CHECK-NEXT: // kill: def $w1 killed $w1 def $x1 +; CHECK-NEXT: mov x8, #72340172838076673 +; CHECK-NEXT: and x9, x1, #0xff +; CHECK-NEXT: mul x8, x9, x8 +; CHECK-NEXT: str x8, [x0] +; CHECK-NEXT: ret + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: regular_memset_calls_external_function: +; CHECK: // %bb.0: +; CHECK-NEXT: mov w2, #1024 +; CHECK-NEXT: b memset + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} + +define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: inlined_set_doesnt_call_external_function: +; CHECK: // %bb.0: +; CHECK-NEXT: dup v0.16b, w1 +; CHECK-NEXT: stp q0, q0, [x0] +; CHECK-NEXT: stp q0, q0, [x0, #32] + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memset-inline.ll @@ -0,0 +1,548 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < 
%s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2,-sse4.2 | FileCheck %s --check-prefixes=GPR,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse4.2,-avx | FileCheck %s --check-prefixes=GPR,SSE4 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx,-avx512f | FileCheck %s --check-prefixes=GPR,AVX +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=avx512f | FileCheck %s --check-prefixes=GPR,AVX512 + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +; ///////////////////////////////////////////////////////////////////////////// + +define void @memset_1(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_1: +; GPR: # %bb.0: +; GPR-NEXT: movb %sil, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1, i1 0) + ret void +} + +define void @memset_2(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_2: +; GPR: # %bb.0: +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: shll $8, %esi +; GPR-NEXT: orl %esi, %eax +; GPR-NEXT: movw %ax, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 2, i1 0) + ret void +} + +define void @memset_4(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_4: +; GPR: # %bb.0: +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; GPR-NEXT: movl %eax, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 4, i1 0) + ret void +} + +define void @memset_8(i8* %a, i8 %value) nounwind { +; GPR-LABEL: memset_8: +; GPR: # %bb.0: +; GPR-NEXT: # kill: def $esi killed $esi def $rsi +; GPR-NEXT: movzbl %sil, %eax +; GPR-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; GPR-NEXT: imulq %rax, %rcx +; GPR-NEXT: movq %rcx, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @memset_16(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_16: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_16: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512-NEXT: vmovdqu %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @memset_32(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_32: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 24(%rdi) +; SSE2-NEXT: movq %rcx, 16(%rdi) +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_32: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; 
SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, 16(%rdi) +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_32: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqu %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqu %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqu %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @memset_64(i8* %a, i8 %value) nounwind { +; SSE2-LABEL: memset_64: +; SSE2: # %bb.0: +; SSE2-NEXT: # kill: def $esi killed $esi def $rsi +; SSE2-NEXT: movzbl %sil, %eax +; SSE2-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; SSE2-NEXT: imulq %rax, %rcx +; SSE2-NEXT: movq %rcx, 56(%rdi) +; SSE2-NEXT: movq %rcx, 48(%rdi) +; SSE2-NEXT: movq %rcx, 40(%rdi) +; SSE2-NEXT: movq %rcx, 32(%rdi) +; SSE2-NEXT: movq %rcx, 24(%rdi) +; SSE2-NEXT: movq %rcx, 16(%rdi) +; SSE2-NEXT: movq %rcx, 8(%rdi) +; SSE2-NEXT: movq %rcx, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: memset_64: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqu %xmm0, 48(%rdi) +; SSE4-NEXT: movdqu %xmm0, 32(%rdi) +; SSE4-NEXT: movdqu %xmm0, 16(%rdi) +; SSE4-NEXT: movdqu %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: memset_64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovups %ymm0, 32(%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: memset_64: +; AVX512: # %bb.0: +; AVX512-NEXT: movzbl %sil, %eax +; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqu64 %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_memset_16(i8* align 16 %a, i8 %value) nounwind { +; SSE2-LABEL: aligned_memset_16: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_16: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_16: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %xmm0 +; AVX512-NEXT: vmovdqa %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 %value, i64 16, i1 0) + ret void +} + +define void @aligned_memset_32(i8* align 32 %a, i8 %value) nounwind { 
+; SSE2-LABEL: aligned_memset_32: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_32: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, 16(%rdi) +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_32: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rdi) +; AVX-NEXT: vmovdqa %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vmovd %esi, %xmm0 +; AVX512-NEXT: vpbroadcastb %xmm0, %ymm0 +; AVX512-NEXT: vmovdqa %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 %value, i64 32, i1 0) + ret void +} + +define void @aligned_memset_64(i8* align 64 %a, i8 %value) nounwind { +; SSE2-LABEL: aligned_memset_64: +; SSE2: # %bb.0: +; SSE2-NEXT: movd %esi, %xmm0 +; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] +; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: movdqa %xmm0, 48(%rdi) +; SSE2-NEXT: movdqa %xmm0, 32(%rdi) +; SSE2-NEXT: movdqa %xmm0, 16(%rdi) +; SSE2-NEXT: movdqa %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_memset_64: +; SSE4: # %bb.0: +; SSE4-NEXT: movd %esi, %xmm0 +; SSE4-NEXT: pxor %xmm1, %xmm1 +; SSE4-NEXT: pshufb %xmm1, %xmm0 +; SSE4-NEXT: movdqa %xmm0, 48(%rdi) +; SSE4-NEXT: movdqa %xmm0, 32(%rdi) +; SSE4-NEXT: movdqa %xmm0, 16(%rdi) +; SSE4-NEXT: movdqa %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_memset_64: +; AVX: # %bb.0: +; AVX-NEXT: vmovd %esi, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpshufb %xmm1, %xmm0, %xmm0 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 +; AVX-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_memset_64: +; AVX512: # %bb.0: +; AVX512-NEXT: movzbl %sil, %eax +; AVX512-NEXT: imull $16843009, %eax, %eax # imm = 0x1010101 +; AVX512-NEXT: vpbroadcastd %eax, %zmm0 +; AVX512-NEXT: vmovdqa64 %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 %value, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @bzero_1(i8* %a) nounwind { +; GPR-LABEL: bzero_1: +; GPR: # %bb.0: +; GPR-NEXT: movb $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 1, i1 0) + ret void +} + +define void @bzero_2(i8* %a) nounwind { +; GPR-LABEL: bzero_2: +; GPR: # %bb.0: +; GPR-NEXT: movw $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 2, i1 0) + ret void +} + +define void @bzero_4(i8* %a) nounwind { +; GPR-LABEL: bzero_4: +; GPR: # %bb.0: +; GPR-NEXT: movl $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 4, i1 0) + ret void +} + +define void @bzero_8(i8* %a) nounwind { +; GPR-LABEL: bzero_8: +; GPR: # %bb.0: +; GPR-NEXT: 
movq $0, (%rdi) +; GPR-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 8, i1 0) + ret void +} + +define void @bzero_16(i8* %a) nounwind { +; SSE2-LABEL: bzero_16: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_16: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_16: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 16, i1 0) + ret void +} + +define void @bzero_32(i8* %a) nounwind { +; SSE2-LABEL: bzero_32: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 24(%rdi) +; SSE2-NEXT: movq $0, 16(%rdi) +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_32: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, 16(%rdi) +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_32: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 32, i1 0) + ret void +} + +define void @bzero_64(i8* %a) nounwind { +; SSE2-LABEL: bzero_64: +; SSE2: # %bb.0: +; SSE2-NEXT: movq $0, 56(%rdi) +; SSE2-NEXT: movq $0, 48(%rdi) +; SSE2-NEXT: movq $0, 40(%rdi) +; SSE2-NEXT: movq $0, 32(%rdi) +; SSE2-NEXT: movq $0, 24(%rdi) +; SSE2-NEXT: movq $0, 16(%rdi) +; SSE2-NEXT: movq $0, 8(%rdi) +; SSE2-NEXT: movq $0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: bzero_64: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movups %xmm0, 48(%rdi) +; SSE4-NEXT: movups %xmm0, 32(%rdi) +; SSE4-NEXT: movups %xmm0, 16(%rdi) +; SSE4-NEXT: movups %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: bzero_64: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovups %ymm0, 32(%rdi) +; AVX-NEXT: vmovups %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: bzero_64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovups %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 0, i64 64, i1 0) + ret void +} + +; ///////////////////////////////////////////////////////////////////////////// + +define void @aligned_bzero_16(i8* %a) nounwind { +; SSE2-LABEL: aligned_bzero_16: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_16: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_16: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %xmm0, (%rdi) +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_16: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %xmm0, (%rdi) +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 16 %a, i8 0, i64 16, i1 0) + ret void +} + +define void @aligned_bzero_32(i8* %a) 
nounwind { +; SSE2-LABEL: aligned_bzero_32: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 16(%rdi) +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_32: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, 16(%rdi) +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_32: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_32: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %ymm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 32 %a, i8 0, i64 32, i1 0) + ret void +} + +define void @aligned_bzero_64(i8* %a) nounwind { +; SSE2-LABEL: aligned_bzero_64: +; SSE2: # %bb.0: +; SSE2-NEXT: xorps %xmm0, %xmm0 +; SSE2-NEXT: movaps %xmm0, 48(%rdi) +; SSE2-NEXT: movaps %xmm0, 32(%rdi) +; SSE2-NEXT: movaps %xmm0, 16(%rdi) +; SSE2-NEXT: movaps %xmm0, (%rdi) +; SSE2-NEXT: retq +; +; SSE4-LABEL: aligned_bzero_64: +; SSE4: # %bb.0: +; SSE4-NEXT: xorps %xmm0, %xmm0 +; SSE4-NEXT: movaps %xmm0, 48(%rdi) +; SSE4-NEXT: movaps %xmm0, 32(%rdi) +; SSE4-NEXT: movaps %xmm0, 16(%rdi) +; SSE4-NEXT: movaps %xmm0, (%rdi) +; SSE4-NEXT: retq +; +; AVX-LABEL: aligned_bzero_64: +; AVX: # %bb.0: +; AVX-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX-NEXT: vmovaps %ymm0, 32(%rdi) +; AVX-NEXT: vmovaps %ymm0, (%rdi) +; AVX-NEXT: vzeroupper +; AVX-NEXT: retq +; +; AVX512-LABEL: aligned_bzero_64: +; AVX512: # %bb.0: +; AVX512-NEXT: vxorps %xmm0, %xmm0, %xmm0 +; AVX512-NEXT: vmovaps %zmm0, (%rdi) +; AVX512-NEXT: vzeroupper +; AVX512-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* align 64 %a, i8 0, i64 64, i1 0) + ret void +} diff --git a/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/X86/memset-vs-memset-inline.ll @@ -0,0 +1,40 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s + +declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i64(i8* nocapture, i8, i64, i1) nounwind + +define void @test1(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: test1: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: movzbl %sil, %eax +; CHECK-NEXT: movabsq $72340172838076673, %rcx # imm = 0x101010101010101 +; CHECK-NEXT: imulq %rax, %rcx +; CHECK-NEXT: movq %rcx, (%rdi) +; CHECK-NEXT: retq + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 8, i1 0) + ret void +} + +define void @regular_memset_calls_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: regular_memset_calls_external_function: +; CHECK: # %bb.0: +; CHECK-NEXT: movl $1024, %edx # imm = 0x400 +; CHECK-NEXT: jmp memset@PLT # TAILCALL + tail call void @llvm.memset.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} + +define void @inlined_set_doesnt_call_external_function(i8* %a, i8 %value) nounwind { +; CHECK-LABEL: inlined_set_doesnt_call_external_function: +; CHECK: # %bb.0: +; CHECK-NEXT: # kill: def $esi killed $esi def $rsi +; CHECK-NEXT: movzbl %sil, %ecx +; CHECK-NEXT: movabsq $72340172838076673, %rax # imm = 0x101010101010101 +; CHECK-NEXT: imulq %rcx, %rax +; CHECK-NEXT: movq %rax, 1016(%rdi) 
+; CHECK-NEXT: movq %rax, 1008(%rdi) + tail call void @llvm.memset.inline.p0i8.i64(i8* %a, i8 %value, i64 1024, i1 0) + ret void +} diff --git a/llvm/test/Other/lint.ll b/llvm/test/Other/lint.ll --- a/llvm/test/Other/lint.ll +++ b/llvm/test/Other/lint.ll @@ -6,6 +6,8 @@ declare void @llvm.stackrestore(i8*) declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind declare void @llvm.memcpy.inline.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i1) nounwind +declare void @llvm.memset.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind +declare void @llvm.memset.inline.p0i8.i8.i64(i8* nocapture, i8, i64, i1) nounwind declare void @has_sret(i8* sret(i8) %p) declare void @has_noaliases(i32* noalias %p, i32* %q) declare void @one_arg(i32) @@ -87,6 +89,11 @@ ; CHECK: Unusual: noalias argument aliases another argument call void @llvm.memcpy.p0i8.p0i8.i64(i8* bitcast (i32* @CG to i8*), i8* bitcast (i32* @CG to i8*), i64 1, i1 0) +; CHECK: Write to read-only memory +call void @llvm.memset.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0) +; CHECK: Write to read-only memory +call void @llvm.memset.inline.p0i8.i8.i64(i8* bitcast (i32* @CG to i8*), i8 1, i64 1, i1 0) + ; CHECK: Undefined behavior: Buffer overflow %wider = bitcast i8* %buf to i16* store i16 0, i16* %wider diff --git a/llvm/test/Verifier/intrinsic-immarg.ll b/llvm/test/Verifier/intrinsic-immarg.ll --- a/llvm/test/Verifier/intrinsic-immarg.ll +++ b/llvm/test/Verifier/intrinsic-immarg.ll @@ -62,6 +62,23 @@ ret void } +declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) +define void @memset_inline_is_volatile(i8* %dest, i8 %value, i1 %is.volatile) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i1 %is.volatile + ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile) + call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 8, i1 %is.volatile) + ret void +} + +define void @memset_inline_variable_size(i8* %dest, i8 %value, i32 %size) { + ; CHECK: immarg operand has non-immediate parameter + ; CHECK-NEXT: i32 %size + ; CHECK-NEXT: call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true) + call void @llvm.memset.inline.p0i8.i32(i8* %dest, i8 %value, i32 %size, i1 true) + ret void +} + declare i64 @llvm.objectsize.i64.p0i8(i8*, i1, i1, i1) define void @objectsize(i8* %ptr, i1 %a, i1 %b, i1 %c) { diff --git a/llvm/test/Verifier/memset-inline.ll b/llvm/test/Verifier/memset-inline.ll new file mode 100644 --- /dev/null +++ b/llvm/test/Verifier/memset-inline.ll @@ -0,0 +1,9 @@ +; RUN: not opt -verify < %s 2>&1 | FileCheck %s + +; CHECK: alignment is not a power of two + +define void @foo(i8* %P, i8 %value) { + call void @llvm.memset.inline.p0i8.i32(i8* align 3 %P, i8 %value, i32 4, i1 false) + ret void +} +declare void @llvm.memset.inline.p0i8.i32(i8* nocapture, i8, i32, i1) nounwind
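As a usage note for the clang builtin documented at the top of this patch: because the ``size`` argument must be a compile-time constant, a runtime size has to be dispatched onto constant cases. The sketch below shows one way a freestanding environment might build a small, call-free fill helper on top of ``__builtin_memset_inline``; the function name ``fill_small`` and the dispatch scheme are illustrative assumptions, not part of the patch.

.. code-block:: cpp

  // Sketch only: a tiny fill helper for a freestanding build (-ffreestanding
  // or -fno-builtin assumed, so the fallback loop is not turned back into a
  // memset libcall). "fill_small" is a hypothetical name.
  #include <stddef.h>

  void fill_small(void *dst, int value, size_t size) {
    switch (size) {
    case 1:  __builtin_memset_inline(dst, value, 1);  return;
    case 2:  __builtin_memset_inline(dst, value, 2);  return;
    case 4:  __builtin_memset_inline(dst, value, 4);  return;
    case 8:  __builtin_memset_inline(dst, value, 8);  return;
    case 16: __builtin_memset_inline(dst, value, 16); return;
    default:
      // Plain byte loop for any other size.
      for (size_t i = 0; i != size; ++i)
        ((char *)dst)[i] = (char)value;
      return;
    }
  }

Each ``case`` passes a constant size, so every call satisfies the constraint checked in ``SemaChecking.cpp`` and is lowered to inline stores by the backends changed above.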