diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1170,6 +1170,72 @@
   return false;
 }
 
+// Combine patterns like:
+//   %0 = load <4 x i32>, <4 x i32>* %a
+//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+//   store <4 x i32> %1, <4 x i32>* %a
+// into:
+//   %0 = getelementptr <4 x i32>, <4 x i32>* %a, i64 0, i64 1
+//   store i32 %b, i32* %0
+//
+// This is only legal when the load reads exactly the memory the store
+// overwrites, the element index is known to be in bounds, and nothing
+// between the load and the store may modify that memory.
+static Instruction *foldSingleElementStore(InstCombiner &IC, StoreInst &SI) {
+  // Do not touch volatile/atomic stores.
+  if (!SI.isSimple())
+    return nullptr;
+
+  Instruction *Source;
+  Value *NewElement;
+  ConstantInt *Idx;
+  if (!match(SI.getValueOperand(),
+             m_InsertElement(m_Instruction(Source), m_Value(NewElement),
+                             m_ConstantInt(Idx))))
+    return nullptr;
+
+  auto *Load = dyn_cast<LoadInst>(Source);
+  if (!Load)
+    return nullptr;
+
+  // Only fold a simple (non-volatile/atomic) load that reads the same
+  // address the store writes, in the same block, so we can cheaply prove
+  // that nothing clobbers the memory in between.
+  if (!Load->isSimple() || Load->getParent() != SI.getParent() ||
+      Load->getPointerOperand()->stripPointerCasts() !=
+          SI.getPointerOperand()->stripPointerCasts())
+    return nullptr;
+
+  // The element index must be a known in-bounds constant; otherwise the
+  // insertelement (and hence the narrow store) would be poison.
+  auto *VecTy = dyn_cast<FixedVectorType>(Load->getType());
+  if (!VecTy || Idx->getValue().uge(VecTy->getNumElements()))
+    return nullptr;
+
+  // Bail out if anything between the load and the store may write memory;
+  // the fold would otherwise drop that intervening update.
+  for (BasicBlock::iterator I = std::next(Load->getIterator()),
+                            E = SI.getIterator();
+       I != E; ++I)
+    if (I->mayWriteToMemory())
+      return nullptr;
+
+  auto *ElePtrType = NewElement->getType()->getPointerTo();
+  auto *ElePtr =
+      IC.Builder.CreatePointerCast(SI.getPointerOperand(), ElePtrType);
+  auto *GEP = IC.Builder.CreateGEP(ElePtr, Idx);
+  SI.setOperand(0, NewElement);
+  SI.setOperand(1, GEP);
+
+  // The scalar store lands at byte offset Idx * EltSize from the original
+  // pointer, so its alignment may be smaller than the vector store's.
+  const DataLayout &DL = SI.getModule()->getDataLayout();
+  uint64_t EltSize = DL.getTypeStoreSize(NewElement->getType());
+  SI.setAlignment(
+      commonAlignment(SI.getAlign(), EltSize * Idx->getZExtValue()));
+  return &SI;
+}
+
 static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
   // FIXME: We could probably with some care handle both volatile and atomic
   // stores here but it isn't clear that this is important.
@@ -1394,6 +1460,9 @@
   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
   if (!SI.isUnordered())
     return nullptr;
 
+  if (Instruction *NewSI = foldSingleElementStore(*this, SI))
+    return NewSI;
+
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
diff --git a/llvm/test/Transforms/InstCombine/single-element-store.ll b/llvm/test/Transforms/InstCombine/single-element-store.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/Transforms/InstCombine/single-element-store.ll
@@ -0,0 +1,47 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -instcombine -S | FileCheck %s
+
+define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
+; CHECK-LABEL: @insert_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <16 x i8>, <16 x i8>* [[Q:%.*]], i64 0, i64 3
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
+  store <16 x i8> %vecins, <16 x i8>* %q
+  ret void
+}
+
+
+define void @single_shuffle_store(<4 x i32>* %a, i32 %b) {
+; CHECK-LABEL: @single_shuffle_store(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr <4 x i32>, <4 x i32>* [[A:%.*]], i64 0, i64 1
+; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP0]], align 4
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i32>, <4 x i32>* %a
+  %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+  %2 = shufflevector <4 x i32> %0, <4 x i32> %1, <4 x i32> <i32 0, i32 5, i32 2, i32 3>
+  store <4 x i32> %2, <4 x i32>* %a
+  ret void
+}
+
+define void @volatile_update(<16 x i8>* %q, i8 zeroext %s) {
+; CHECK-LABEL: @volatile_update(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store volatile <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, <16 x i8>* %q
+  %vecins = insertelement <16 x i8> %0, i8 %s, i32 3
+  store volatile <16 x i8> %vecins, <16 x i8>* %q
+  ret void
+}