diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -1176,6 +1176,59 @@
   return false;
 }
 
+// Check if the memory location is modified between two instrs in the same BB.
+static bool isMemModifiedBetween(BasicBlock::iterator Begin,
+                                 BasicBlock::iterator End,
+                                 const MemoryLocation &Loc, AliasAnalysis *AA) {
+  for (BasicBlock::iterator BBI = Begin; BBI != End; ++BBI)
+    if (isModSet(AA->getModRefInfo(&*BBI, Loc)))
+      return true;
+  return false;
+}
+
+// Combine patterns like:
+//   %0 = load <4 x i32>, <4 x i32>* %a
+//   %1 = insertelement <4 x i32> %0, i32 %b, i32 1
+//   store <4 x i32> %1, <4 x i32>* %a
+// to:
+//   %0 = getelementptr inbounds <4 x i32>, <4 x i32>* %a, i64 0, i64 1
+//   store i32 %b, i32* %0
+static Instruction *foldSingleElementStore(InstCombiner &IC, StoreInst &SI,
+                                           AliasAnalysis *AA) {
+  if (!SI.isSimple() || !SI.getValueOperand()->getType()->isVectorTy())
+    return nullptr;
+
+  Instruction *Source;
+  Value *NewElement;
+  Constant *Idx;
+  if (!match(SI.getValueOperand(),
+             m_InsertElement(m_Instruction(Source), m_Value(NewElement),
+                             m_Constant(Idx))))
+    return nullptr;
+
+  if (auto *Load = dyn_cast<LoadInst>(Source)) {
+    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
+
+    // Don't optimize for atomic/volatile loads or stores.
+    if (!Load->isSimple() || Load->getParent() != SI.getParent() ||
+        SrcAddr != SI.getPointerOperand()->stripPointerCasts() ||
+        isMemModifiedBetween(Load->getIterator(), SI.getIterator(),
+                             MemoryLocation::get(&SI), AA))
+      return nullptr;
+
+    Type *ElePtrType = NewElement->getType()->getPointerTo();
+    Value *ElePtr =
+        IC.Builder.CreatePointerCast(SI.getPointerOperand(), ElePtrType);
+    Value *GEP =
+        IC.Builder.CreateInBoundsGEP(NewElement->getType(), ElePtr, Idx);
+    StoreInst *NSI = new StoreInst(NewElement, GEP);
+    NSI->copyMetadata(SI, {LLVMContext::MD_nontemporal});
+    return NSI;
+  }
+
+  return nullptr;
+}
+
 static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
   // FIXME: We could probably with some care handle both volatile and atomic
   // stores here but it isn't clear that this is important.
@@ -1400,6 +1453,9 @@
   // FIXME: Some bits are legal for ordered atomic stores; needs refactoring.
   if (!SI.isUnordered())
     return nullptr;
+  if (Instruction *NewSI = foldSingleElementStore(*this, SI, AA))
+    return NewSI;
+
   // If the RHS is an alloca with a single use, zapify the store, making the
   // alloca dead.
   if (Ptr->hasOneUse()) {
diff --git a/llvm/test/Transforms/InstCombine/load-insert-store.ll b/llvm/test/Transforms/InstCombine/load-insert-store.ll
--- a/llvm/test/Transforms/InstCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/InstCombine/load-insert-store.ll
@@ -4,9 +4,8 @@
 define void @insert_store(<16 x i8>* %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, <16 x i8>* [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VECINS:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 3
-; CHECK-NEXT:    store <16 x i8> [[VECINS]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q:%.*]], i64 0, i64 3
+; CHECK-NEXT:    store i8 [[S:%.*]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -19,9 +18,8 @@
 define void @single_shuffle_store(<4 x i32>* %a, i32 %b) {
 ; CHECK-LABEL: @single_shuffle_store(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, <4 x i32>* [[A:%.*]], align 16
-; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[B:%.*]], i32 1
-; CHECK-NEXT:    store <4 x i32> [[TMP1]], <4 x i32>* [[A]], align 16, !nontemporal !0
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, <4 x i32>* [[A:%.*]], i64 0, i64 1
+; CHECK-NEXT:    store i32 [[B:%.*]], i32* [[TMP0]], align 4, !nontemporal !0
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -32,6 +30,7 @@
   ret void
 }
 
+; Should not transform volatile or atomic loads/stores.
 define void @volatile_update(<16 x i8>* %q, <16 x i8>* %p, i8 zeroext %s) {
 ; CHECK-LABEL: @volatile_update(
 ; CHECK-NEXT:  entry:
@@ -69,6 +68,9 @@
   ret void
 }
 
+; We can't transform if any instruction in between could modify memory.
+; Here p and q may alias, so we can't remove the load from q.
+; r is noalias, so it cannot alias p or q and the transform is safe.
 define void @insert_store_mem_modify(<16 x i8>* %p, <16 x i8>* %q, <16 x i8>* noalias %r, i8 %s) {
 ; CHECK-LABEL: @insert_store_mem_modify(
 ; CHECK-NEXT:  entry:
@@ -76,10 +78,9 @@
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[Q:%.*]], align 16
 ; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
 ; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
-; CHECK-NEXT:    [[LD2:%.*]] = load <16 x i8>, <16 x i8>* [[Q]], align 16
 ; CHECK-NEXT:    store <16 x i8> zeroinitializer, <16 x i8>* [[R:%.*]], align 16
-; CHECK-NEXT:    [[INS2:%.*]] = insertelement <16 x i8> [[LD2]], i8 [[S]], i32 7
-; CHECK-NEXT:    store <16 x i8> [[INS2]], <16 x i8>* [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[Q]], i64 0, i64 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -95,4 +96,35 @@
   ret void
 }
 
+; Check cases where calls may modify memory.
+define void @insert_store_with_call(<16 x i8>* %p, <16 x i8>* %q, i8 %s) {
+; CHECK-LABEL: @insert_store_with_call(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[LD:%.*]] = load <16 x i8>, <16 x i8>* [[P:%.*]], align 16
+; CHECK-NEXT:    call void @maywrite(<16 x i8>* nonnull [[P]])
+; CHECK-NEXT:    [[INS:%.*]] = insertelement <16 x i8> [[LD]], i8 [[S:%.*]], i32 3
+; CHECK-NEXT:    store <16 x i8> [[INS]], <16 x i8>* [[P]], align 16
+; CHECK-NEXT:    call void @foo()
+; CHECK-NEXT:    call void @nowrite(<16 x i8>* nonnull [[P]])
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, <16 x i8>* [[P]], i64 0, i64 7
+; CHECK-NEXT:    store i8 [[S]], i8* [[TMP0]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %ld = load <16 x i8>, <16 x i8>* %p
+  call void @maywrite(<16 x i8>* %p)
+  %ins = insertelement <16 x i8> %ld, i8 %s, i32 3
+  store <16 x i8> %ins, <16 x i8>* %p
+  call void @foo() ; Barrier
+  %ld2 = load <16 x i8>, <16 x i8>* %p
+  call void @nowrite(<16 x i8>* %p)
+  %ins2 = insertelement <16 x i8> %ld2, i8 %s, i32 7
+  store <16 x i8> %ins2, <16 x i8>* %p
+  ret void
+}
+
+declare void @foo()
+declare void @maywrite(<16 x i8>*)
+declare void @nowrite(<16 x i8>*) readonly
+
 !0 = !{}
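For illustration only (not part of the patch or its test file): the fold requires the insertelement index to match m_Constant, so a store fed by an insert with a variable index is expected to be left as a full-vector load/insert/store. A hypothetical case, with an assumed function name, would look like:

define void @insert_store_nonconst_index(<16 x i8>* %q, i8 zeroext %s, i32 %idx) {
entry:
  %0 = load <16 x i8>, <16 x i8>* %q
  %vecins = insertelement <16 x i8> %0, i8 %s, i32 %idx
  store <16 x i8> %vecins, <16 x i8>* %q
  ret void
}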