Index: lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- lib/Target/X86/X86ISelLowering.cpp
+++ lib/Target/X86/X86ISelLowering.cpp
@@ -26720,13 +26720,74 @@
   SDValue NewVec = DAG.getNode(X86ISD::VSEXT, dl, VT, WideLd);
   return DCI.CombineTo(N, NewVec, WideLd.getValue(1), true);
 }
-/// PerformMSTORECombine - Resolve truncating stores
+
+/// If this is a build vector of boolean constants and exactly one of those
+/// constants is true, return the operand index of that true element.
+/// Otherwise, return -1.
+static int getOneTrueElt(const SDValue V) {
+  // This needs to be a build vector of booleans.
+  auto *BV = dyn_cast<BuildVectorSDNode>(V);
+  if (!BV || BV->getValueType(0).getVectorElementType() != MVT::i1)
+    return -1;
+
+  int TrueIndex = -1;
+  unsigned NumElts = BV->getValueType(0).getVectorNumElements();
+  for (unsigned i = 0; i < NumElts; ++i) {
+    const SDValue &Op = BV->getOperand(i);
+    if (Op.getOpcode() == ISD::UNDEF)
+      continue;
+    auto *ConstNode = dyn_cast<ConstantSDNode>(Op);
+    if (!ConstNode)
+      return -1;
+    if (ConstNode->getAPIntValue().isAllOnesValue()) {
+      // If we already found a one, this is too many.
+      if (TrueIndex >= 0)
+        return -1;
+      TrueIndex = i;
+    }
+  }
+  return TrueIndex;
+}
+
+/// If exactly one element of the mask is set for a non-truncating masked
+/// store, it can be replaced by a vector extract and a scalar store.
+/// Note: It is expected that the degenerate cases of an all-zeros or all-ones
+/// mask have already been optimized in IR, so we don't bother with those.
+static SDValue reduceMaskedStoreToScalarStore(MaskedStoreSDNode *MS,
+                                              SelectionDAG &DAG) {
+  int TrueMaskElt = getOneTrueElt(MS->getMask());
+  if (TrueMaskElt < 0)
+    return SDValue();
+
+  SDLoc DL(MS);
+  EVT VT = MS->getValue().getValueType();
+  EVT EltVT = VT.getVectorElementType();
+
+  // Extract the one scalar element that is actually being stored.
+  SDValue ExtractIndex = DAG.getIntPtrConstant(TrueMaskElt, DL);
+  SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT,
+                                MS->getValue(), ExtractIndex);
+
+  // Store that element at the appropriate offset from the base pointer.
+  unsigned EltSize = EltVT.getStoreSize();
+  unsigned StoreOffset = TrueMaskElt * EltSize;
+  SDValue StoreOffsetVal = DAG.getIntPtrConstant(StoreOffset, DL);
+  SDValue BasePtr = MS->getBasePtr();
+  SDValue IndexedPtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(),
+                                   BasePtr, StoreOffsetVal);
+  return DAG.getStore(MS->getChain(), DL, Extract, IndexedPtr,
+                      MS->getPointerInfo(), MS->isVolatile(),
+                      MS->isNonTemporal(),
+                      MinAlign(MS->getAlignment(), EltSize));
+}
+
 static SDValue PerformMSTORECombine(SDNode *N, SelectionDAG &DAG,
                                     const X86Subtarget &Subtarget) {
   MaskedStoreSDNode *Mst = cast<MaskedStoreSDNode>(N);
   if (!Mst->isTruncatingStore())
-    return SDValue();
+    return reduceMaskedStoreToScalarStore(Mst, DAG);
 
+  // Resolve truncating stores.
   EVT VT = Mst->getValue().getValueType();
   unsigned NumElems = VT.getVectorNumElements();
   EVT StVT = Mst->getMemoryVT();
Index: test/CodeGen/X86/masked_memop.ll
===================================================================
--- test/CodeGen/X86/masked_memop.ll
+++ test/CodeGen/X86/masked_memop.ll
@@ -991,36 +991,55 @@
   ret void
 }
 
-define void @test22(<4 x i32> %trigger, <4 x i32>* %addr, <4 x i32> %val) {
-; AVX1-LABEL: test22:
+; When only one element of the mask is set, reduce to a scalar store.
+
+define void @one_mask_bit_set1(<4 x i32>* %addr, <4 x i32> %val) {
+; AVX1-LABEL: one_mask_bit_set1:
 ; AVX1:       ## BB#0:
-; AVX1-NEXT:    movl $-1, %eax
-; AVX1-NEXT:    vmovd %eax, %xmm0
-; AVX1-NEXT:    vmaskmovps %xmm1, %xmm0, (%rdi)
+; AVX1-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX1-NEXT:    retq
 ;
-; AVX2-LABEL: test22:
+; AVX2-LABEL: one_mask_bit_set1:
 ; AVX2:       ## BB#0:
-; AVX2-NEXT:    movl $-1, %eax
-; AVX2-NEXT:    vmovd %eax, %xmm0
-; AVX2-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX2-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX2-NEXT:    retq
 ;
-; AVX512F-LABEL: test22:
+; AVX512F-LABEL: one_mask_bit_set1:
 ; AVX512F:       ## BB#0:
-; AVX512F-NEXT:    movl $-1, %eax
-; AVX512F-NEXT:    vmovd %eax, %xmm0
-; AVX512F-NEXT:    vpmaskmovd %xmm1, %xmm0, (%rdi)
+; AVX512F-NEXT:    vmovd %xmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
-; SKX-LABEL: test22:
+; SKX-LABEL: one_mask_bit_set1:
 ; SKX:       ## BB#0:
-; SKX-NEXT:    movb $1, %al
-; SKX-NEXT:    kmovw %eax, %k1
-; SKX-NEXT:    vmovdqu32 %xmm1, (%rdi) {%k1}
+; SKX-NEXT:    vmovd %xmm0, (%rdi)
 ; SKX-NEXT:    retq
-  %mask = icmp eq <4 x i32> %trigger, zeroinitializer
-  call void @llvm.masked.store.v4i32(<4 x i32>%val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+  call void @llvm.masked.store.v4i32(<4 x i32> %val, <4 x i32>* %addr, i32 4, <4 x i1><i1 true, i1 false, i1 false, i1 false>)
+  ret void
+}
+
+; Choose a different element to show that the correct address offset is produced.
+
+define void @one_mask_bit_set2(<4 x float>* %addr, <4 x float> %val) {
+; AVX1-LABEL: one_mask_bit_set2:
+; AVX1:       ## BB#0:
+; AVX1-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX1-NEXT:    retq
+;
+; AVX2-LABEL: one_mask_bit_set2:
+; AVX2:       ## BB#0:
+; AVX2-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX2-NEXT:    retq
+;
+; AVX512F-LABEL: one_mask_bit_set2:
+; AVX512F:       ## BB#0:
+; AVX512F-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; AVX512F-NEXT:    retq
+;
+; SKX-LABEL: one_mask_bit_set2:
+; SKX:       ## BB#0:
+; SKX-NEXT:    vextractps $2, %xmm0, 8(%rdi)
+; SKX-NEXT:    retq
+  call void @llvm.masked.store.v4f32(<4 x float> %val, <4 x float>* %addr, i32 4, <4 x i1><i1 false, i1 false, i1 true, i1 false>)
   ret void
 }
 
@@ -1032,6 +1051,7 @@
 declare void @llvm.masked.store.v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v2f32(<2 x float>, <2 x float>*, i32, <2 x i1>)
 declare void @llvm.masked.store.v2i32(<2 x i32>, <2 x i32>*, i32, <2 x i1>)
+declare void @llvm.masked.store.v4f32(<4 x float>, <4 x float>*, i32, <4 x i1>)
 declare void @llvm.masked.store.v16f32(<16 x float>, <16 x float>*, i32, <16 x i1>)
 declare void @llvm.masked.store.v16f32p(<16 x float>*, <16 x float>**, i32, <16 x i1>)
 declare <16 x float> @llvm.masked.load.v16f32(<16 x float>*, i32, <16 x i1>, <16 x float>)