Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -880,16 +880,31 @@
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+             !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
-
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -911,6 +911,11 @@
     /// as zero if AllowPartialUndefs is set, else we fail and return false.
     bool isConstantSplat(SDValue Op, APInt &SplatVal,
                          bool AllowPartialUndefs = true);
+
+    /// Check if Op is a load operation that could be folded into some other x86
+    /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
+    bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                     bool AssumeSingleUse = false);
   } // end namespace X86
 
   //===--------------------------------------------------------------------===//
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5039,8 +5039,8 @@
 // Other Lowering Hooks
 //===----------------------------------------------------------------------===//
-static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
-                        bool AssumeSingleUse = false) {
+bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                      bool AssumeSingleUse) {
   if (!AssumeSingleUse && !Op.hasOneUse())
     return false;
   if (!ISD::isNormalLoad(Op.getNode()))
     return false;
@@ -5062,7 +5062,7 @@
                                             const X86Subtarget &Subtarget,
                                             bool AssumeSingleUse = false) {
   assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
-  if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
+  if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
     return false;
 
   // We can not replace a wide volatile load with a broadcast-from-memory,
@@ -12746,7 +12746,7 @@
                                                MutableArrayRef<int> InputMask) {
     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
     if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
-                                 !mayFoldLoad(Input, Subtarget)))
+                                 !X86::mayFoldLoad(Input, Subtarget)))
       return;
     if (isNoopShuffleMask(InputMask))
       return;
@@ -16431,7 +16431,7 @@
   bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
-      mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
+      X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
     if (!Ld->isNonTemporal()) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19432,7 +19432,7 @@
   if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
       ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
        (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
-        mayFoldLoad(N1, Subtarget)))) {
+        X86::mayFoldLoad(N1, Subtarget)))) {
     SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
     SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -19505,7 +19505,7 @@
     // combine either bitwise AND or insert of float 0.0 to set these bits.
 
     bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
-    if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
+    if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
       // If this is an insertion of 32-bits into the low 32-bits of
       // a vector, we prefer to generate a blend with immediate rather
       // than an insertps. Blends are simpler operations in hardware and so
@@ -24645,8 +24645,8 @@
   //        being inserted between two CMOV's. (in i16 case too TBN)
   // https://bugs.llvm.org/show_bug.cgi?id=40974
   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
-      (Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
-       !mayFoldLoad(Op2, Subtarget))) {
+      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
+       !X86::mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36993,7 +36993,7 @@
     if (isUndefOrEqual(Mask, 0)) {
       if (V1.getValueType() == MaskVT &&
           V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          mayFoldLoad(V1.getOperand(0), Subtarget)) {
+          X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
         if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
           return SDValue(); // Nothing to do!
         Res = V1.getOperand(0);
@@ -38436,8 +38436,8 @@
     unsigned Imm = V.getConstantOperandVal(2);
     const X86Subtarget &Subtarget =
         static_cast<const X86Subtarget &>(DAG.getSubtarget());
-    if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
-        mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
+    if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+        X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
       return SDValue();
     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51684,7 +51684,8 @@
     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Subtarget.hasAVX2() ||
-         (EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
+         (EltSizeInBits >= 32 &&
+          X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
         Op0.getOperand(0).getValueType() == VT.getScalarType())
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 
@@ -53016,7 +53017,7 @@
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
+    if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
       return false;
     break;
   }
@@ -53031,11 +53032,11 @@
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     // Avoid disabling potential load folding opportunities.
-    if (mayFoldLoad(N1, Subtarget) &&
+    if (X86::mayFoldLoad(N1, Subtarget) &&
         (!Commute || !isa<ConstantSDNode>(N0) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
       return false;
-    if (mayFoldLoad(N0, Subtarget) &&
+    if (X86::mayFoldLoad(N0, Subtarget) &&
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
Index: llvm/test/CodeGen/X86/combine-sub.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-sub.ll
+++ llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,10 @@
   ret <4 x i32> %sub
 }
 
+; With AVX, this could use broadcast (an extra load) and
+; load-folded 'add', but currently we favor the virtually
+; free pcmpeq instruction.
+
 define void @PR52032_oneuse_constant(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032_oneuse_constant:
 ; SSE:       # %bb.0:
@@ -302,6 +306,9 @@
   ret void
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -322,12 +329,10 @@
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -341,6 +346,10 @@
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -355,12 +364,10 @@
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -373,6 +380,8 @@
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -403,6 +412,8 @@
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0: