Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -878,16 +878,30 @@
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0)) && N->getOpcode() == ISD::ADD &&
+             Subtarget->hasAVX() && !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
-
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -911,6 +911,8 @@
   /// as zero if AllowPartialUndefs is set, else we fail and return false.
   bool isConstantSplat(SDValue Op, APInt &SplatVal,
                        bool AllowPartialUndefs = true);
+
+  bool mayFoldLoad(SDValue Op, bool AssumeSingleUse = false);
 } // end namespace X86
 
 //===--------------------------------------------------------------------===//
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5039,13 +5039,13 @@
 // Other Lowering Hooks
 //===----------------------------------------------------------------------===//
 
-static bool MayFoldLoad(SDValue Op, bool AssumeSingleUse = false) {
+bool X86::mayFoldLoad(SDValue Op, bool AssumeSingleUse) {
   return (AssumeSingleUse || Op.hasOneUse()) && ISD::isNormalLoad(Op.getNode());
 }
 
 static bool MayFoldLoadIntoBroadcastFromMem(SDValue Op, MVT EltVT,
                                             bool AssumeSingleUse = false) {
-  if (!MayFoldLoad(Op, AssumeSingleUse))
+  if (!X86::mayFoldLoad(Op, AssumeSingleUse))
     return false;
 
   // We can not replace a wide volatile load with a broadcast-from-memory,
@@ -12728,7 +12728,7 @@
                                               MutableArrayRef<int> InputMask) {
     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
     if (!Subtarget.hasAVX2() &&
-        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !MayFoldLoad(Input)))
+        (!Subtarget.hasAVX() || EltSizeInBits < 32 || !X86::mayFoldLoad(Input)))
       return;
     if (isNoopShuffleMask(InputMask))
       return;
@@ -16413,7 +16413,7 @@
   bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
-      MayFoldLoad(peekThroughOneUseBitcasts(V1))) {
+      X86::mayFoldLoad(peekThroughOneUseBitcasts(V1))) {
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
     if (!Ld->isNonTemporal()) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19413,7 +19413,7 @@
   // FIXME: relax the profitability check iff all N1 uses are insertions.
   if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
       ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
-       (Subtarget.hasAVX() && (EltSizeInBits >= 32) && MayFoldLoad(N1)))) {
+       (Subtarget.hasAVX() && (EltSizeInBits >= 32) && X86::mayFoldLoad(N1)))) {
     SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
     SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -19486,7 +19486,7 @@
   // combine either bitwise AND or insert of float 0.0 to set these bits.
 
   bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
-  if (IdxVal == 0 && (!MinSize || !MayFoldLoad(N1))) {
+  if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1))) {
     // If this is an insertion of 32-bits into the low 32-bits of
     // a vector, we prefer to generate a blend with immediate rather
     // than an insertps. Blends are simpler operations in hardware and so
@@ -24626,8 +24626,8 @@
   //        being inserted between two CMOV's. (in i16 case too TBN)
   // https://bugs.llvm.org/show_bug.cgi?id=40974
   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
-      (Op.getValueType() == MVT::i16 && !MayFoldLoad(Op1) &&
-       !MayFoldLoad(Op2))) {
+      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1) &&
+       !X86::mayFoldLoad(Op2))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36974,7 +36974,7 @@
     if (isUndefOrEqual(Mask, 0)) {
       if (V1.getValueType() == MaskVT &&
           V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          MayFoldLoad(V1.getOperand(0))) {
+          X86::mayFoldLoad(V1.getOperand(0))) {
         if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
           return SDValue(); // Nothing to do!
         Res = V1.getOperand(0);
@@ -38415,8 +38415,8 @@
     SDValue N0 = V.getOperand(0);
     SDValue N1 = V.getOperand(1);
     unsigned Imm = V.getConstantOperandVal(2);
-    if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
-        MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+    if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+        X86::mayFoldLoad(peekThroughOneUseBitcasts(N1)))
       return SDValue();
     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51620,7 +51620,7 @@
     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Subtarget.hasAVX2() ||
-         (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+         (EltSizeInBits >= 32 && X86::mayFoldLoad(Op0.getOperand(0)))) &&
         Op0.getOperand(0).getValueType() == VT.getScalarType())
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 
@@ -52952,7 +52952,7 @@
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (MayFoldLoad(N0) && IsFoldableRMW(N0, Op))
+    if (X86::mayFoldLoad(N0) && IsFoldableRMW(N0, Op))
       return false;
     break;
   }
@@ -52967,11 +52967,11 @@
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     // Avoid disabling potential load folding opportunities.
-    if (MayFoldLoad(N1) &&
+    if (X86::mayFoldLoad(N1) &&
         (!Commute || !isa<ConstantSDNode>(N0) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
       return false;
-    if (MayFoldLoad(N0) &&
+    if (X86::mayFoldLoad(N0) &&
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
Index: llvm/test/CodeGen/X86/combine-sub.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-sub.ll
+++ llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,9 @@
   ret <4 x i32> %sub
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -296,12 +299,10 @@
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -315,6 +316,10 @@
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -329,12 +334,10 @@
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -347,6 +350,8 @@
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -377,6 +382,8 @@
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0: