Index: llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -880,16 +880,31 @@
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+             !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
-
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {
Index: llvm/lib/Target/X86/X86ISelLowering.h
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.h
+++ llvm/lib/Target/X86/X86ISelLowering.h
@@ -911,6 +911,11 @@
     /// as zero if AllowPartialUndefs is set, else we fail and return false.
     bool isConstantSplat(SDValue Op, APInt &SplatVal,
                          bool AllowPartialUndefs = true);
+
+    /// Check if Op is a load operation that could be folded into some other x86
+    /// instruction as a memory operand. Example: vpaddd (%rdi), %xmm0, %xmm0.
+    bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                     bool AssumeSingleUse = false);
   } // end namespace X86
 
   //===--------------------------------------------------------------------===//
Index: llvm/lib/Target/X86/X86ISelLowering.cpp
===================================================================
--- llvm/lib/Target/X86/X86ISelLowering.cpp
+++ llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -5039,8 +5039,8 @@
 // Other Lowering Hooks
 //===----------------------------------------------------------------------===//
-static bool mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
-                        bool AssumeSingleUse = false) {
+bool X86::mayFoldLoad(SDValue Op, const X86Subtarget &Subtarget,
+                      bool AssumeSingleUse) {
   if (!AssumeSingleUse && !Op.hasOneUse())
     return false;
   if (!ISD::isNormalLoad(Op.getNode()))
     return false;
@@ -5062,7 +5062,7 @@
                                             const X86Subtarget &Subtarget,
                                             bool AssumeSingleUse = false) {
   assert(Subtarget.hasAVX() && "Expected AVX for broadcast from memory");
-  if (!mayFoldLoad(Op, Subtarget, AssumeSingleUse))
+  if (!X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse))
     return false;
 
   // We can not replace a wide volatile load with a broadcast-from-memory,
@@ -12746,7 +12746,7 @@
                                                MutableArrayRef<int> InputMask) {
     unsigned EltSizeInBits = Input.getScalarValueSizeInBits();
     if (!Subtarget.hasAVX2() && (!Subtarget.hasAVX() || EltSizeInBits < 32 ||
-                                 !mayFoldLoad(Input, Subtarget)))
+                                 !X86::mayFoldLoad(Input, Subtarget)))
       return;
     if (isNoopShuffleMask(InputMask))
       return;
@@ -16431,7 +16431,7 @@
   bool SplatLo = isShuffleEquivalent(Mask, {0, 1, 0, 1}, V1);
   bool SplatHi = isShuffleEquivalent(Mask, {2, 3, 2, 3}, V1);
   if ((SplatLo || SplatHi) && !Subtarget.hasAVX512() && V1.hasOneUse() &&
-      mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
+      X86::mayFoldLoad(peekThroughOneUseBitcasts(V1), Subtarget)) {
     auto *Ld = cast<LoadSDNode>(peekThroughOneUseBitcasts(V1));
     if (!Ld->isNonTemporal()) {
       MVT MemVT = VT.getHalfNumVectorElementsVT();
@@ -19432,7 +19432,7 @@
   if (!VT.is128BitVector() && IdxVal >= NumEltsIn128 &&
       ((Subtarget.hasAVX2() && EltSizeInBits != 8) ||
        (Subtarget.hasAVX() && (EltSizeInBits >= 32) &&
-        mayFoldLoad(N1, Subtarget)))) {
+        X86::mayFoldLoad(N1, Subtarget)))) {
     SDValue N1SplatVec = DAG.getSplatBuildVector(VT, dl, N1);
     SmallVector<int, 8> BlendMask;
     for (unsigned i = 0; i != NumElts; ++i)
@@ -19505,7 +19505,7 @@
     // combine either bitwise AND or insert of float 0.0 to set these bits.
 
     bool MinSize = DAG.getMachineFunction().getFunction().hasMinSize();
-    if (IdxVal == 0 && (!MinSize || !mayFoldLoad(N1, Subtarget))) {
+    if (IdxVal == 0 && (!MinSize || !X86::mayFoldLoad(N1, Subtarget))) {
       // If this is an insertion of 32-bits into the low 32-bits of
       // a vector, we prefer to generate a blend with immediate rather
       // than an insertps. Blends are simpler operations in hardware and so
@@ -24645,8 +24645,8 @@
   //        being inserted between two CMOV's. (in i16 case too TBN)
   // https://bugs.llvm.org/show_bug.cgi?id=40974
   if ((Op.getValueType() == MVT::i8 && Subtarget.hasCMov()) ||
-      (Op.getValueType() == MVT::i16 && !mayFoldLoad(Op1, Subtarget) &&
-       !mayFoldLoad(Op2, Subtarget))) {
+      (Op.getValueType() == MVT::i16 && !X86::mayFoldLoad(Op1, Subtarget) &&
+       !X86::mayFoldLoad(Op2, Subtarget))) {
     Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
     Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
     SDValue Ops[] = { Op2, Op1, CC, Cond };
@@ -36993,7 +36993,7 @@
     if (isUndefOrEqual(Mask, 0)) {
       if (V1.getValueType() == MaskVT &&
           V1.getOpcode() == ISD::SCALAR_TO_VECTOR &&
-          mayFoldLoad(V1.getOperand(0), Subtarget)) {
+          X86::mayFoldLoad(V1.getOperand(0), Subtarget)) {
         if (Depth == 0 && Root.getOpcode() == X86ISD::VBROADCAST)
           return SDValue(); // Nothing to do!
         Res = V1.getOperand(0);
@@ -38436,8 +38436,8 @@
     unsigned Imm = V.getConstantOperandVal(2);
     const X86Subtarget &Subtarget =
         static_cast<const X86Subtarget &>(DAG.getSubtarget());
-    if (!mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
-        mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
+    if (!X86::mayFoldLoad(peekThroughOneUseBitcasts(N0), Subtarget) ||
+        X86::mayFoldLoad(peekThroughOneUseBitcasts(N1), Subtarget))
       return SDValue();
     Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
     return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
@@ -51684,7 +51684,8 @@
     // concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
     if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
         (Subtarget.hasAVX2() ||
-         (EltSizeInBits >= 32 && mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
+         (EltSizeInBits >= 32 &&
+          X86::mayFoldLoad(Op0.getOperand(0), Subtarget))) &&
         Op0.getOperand(0).getValueType() == VT.getScalarType())
       return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
 
@@ -53016,7 +53017,7 @@
   case ISD::SRL: {
     SDValue N0 = Op.getOperand(0);
     // Look out for (store (shl (load), x)).
-    if (mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
+    if (X86::mayFoldLoad(N0, Subtarget) && IsFoldableRMW(N0, Op))
       return false;
     break;
   }
@@ -53031,11 +53032,11 @@
     SDValue N0 = Op.getOperand(0);
     SDValue N1 = Op.getOperand(1);
     // Avoid disabling potential load folding opportunities.
-    if (mayFoldLoad(N1, Subtarget) &&
+    if (X86::mayFoldLoad(N1, Subtarget) &&
         (!Commute || !isa<ConstantSDNode>(N0) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N1, Op))))
       return false;
-    if (mayFoldLoad(N0, Subtarget) &&
+    if (X86::mayFoldLoad(N0, Subtarget) &&
         ((Commute && !isa<ConstantSDNode>(N1)) ||
          (Op.getOpcode() != ISD::MUL && IsFoldableRMW(N0, Op))))
       return false;
Index: llvm/test/CodeGen/X86/combine-sub.ll
===================================================================
--- llvm/test/CodeGen/X86/combine-sub.ll
+++ llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,10 @@
   ret <4 x i32> %sub
 }
 
+; With AVX, this could use broadcast (an extra load) and
+; load-folded 'add', but currently we favor the virtually
+; free pcmpeq instruction.
+
 define void @PR52032_oneuse_constant(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032_oneuse_constant:
 ; SSE:       # %bb.0:
@@ -302,6 +306,9 @@
   ret void
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -322,12 +329,10 @@
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -341,6 +346,10 @@
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -355,12 +364,10 @@
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -373,6 +380,8 @@
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -403,6 +412,8 @@
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0: