diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -880,16 +880,31 @@
       continue;
     }
 
-    /// Convert vector increment or decrement to sub/add with an all-ones
-    /// constant:
-    /// add X, <1, 1...> --> sub X, <-1, -1...>
-    /// sub X, <1, 1...> --> add X, <-1, -1...>
-    /// The all-ones vector constant can be materialized using a pcmpeq
-    /// instruction that is commonly recognized as an idiom (has no register
-    /// dependency), so that's better/smaller than loading a splat 1 constant.
+    // Convert vector increment or decrement to sub/add with an all-ones
+    // constant:
+    // add X, <1, 1...> --> sub X, <-1, -1...>
+    // sub X, <1, 1...> --> add X, <-1, -1...>
+    // The all-ones vector constant can be materialized using a pcmpeq
+    // instruction that is commonly recognized as an idiom (has no register
+    // dependency), so that's better/smaller than loading a splat 1 constant.
+    //
+    // But don't do this if it would inhibit a potentially profitable load
+    // folding opportunity for the other operand. That only occurs with the
+    // intersection of:
+    // (1) The other operand (op0) is load foldable.
+    // (2) The op is an add (otherwise, we are *creating* an add and can still
+    //     load fold the other op).
+    // (3) The target has AVX (otherwise, we have a destructive add and can't
+    //     load fold the other op without killing the constant op).
+    // (4) The constant 1 vector has multiple uses (so it is profitable to load
+    //     into a register anyway).
+    auto mayPreventLoadFold = [&]() {
+      return X86::mayFoldLoad(N->getOperand(0), *Subtarget) &&
+             N->getOpcode() == ISD::ADD && Subtarget->hasAVX() &&
+             !N->getOperand(1).hasOneUse();
+    };
     if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
-        N->getSimpleValueType(0).isVector()) {
-
+        N->getSimpleValueType(0).isVector() && !mayPreventLoadFold()) {
       APInt SplatVal;
       if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
           SplatVal.isOne()) {
diff --git a/llvm/test/CodeGen/X86/combine-sub.ll b/llvm/test/CodeGen/X86/combine-sub.ll
--- a/llvm/test/CodeGen/X86/combine-sub.ll
+++ b/llvm/test/CodeGen/X86/combine-sub.ll
@@ -276,6 +276,10 @@
   ret <4 x i32> %sub
 }
 
+; With AVX, this could use broadcast (an extra load) and
+; load-folded 'add', but currently we favor the virtually
+; free pcmpeq instruction.
+
 define void @PR52032_oneuse_constant(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032_oneuse_constant:
 ; SSE:       # %bb.0:
@@ -302,6 +306,9 @@
   ret void
 }
 
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032(<8 x i32>* %p) {
 ; SSE-LABEL: PR52032:
 ; SSE:       # %bb.0:
@@ -322,12 +329,10 @@
 ;
 ; AVX-LABEL: PR52032:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %ymm0, %ymm0, %ymm0
-; AVX-NEXT:    vmovdqu (%rdi), %ymm1
-; AVX-NEXT:    vmovdqu 32(%rdi), %ymm2
-; AVX-NEXT:    vpsubd %ymm0, %ymm1, %ymm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} ymm0 = [1,1,1,1,1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %ymm0, %ymm1
 ; AVX-NEXT:    vmovdqu %ymm1, (%rdi)
-; AVX-NEXT:    vpsubd %ymm0, %ymm2, %ymm0
+; AVX-NEXT:    vpaddd 32(%rdi), %ymm0, %ymm0
 ; AVX-NEXT:    vmovdqu %ymm0, 32(%rdi)
 ; AVX-NEXT:    vzeroupper
 ; AVX-NEXT:    retq
@@ -341,6 +346,10 @@
   ret void
 }
 
+; Same as above, but 128-bit ops:
+; With AVX, we don't transform 'add' to 'sub' because that prevents load folding.
+; With SSE, we do it because we can't load fold the other op without overwriting the constant op.
+
 define void @PR52032_2(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_2:
 ; SSE:       # %bb.0:
@@ -355,12 +364,10 @@
 ;
 ; AVX-LABEL: PR52032_2:
 ; AVX:       # %bb.0:
-; AVX-NEXT:    vpcmpeqd %xmm0, %xmm0, %xmm0
-; AVX-NEXT:    vmovdqu (%rdi), %xmm1
-; AVX-NEXT:    vmovdqu 16(%rdi), %xmm2
-; AVX-NEXT:    vpsubd %xmm0, %xmm1, %xmm1
+; AVX-NEXT:    vpbroadcastd {{.*#+}} xmm0 = [1,1,1,1]
+; AVX-NEXT:    vpaddd (%rdi), %xmm0, %xmm1
 ; AVX-NEXT:    vmovdqu %xmm1, (%rdi)
-; AVX-NEXT:    vpsubd %xmm0, %xmm2, %xmm0
+; AVX-NEXT:    vpaddd 16(%rdi), %xmm0, %xmm0
 ; AVX-NEXT:    vmovdqu %xmm0, 16(%rdi)
 ; AVX-NEXT:    retq
   %i3 = load <4 x i32>, <4 x i32>* %p, align 4
@@ -373,6 +380,8 @@
   ret void
 }
 
+; If we are starting with a 'sub', it is always better to do the transform.
+
 define void @PR52032_3(<4 x i32>* %p) {
 ; SSE-LABEL: PR52032_3:
 ; SSE:       # %bb.0:
@@ -403,6 +412,8 @@
   ret void
 }
 
+; If there's no chance of profitable load folding (because of extra uses), we convert 'add' to 'sub'.
+
 define void @PR52032_4(<4 x i32>* %p, <4 x i32>* %q) {
 ; SSE-LABEL: PR52032_4:
 ; SSE:       # %bb.0: