Index: lib/CodeGen/SelectionDAG/DAGCombiner.cpp =================================================================== --- lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -2145,14 +2145,87 @@ } // fold (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2) - if (N1IsConst && N0.getOpcode() == ISD::ADD && N0.getNode()->hasOneUse() && + if (N1IsConst && N0.getOpcode() == ISD::ADD && (isConstantSplatVector(N0.getOperand(1).getNode(), Val) || - isa(N0.getOperand(1)))) - return DAG.getNode(ISD::ADD, SDLoc(N), VT, - DAG.getNode(ISD::MUL, SDLoc(N0), VT, - N0.getOperand(0), N1), - DAG.getNode(ISD::MUL, SDLoc(N1), VT, - N0.getOperand(1), N1)); + isa(N0.getOperand(1)))) { + + SDValue AddNode = N0; + SDValue ConstNode = N1; + bool IsProfitable = true; + + // If the (add x, c1) has multiple uses, we could increase + // the number of adds if we make this transformation. + // It would only be worth doing this if we can remove a + // multiply in the process. Check for that here. + // To illustrate: + // (A + c1) * c3 + // (A + c2) * c3 + // We're checking for cases where we have common "c3 * A" expressions. + if (!AddNode.getNode()->hasOneUse()) { + IsProfitable = false; + + // Walk all the users of the constant with which we're multiplying. + for (SDNode *Use : ConstNode->uses()) { + + if (Use == N) // This use is the one we're on right now. Skip it. + continue; + + if (Use->getOpcode() == ISD::MUL) { // We have another multiply use. + SDNode *OtherOp; + SDNode *MulVar = AddNode.getOperand(0).getNode(); + + // OtherOp is what we're multiplying against the constant. + if (Use->getOperand(0) == ConstNode) + OtherOp = Use->getOperand(1).getNode(); + else + OtherOp = Use->getOperand(0).getNode(); + + // Check to see if multiply is with the same operand of our "add". + // + // ConstNode = CONST + // Use = ConstNode * A <-- visiting Use. OtherOp is A. + // ... + // AddNode = (A + c1) <-- MulVar is A. + // = AddNode * ConstNode <-- current visiting instruction. + // + // If we make this transformation, we will have a common + // multiply (ConstNode * A) that we can save. + if (OtherOp == MulVar) { + IsProfitable = true; + break; + } + + // Now check to see if a future expansion will give us a common + // multiply. + // + // ConstNode = CONST + // AddNode = (A + c1) + // ... = AddNode * ConstNode <-- current visiting instruction. + // ... + // OtherOp = (A + c2) + // Use = OtherOp * ConstNode <-- visiting Use. + // + // If we make this transformation, we will have a common + // multiply (CONST * A) after we also do the same transformation + // to the "t2" instruction. + if (OtherOp->getOpcode() == ISD::ADD && + (isConstantSplatVector(OtherOp->getOperand(1).getNode(), Val) || + isa(OtherOp->getOperand(1).getNode())) && + OtherOp->getOperand(0).getNode() == MulVar) { + IsProfitable = true; + break; + } + } + } + } + + if (IsProfitable) + return DAG.getNode(ISD::ADD, SDLoc(N), VT, + DAG.getNode(ISD::MUL, SDLoc(AddNode), VT, + AddNode.getOperand(0), ConstNode), + DAG.getNode(ISD::MUL, SDLoc(N1), VT, + AddNode.getOperand(1), ConstNode)); + } // reassociate mul if (SDValue RMUL = ReassociateOps(ISD::MUL, SDLoc(N), N0, N1)) Index: test/CodeGen/X86/combine-multiplies.ll =================================================================== --- test/CodeGen/X86/combine-multiplies.ll +++ test/CodeGen/X86/combine-multiplies.ll @@ -0,0 +1,86 @@ +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +; Source file looks something like this: +; +; typedef int AAA[100][100]; +; +; void foo(AAA a,int lll) +; { +; int LOC = lll + 5; +; +; a[LOC][LOC] = 11; +; +; a[LOC][20] = 22; +; a[LOC+20][20] = 33; +; } +; +; We want to make sure we don't generate 2 multiply instructions, +; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp +; should combine the instructions in such a way to avoid the extra +; multiply. +; +; CHECK-LABEL: foo +; CHECK: imull +; CHECK-NOT: imull +; CHECK: retl +; + +; Function Attrs: nounwind +define void @foo([100 x i32]* nocapture %a, i32 %lll) #0 { +entry: + %add = add nsw i32 %lll, 5 + %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add + store i32 11, i32* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20 + store i32 22, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %lll, 25 + %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20 + store i32 33, i32* %arrayidx6, align 4 + ret void +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } + +; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s + +; Source file looks something like this: +; +; typedef int AAA[100][100]; +; +; void foo(AAA a,int lll) +; { +; int LOC = lll + 5; +; +; a[LOC][LOC] = 11; +; +; a[LOC][20] = 22; +; a[LOC+20][20] = 33; +; } +; +; We want to make sure we don't generate 2 multiply instructions, +; one for a[LOC][] and one for a[LOC+20]. visitMUL in DAGCombiner.cpp +; should combine the instructions in such a way to avoid the extra +; multiply. +; +; CHECK-LABEL: foo +; CHECK: imull +; CHECK-NOT: imull +; CHECK: retl +; + +; Function Attrs: nounwind +define void @foo([100 x i32]* nocapture %a, i32 %lll) #0 { +entry: + %add = add nsw i32 %lll, 5 + %arrayidx1 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 %add + store i32 11, i32* %arrayidx1, align 4 + %arrayidx3 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add, i32 20 + store i32 22, i32* %arrayidx3, align 4 + %add4 = add nsw i32 %lll, 25 + %arrayidx6 = getelementptr inbounds [100 x i32], [100 x i32]* %a, i32 %add4, i32 20 + store i32 33, i32* %arrayidx6, align 4 + ret void +} + +attributes #0 = { nounwind "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="pentium4" "target-features"="+sse,+sse2" "unsafe-fp-math"="false" "use-soft-float"="false" } +