Generalize the negated pow2 folds to support vectors. It looks like some other shl(add(x,c1),c2) -> add(shl(x,c2),c3) style folds are conflicting, but they only affect scalars / non-uniform vectors — I'll address this in a future patch.
Details
Diff Detail
Unit Tests
Event Timeline
Aha, good catch.
But this is not enough of a generalization:
diff diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp index 172b304b776..06a0fcbf0fa 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp @@ -233,29 +233,11 @@ Instruction *InstCombinerImpl::visitMul(BinaryOperator &I) { } } - if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) { - // (Y - X) * (-(2**n)) -> (X - Y) * (2**n), for positive nonzero n - // (Y + const) * (-(2**n)) -> (-constY) * (2**n), for positive nonzero n - // The "* (2**n)" thus becomes a potential shifting opportunity. - { - const APInt & Val = CI->getValue(); - const APInt &PosVal = Val.abs(); - if (Val.isNegative() && PosVal.isPowerOf2()) { - Value *X = nullptr, *Y = nullptr; - if (Op0->hasOneUse()) { - ConstantInt *C1; - Value *Sub = nullptr; - if (match(Op0, m_Sub(m_Value(Y), m_Value(X)))) - Sub = Builder.CreateSub(X, Y, "suba"); - else if (match(Op0, m_Add(m_Value(Y), m_ConstantInt(C1)))) - Sub = Builder.CreateSub(Builder.CreateNeg(C1), Y, "subc"); - if (Sub) - return - BinaryOperator::CreateMul(Sub, - ConstantInt::get(Y->getType(), PosVal)); - } - } - } + if (match(Op1, m_NegatedPower2())) { + // Interpret X * (-1<<C) as (-X) * (1<<C) and try to sink the negation. + if (Value *NegOp0 = Negator::Negate(/*IsNegation*/ false, Op0, *this)) + return BinaryOperator::CreateMul( + NegOp0, ConstantExpr::getNeg(cast<Constant>(Op1)), I.getName()); } if (Instruction *FoldedMul = foldBinOpIntoSelectOrPhi(I))
We miss a similar fold for `sdiv exact %x, -1<<C --> -(ashr exact %x, C)`.
I can take over this patch if you prefer.
If you're offering - that'd be great, thank you! I randomly came across the fact that we match this in the DAG but not in IR.
Also, I'm rather disappointed by how often we have code that matches only m_ConstantInt where m_APInt (which would match uniform vectors with no undefs) or a full m_Constant could be used (possibly in conjunction with ConstantExpr) — migrating those sites feels like something we should be aiming for.
llvm/test/Transforms/InstCombine/mul.ll | ||
---|---|---|
976 | And another example that we need something like Negator-driven hoisting. |
llvm/test/Transforms/InstCombine/mul.ll | ||
---|---|---|
976 | Also, there's some very interesting fold pirouetting going on there. INSTCOMBINE ITERATION #1 on muladd2 IC: ADD: ret i32 %mul IC: ADD: %mul = mul i32 %add, -4 IC: ADD: %add = add i32 %a0, 16 IC: Visiting: %add = add i32 %a0, 16 IC: Visiting: %mul = mul i32 %add, -4 Negator: attempting to sink negation into %add = add i32 %a0, 16 Negator: successfully sunk negation into %add = add i32 %a0, 16 NEW: %add.neg = sub i32 -16, %a0 Negator: Propagating 1 instrs to InstCombine IC: ADD DEFERRED: %add.neg = sub i32 -16, %a0 IC: Old = %mul = mul i32 %add, -4 New = %mul = mul i32 %add.neg, 4 IC: ADD: %mul = mul i32 %add.neg, 4 IC: ERASE %1 = mul i32 %add, -4 IC: ADD DEFERRED: %add = add i32 %a0, 16 IC: ERASE %add = add i32 %a0, 16 IC: ADD: %add.neg = sub i32 -16, %a0 IC: Visiting: %add.neg = sub i32 -16, %a0 Negator: attempting to sink negation into i32 %a0 Negator: failed to sink negation into i32 %a0 IC: Visiting: %mul = mul i32 %add.neg, 4 IC: Old = %mul = mul i32 %add.neg, 4 New = <badref> = shl i32 %add.neg, 2 IC: ADD: %mul = shl i32 %add.neg, 2 IC: ERASE %1 = mul i32 %add.neg, 4 IC: ADD DEFERRED: %add.neg = sub i32 -16, %a0 IC: ADD: %add.neg = sub i32 -16, %a0 IC: Visiting: %add.neg = sub i32 -16, %a0 Negator: attempting to sink negation into i32 %a0 Negator: failed to sink negation into i32 %a0 IC: Visiting: %mul = shl i32 %add.neg, 2 IC: ADD DEFERRED: %add.neg = sub i32 1073741808, %a0 IC: Mod = %mul = shl i32 %add.neg, 2 New = %mul = shl i32 %add.neg, 2 IC: ADD: %mul = shl i32 %add.neg, 2 IC: ADD: %add.neg = sub i32 1073741808, %a0 IC: Visiting: %add.neg = sub i32 1073741808, %a0 Negator: attempting to sink negation into i32 %a0 Negator: failed to sink negation into i32 %a0 IC: Visiting: %mul = shl i32 %add.neg, 2 IC: ADD DEFERRED: %1 = shl i32 %a0, 2 IC: Old = %mul = shl i32 %1, 2 New = <badref> = sub i32 -64, %add.neg IC: ADD: %mul = sub i32 -64, %add.neg IC: ERASE %2 = shl i32 %1, 2 IC: ADD DEFERRED: %1 = sub i32 1073741808, %a0 IC: ERASE %1 = sub i32 
1073741808, %a0 IC: ADD: %add.neg = shl i32 %a0, 2 IC: Visiting: %add.neg = shl i32 %a0, 2 IC: Visiting: %mul = sub i32 -64, %add.neg Negator: attempting to sink negation into %add.neg = shl i32 %a0, 2 Negator: successfully sunk negation into %add.neg = shl i32 %a0, 2 NEW: %add.neg.neg = mul i32 %a0, -4 Negator: Propagating 1 instrs to InstCombine IC: ADD DEFERRED: %add.neg.neg = mul i32 %a0, -4 IC: Old = %mul = sub i32 -64, %add.neg New = <badref> = add i32 %add.neg.neg, -64 IC: ADD: %mul = add i32 %add.neg.neg, -64 IC: ERASE %1 = sub i32 -64, %add.neg IC: ADD DEFERRED: %add.neg = shl i32 %a0, 2 IC: ERASE %add.neg = shl i32 %a0, 2 IC: ADD: %add.neg.neg = mul i32 %a0, -4 IC: Visiting: %add.neg.neg = mul i32 %a0, -4 Negator: attempting to sink negation into i32 %a0 Negator: failed to sink negation into i32 %a0 IC: Visiting: %mul = add i32 %add.neg.neg, -64 IC: Visiting: ret i32 %mul INSTCOMBINE ITERATION #2 on muladd2 IC: ADD: ret i32 %mul IC: ADD: %mul = add i32 %add.neg.neg, -64 IC: ADD: %add.neg.neg = mul i32 %a0, -4 IC: Visiting: %add.neg.neg = mul i32 %a0, -4 Negator: attempting to sink negation into i32 %a0 Negator: failed to sink negation into i32 %a0 IC: Visiting: %mul = add i32 %add.neg.neg, -64 IC: Visiting: ret i32 %mul |
(Just a heads-up, in case someone else runs into the same problem...)
I've noticed some regressions downstream after this patch. Haven't debugged/reduced it completely yet, but at least in one case I suspect that the problem actually might be in load-store-vectorizer.
Used to get something like this before load-store-vectorizer:
vector.body: ; preds = %vector.body, %vector.ph %index = phi i16 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !17 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %53, %vector.body ] %vec.phi72 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %57, %vector.body ] %3 = shl i16 %index, 1 %next.gep = getelementptr i16, i16* %add.ptr, i16 %3 %4 = or i16 %3, 2 %next.gep73 = getelementptr i16, i16* %add.ptr, i16 %4 %5 = or i16 %3, 4 %next.gep74 = getelementptr i16, i16* %add.ptr, i16 %5 %6 = or i16 %3, 6 %next.gep75 = getelementptr i16, i16* %add.ptr, i16 %6 %7 = mul i16 %index, -2 %next.gep76 = getelementptr i16, i16* %incdec.ptr.us, i16 %7 %8 = or i16 %index, 1 %9 = mul i16 %8, -2 %next.gep77 = getelementptr i16, i16* %incdec.ptr.us, i16 %9 %10 = or i16 %index, 2 %11 = mul i16 %10, -2 %next.gep78 = getelementptr i16, i16* %incdec.ptr.us, i16 %11 %12 = or i16 %index, 3 %13 = mul i16 %12, -2 %next.gep79 = getelementptr i16, i16* %incdec.ptr.us, i16 %13 %next.gep84 = getelementptr i16, i16* %h, i16 %3 %next.gep85 = getelementptr i16, i16* %h, i16 %4 %next.gep86 = getelementptr i16, i16* %h, i16 %5 %next.gep87 = getelementptr i16, i16* %h, i16 %6 %next.gep92 = getelementptr i16, i16* %x.addr.051.us, i16 %7 %next.gep93 = getelementptr i16, i16* %x.addr.051.us, i16 %9 %next.gep94 = getelementptr i16, i16* %x.addr.051.us, i16 %11 %next.gep95 = getelementptr i16, i16* %x.addr.051.us, i16 %13 %14 = load i16, i16* %next.gep, align 1, !dbg !11, !tbaa !18 %15 = load i16, i16* %next.gep73, align 1, !dbg !11, !tbaa !18 %16 = load i16, i16* %next.gep74, align 1, !dbg !11, !tbaa !18 %17 = load i16, i16* %next.gep75, align 1, !dbg !11, !tbaa !18 %18 = insertelement <4 x i16> undef, i16 %14, i32 0, !dbg !11 %19 = insertelement <4 x i16> %18, i16 %15, i32 1, !dbg !11 %20 = insertelement <4 x i16> %19, i16 %16, i32 2, !dbg !11 %21 = insertelement <4 x i16> %20, i16 %17, i32 3, !dbg !11 %22 = load i16, i16* %next.gep84, align 1, !dbg !11, !tbaa !18 %23 = load i16, 
i16* %next.gep85, align 1, !dbg !11, !tbaa !18 %24 = load i16, i16* %next.gep86, align 1, !dbg !11, !tbaa !18 %25 = load i16, i16* %next.gep87, align 1, !dbg !11, !tbaa !18 %26 = insertelement <4 x i16> undef, i16 %22, i32 0, !dbg !11 %27 = insertelement <4 x i16> %26, i16 %23, i32 1, !dbg !11 %28 = insertelement <4 x i16> %27, i16 %24, i32 2, !dbg !11 %29 = insertelement <4 x i16> %28, i16 %25, i32 3, !dbg !11 %30 = load i16, i16* %next.gep76, align 1, !dbg !11, !tbaa !18 %31 = load i16, i16* %next.gep77, align 1, !dbg !11, !tbaa !18 %32 = load i16, i16* %next.gep78, align 1, !dbg !11, !tbaa !18 %33 = load i16, i16* %next.gep79, align 1, !dbg !11, !tbaa !18 %34 = insertelement <4 x i16> undef, i16 %30, i32 0, !dbg !11 %35 = insertelement <4 x i16> %34, i16 %31, i32 1, !dbg !11 %36 = insertelement <4 x i16> %35, i16 %32, i32 2, !dbg !11 %37 = insertelement <4 x i16> %36, i16 %33, i32 3, !dbg !11 %38 = load i16, i16* %next.gep92, align 1, !dbg !11, !tbaa !18 %39 = load i16, i16* %next.gep93, align 1, !dbg !11, !tbaa !18 %40 = load i16, i16* %next.gep94, align 1, !dbg !11, !tbaa !18 %41 = load i16, i16* %next.gep95, align 1, !dbg !11, !tbaa !18 %42 = insertelement <4 x i16> undef, i16 %38, i32 0, !dbg !17 %43 = insertelement <4 x i16> %42, i16 %39, i32 1, !dbg !17 %44 = insertelement <4 x i16> %43, i16 %40, i32 2, !dbg !17 %45 = insertelement <4 x i16> %44, i16 %41, i32 3, !dbg !17 %46 = sext <4 x i16> %29 to <4 x i32>, !dbg !17 %47 = sext <4 x i16> %45 to <4 x i32>, !dbg !17 %48 = mul nsw <4 x i32> %47, %46, !dbg !22 %49 = sext <4 x i16> %21 to <4 x i32>, !dbg !17 %50 = sext <4 x i16> %37 to <4 x i32>, !dbg !17 %51 = mul nsw <4 x i32> %50, %49, !dbg !23 %52 = sub <4 x i32> %vec.phi, %51, !dbg !24 %53 = add <4 x i32> %52, %48, !dbg !25 %54 = mul nsw <4 x i32> %50, %46, !dbg !26 %55 = add <4 x i32> %54, %vec.phi72, !dbg !27 %56 = mul nsw <4 x i32> %47, %49, !dbg !28 %57 = add <4 x i32> %55, %56, !dbg !29 %index.next = add i16 %index, 4, !dbg !17 %58 = icmp eq i16 
%index.next, %n.vec, !dbg !17 br i1 %58, label %middle.block, label %vector.body, !dbg !17, !llvm.loop !30
But now we see this instead:
vector.body: ; preds = %vector.body, %vector.ph %index = phi i16 [ 0, %vector.ph ], [ %index.next, %vector.body ], !dbg !17 %vec.phi = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %50, %vector.body ] %vec.phi72 = phi <4 x i32> [ zeroinitializer, %vector.ph ], [ %54, %vector.body ] %3 = shl i16 %index, 1 %next.gep = getelementptr i16, i16* %add.ptr, i16 %3 %4 = or i16 %3, 2 %next.gep73 = getelementptr i16, i16* %add.ptr, i16 %4 %5 = or i16 %3, 4 %next.gep74 = getelementptr i16, i16* %add.ptr, i16 %5 %6 = or i16 %3, 6 %next.gep75 = getelementptr i16, i16* %add.ptr, i16 %6 %7 = mul i16 %index, -2 %next.gep76 = getelementptr i16, i16* %incdec.ptr.us, i16 %7 %8 = xor i16 %3, -2 %next.gep77 = getelementptr i16, i16* %incdec.ptr.us, i16 %8 %9 = add i16 %7, -4 %next.gep78 = getelementptr i16, i16* %incdec.ptr.us, i16 %9 %10 = add i16 %7, -6 %next.gep79 = getelementptr i16, i16* %incdec.ptr.us, i16 %10 %next.gep84 = getelementptr i16, i16* %h, i16 %3 %next.gep85 = getelementptr i16, i16* %h, i16 %4 %next.gep86 = getelementptr i16, i16* %h, i16 %5 %next.gep87 = getelementptr i16, i16* %h, i16 %6 %next.gep92 = getelementptr i16, i16* %x.addr.051.us, i16 %7 %next.gep93 = getelementptr i16, i16* %x.addr.051.us, i16 %8 %next.gep94 = getelementptr i16, i16* %x.addr.051.us, i16 %9 %next.gep95 = getelementptr i16, i16* %x.addr.051.us, i16 %10 %11 = load i16, i16* %next.gep, align 1, !dbg !11, !tbaa !18 %12 = load i16, i16* %next.gep73, align 1, !dbg !11, !tbaa !18 %13 = load i16, i16* %next.gep74, align 1, !dbg !11, !tbaa !18 %14 = load i16, i16* %next.gep75, align 1, !dbg !11, !tbaa !18 %15 = insertelement <4 x i16> undef, i16 %11, i32 0, !dbg !11 %16 = insertelement <4 x i16> %15, i16 %12, i32 1, !dbg !11 %17 = insertelement <4 x i16> %16, i16 %13, i32 2, !dbg !11 %18 = insertelement <4 x i16> %17, i16 %14, i32 3, !dbg !11 %19 = load i16, i16* %next.gep84, align 1, !dbg !11, !tbaa !18 %20 = load i16, i16* %next.gep85, align 1, !dbg !11, !tbaa !18 %21 = load i16, i16* 
%next.gep86, align 1, !dbg !11, !tbaa !18 %22 = load i16, i16* %next.gep87, align 1, !dbg !11, !tbaa !18 %23 = insertelement <4 x i16> undef, i16 %19, i32 0, !dbg !11 %24 = insertelement <4 x i16> %23, i16 %20, i32 1, !dbg !11 %25 = insertelement <4 x i16> %24, i16 %21, i32 2, !dbg !11 %26 = insertelement <4 x i16> %25, i16 %22, i32 3, !dbg !11 %27 = load i16, i16* %next.gep76, align 1, !dbg !11, !tbaa !18 %28 = load i16, i16* %next.gep77, align 1, !dbg !11, !tbaa !18 %29 = load i16, i16* %next.gep78, align 1, !dbg !11, !tbaa !18 %30 = load i16, i16* %next.gep79, align 1, !dbg !11, !tbaa !18 %31 = insertelement <4 x i16> undef, i16 %27, i32 0, !dbg !11 %32 = insertelement <4 x i16> %31, i16 %28, i32 1, !dbg !11 %33 = insertelement <4 x i16> %32, i16 %29, i32 2, !dbg !11 %34 = insertelement <4 x i16> %33, i16 %30, i32 3, !dbg !11 %35 = load i16, i16* %next.gep92, align 1, !dbg !11, !tbaa !18 %36 = load i16, i16* %next.gep93, align 1, !dbg !11, !tbaa !18 %37 = load i16, i16* %next.gep94, align 1, !dbg !11, !tbaa !18 %38 = load i16, i16* %next.gep95, align 1, !dbg !11, !tbaa !18 %39 = insertelement <4 x i16> undef, i16 %35, i32 0, !dbg !17 %40 = insertelement <4 x i16> %39, i16 %36, i32 1, !dbg !17 %41 = insertelement <4 x i16> %40, i16 %37, i32 2, !dbg !17 %42 = insertelement <4 x i16> %41, i16 %38, i32 3, !dbg !17 %43 = sext <4 x i16> %26 to <4 x i32>, !dbg !17 %44 = sext <4 x i16> %42 to <4 x i32>, !dbg !17 %45 = mul nsw <4 x i32> %44, %43, !dbg !22 %46 = sext <4 x i16> %18 to <4 x i32>, !dbg !17 %47 = sext <4 x i16> %34 to <4 x i32>, !dbg !17 %48 = mul nsw <4 x i32> %47, %46, !dbg !23 %49 = sub <4 x i32> %vec.phi, %48, !dbg !24 %50 = add <4 x i32> %49, %45, !dbg !25 %51 = mul nsw <4 x i32> %47, %43, !dbg !26 %52 = add <4 x i32> %51, %vec.phi72, !dbg !27 %53 = mul nsw <4 x i32> %44, %46, !dbg !28 %54 = add <4 x i32> %52, %53, !dbg !29 %index.next = add i16 %index, 4, !dbg !17 %55 = icmp eq i16 %index.next, %n.vec, !dbg !17 br i1 %55, label %middle.block, label 
%vector.body, !dbg !17, !llvm.loop !30
The old input to load-store-vectorizer resulted in two <8 x i16> loads, but the new input gives one <8 x i16> load, two <2 x i16> loads and one <4 x i16> load. So for some reason LSV fails to detect that the loads are consecutive given the new IR. At least that is my current theory.
And another example that we need something like Negator-driven hoisting.