Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -252,6 +252,161 @@
   return true;
 }
 
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::And>
+m_LowPart(const LHS &L) {
+  return m_And(L, m_SpecificInt(0xffffffff));
+}
+
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::LShr>
+m_HighPart(const LHS &L) {
+  return m_LShr(L, m_SpecificInt(32));
+}
+
+static Value *findOrCreateFullMul(Instruction &I, Value *x, Value *y,
+                                  DominatorTree &DT) {
+  // Try to find an already existing full-multiplication instruction.
+  // FIXME: Check the multiplication type.
+  for (const auto U : x->users()) {
+    LLVM_DEBUG(dbgs() << "User1 " << *U << "\n");
+    if (match(U, m_ZExt(m_Specific(x)))) {
+      LLVM_DEBUG(dbgs() << "ZExt found: " << *U << "\n");
+      for (const auto V : U->users()) {
+        if (match(V, m_c_Mul(m_Specific(U), m_ZExt(m_Specific(y))))) {
+          LLVM_DEBUG(dbgs() << "Mul found: " << *V << "\n");
+          return cast<Instruction>(V);
+        }
+      }
+    }
+  }
+
+  // Create the full multiplication instruction and place it just after its
+  // operands. This is the highest possible position, so the result is safe
+  // to use as a replacement for all patterns matched later.
+  // FIXME: All this placement logic probably doesn't even work.
+  Instruction *insertPoint = &I.getParent()->front();
+  auto xI = dyn_cast<Instruction>(x);
+  if (xI && DT.dominates(xI, insertPoint))
+    insertPoint = xI;
+  auto yI = dyn_cast<Instruction>(y);
+  if (yI && DT.dominates(yI, insertPoint))
+    insertPoint = yI;
+
+  LLVM_DEBUG(dbgs() << "Insert Point: " << *insertPoint << "\n");
+  IRBuilder<> Builder{insertPoint};
+  auto ex = Builder.CreateZExt(x, Builder.getInt128Ty());
+  auto ey = Builder.CreateZExt(y, Builder.getInt128Ty());
+  return cast<Instruction>(Builder.CreateNUWMul(ex, ey, "p"));
+}
+
+/// Matches the following pattern producing a full multiplication:
+///
+/// %xl = and i64 %x, 4294967295
+/// %xh = lshr i64 %x, 32
+/// %yl = and i64 %y, 4294967295
+/// %yh = lshr i64 %y, 32
+///
+/// %t0 = mul nuw i64 %yl, %xl
+/// %t1 = mul nuw i64 %yl, %xh
+/// %t2 = mul nuw i64 %yh, %xl
+/// %t3 = mul nuw i64 %yh, %xh
+///
+/// %t0l = and i64 %t0, 4294967295
+/// %t0h = lshr i64 %t0, 32
+///
+/// %u0 = add i64 %t0h, %t1
+/// %u0l = and i64 %u0, 4294967295
+/// %u0h = lshr i64 %u0, 32
+///
+/// %u1 = add i64 %u0l, %t2
+/// %u1ls = shl i64 %u1, 32
+/// %u1h = lshr i64 %u1, 32
+///
+/// %u2 = add i64 %u0h, %t3
+///
+/// %lo = or i64 %u1ls, %t0l
+/// %hi = add i64 %u2, %u1h
+static bool foldMul(Instruction &I, DominatorTree &DT) {
+  Value *x = nullptr;
+  Value *y = nullptr;
+
+  Value *t0 = nullptr;
+  Value *t1 = nullptr;
+  Value *t2 = nullptr;
+  Value *t3 = nullptr;
+
+  Value *u0 = nullptr;
+
+  // Match the low part of the full multiplication.
+  //
+  // First we match up to the multiplications t0, t1, t2.
+  // t0 is reachable by two edges and we _assume_ both lead to the same
+  // node; in general they do not have to.
+  if (match(&I,
+            m_c_Or(m_LowPart(m_Value(t0)),
+                   m_Shl(m_c_Add(m_LowPart(m_c_Add(m_HighPart(m_Deferred(t0)),
+                                                   m_Value(t1))),
+                                 m_Value(t2)),
+                         m_SpecificInt(32))))) {
+
+    LLVM_DEBUG(dbgs() << "Lo found up to muls\n");
+    LLVM_DEBUG(dbgs() << *t0 << "\n");
+    LLVM_DEBUG(dbgs() << *t1 << "\n");
+    LLVM_DEBUG(dbgs() << *t2 << "\n");
+
+    // 1. Match t1 and remember its arguments. We start with t1 because it
+    //    is asymmetric.
+    // 2. Require t2 to be a swapped version of t1.
+    // 3. Require t0 to have the same arguments as t1.
+    if (match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+        match(t2,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+        match(t0,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y))))) {
+      LLVM_DEBUG(dbgs() << "Lo muls are ok\n");
+
+      // The whole pattern can be replaced with a single multiplication.
+      auto mul = findOrCreateFullMul(I, x, y, DT);
+      IRBuilder<> Builder{&I};
+      auto lo = Builder.CreateTrunc(mul, I.getType(), "p.lo");
+      I.replaceAllUsesWith(lo);
+      return true;
+    }
+  }
+
+  // Match the high part of the full multiplication.
+  //
+  // First we match up to the multiplications t2 and t3 and the u0 node.
+  // Then check the u0 node itself.
+  // Finally, check all four multiplications, starting from the asymmetric
+  // ones, in the same way as when matching the low part.
+  if (match(&I,
+            m_c_Add(m_HighPart(m_c_Add(m_LowPart(m_Value(u0)), m_Value(t2))),
+                    m_Add(m_HighPart(m_Deferred(u0)), m_Value(t3)))) &&
+      match(u0, m_c_Add(m_HighPart(m_Value(t0)), m_Value(t1)))) {
+    if (match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+        match(t2,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+        match(t0,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y)))) &&
+        match(t3,
+              m_c_Mul(m_HighPart(m_Specific(x)), m_HighPart(m_Specific(y))))) {
+      LLVM_DEBUG(dbgs() << "Hi found (" << *x << ", " << *y << ")\n");
+
+      auto mul = findOrCreateFullMul(I, x, y, DT);
+      IRBuilder<> Builder{&I};
+      auto hi = Builder.CreateTrunc(Builder.CreateLShr(mul, 64), I.getType(),
+                                    "p.hi");
+      I.replaceAllUsesWith(hi);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
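For reference, the source-level idiom this fold targets is the classic schoolbook 64x64->128 multiplication built from four 32-bit partial products. The C++ sketch below is illustrative only (the helper name and out-parameters mirror the tests; they are not part of the patch); compiled with clang at -O1 or above it lowers to essentially the IR pattern documented above foldMul:

#include <cstdint>

// Schoolbook 64x64->128 multiplication via four 32-bit partial products.
// The temporaries mirror the names used in the IR pattern (t0..t3, u0..u2).
static void mul_full_64(uint64_t x, uint64_t y, uint64_t *lo, uint64_t *hi) {
  uint64_t xl = x & 0xffffffff; // low 32 bits of x
  uint64_t xh = x >> 32;        // high 32 bits of x
  uint64_t yl = y & 0xffffffff;
  uint64_t yh = y >> 32;

  uint64_t t0 = yl * xl; // partial products, each fits in 64 bits
  uint64_t t1 = yl * xh;
  uint64_t t2 = yh * xl;
  uint64_t t3 = yh * xh;

  uint64_t t0l = t0 & 0xffffffff;
  uint64_t t0h = t0 >> 32;

  uint64_t u0 = t0h + t1; // carry propagation; cannot overflow
  uint64_t u0l = u0 & 0xffffffff;
  uint64_t u0h = u0 >> 32;

  uint64_t u1 = u0l + t2;
  uint64_t u2 = u0h + t3;

  *lo = (u1 << 32) | t0l;  // %lo = or(%u1ls, %t0l)
  *hi = u2 + (u1 >> 32);   // %hi = add(%u2, %u1h)
}

Variants 1-3 in mul_full_64.ll are small re-associations of this same idiom, which is why foldMul matches the low-part and high-part trees independently rather than requiring one fixed overall shape.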
@@ -270,6 +425,7 @@
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldMul(I, DT);
     }
   }
Index: test/Transforms/AggressiveInstCombine/mul_full_32.ll
===================================================================
--- test/Transforms/AggressiveInstCombine/mul_full_32.ll
+++ test/Transforms/AggressiveInstCombine/mul_full_32.ll
@@ -6,27 +6,14 @@
 
 define { i64, i64 } @mul_full_64(i64 %x, i64 %y) {
 ; CHECK-LABEL: @mul_full_64(
-; CHECK-NEXT:    [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
-; CHECK-NEXT:    [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT:    [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
-; CHECK-NEXT:    [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT:    [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT:    [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT:    [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT:    [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT:    [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT:    [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT:    [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT:    [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT:    [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT:    [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT:    [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT:    [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT:    [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT:    [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT:    [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT:    [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT:    [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y:%.*]] to i128
+; CHECK-NEXT:    [[P:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[P_LO:%.*]] = trunc i128 [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[P]], 64
+; CHECK-NEXT:    [[P_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[P_LO]], 0
+; CHECK-NEXT:    [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[P_HI]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[RES]]
 ;
   %xl = and i64 %x, 4294967295
Index: test/Transforms/AggressiveInstCombine/mul_full_64.ll
===================================================================
--- test/Transforms/AggressiveInstCombine/mul_full_64.ll
+++ test/Transforms/AggressiveInstCombine/mul_full_64.ll
@@ -6,27 +6,14 @@
 
 define { i64, i64 } @mul_full_64_variant0(i64 %x, i64 %y) {
 ; CHECK-LABEL: @mul_full_64_variant0(
-; CHECK-NEXT:    [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
-; CHECK-NEXT:    [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT:    [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
-; CHECK-NEXT:    [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT:    [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT:    [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT:    [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT:    [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT:    [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT:    [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT:    [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT:    [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT:    [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT:    [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT:    [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT:    [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT:    [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT:    [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT:    [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT:    [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT:    [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[X:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[Y:%.*]] to i128
+; CHECK-NEXT:    [[P:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[P_LO:%.*]] = trunc i128 [[P]] to i64
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[P]], 64
+; CHECK-NEXT:    [[P_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[P_LO]], 0
+; CHECK-NEXT:    [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[P_HI]], 1
 ; CHECK-NEXT:    ret { i64, i64 } [[RES]]
 ;
   %xl = and i64 %x, 4294967295
@@ -89,23 +76,12 @@
 
 define i64 @mul_full_64_variant1(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant1(
-; CHECK-NEXT:    [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
-; CHECK-NEXT:    [[SHR_I43:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT:    [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
-; CHECK-NEXT:    [[SHR_I41:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i64 [[SHR_I41]], [[SHR_I43]]
-; CHECK-NEXT:    [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I43]]
-; CHECK-NEXT:    [[MUL6:%.*]] = mul nuw i64 [[SHR_I41]], [[CONV]]
-; CHECK-NEXT:    [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT:    [[SHR_I40:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[SHR_I40]], [[MUL5]]
-; CHECK-NEXT:    [[SHR_I39:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT:    [[ADD10:%.*]] = add i64 [[SHR_I39]], [[MUL]]
-; CHECK-NEXT:    [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT:    [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT:    [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I]]
-; CHECK-NEXT:    store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT:    [[P:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[P]], 64
+; CHECK-NEXT:    [[P_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[P_HI]], i64* [[RHI:%.*]], align 8
 ; CHECK-NEXT:    [[MUL18:%.*]] = mul i64 [[B]], [[A]]
 ; CHECK-NEXT:    ret i64 [[MUL18]]
 ;
@@ -132,27 +108,14 @@
 
 define i64 @mul_full_64_variant2(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant2(
-; CHECK-NEXT:    [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
-; CHECK-NEXT:    [[SHR_I58:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT:    [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
-; CHECK-NEXT:    [[SHR_I56:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i64 [[SHR_I56]], [[SHR_I58]]
-; CHECK-NEXT:    [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I58]]
-; CHECK-NEXT:    [[MUL6:%.*]] = mul nuw i64 [[SHR_I56]], [[CONV]]
-; CHECK-NEXT:    [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT:    [[SHR_I55:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[SHR_I55]], [[MUL5]]
-; CHECK-NEXT:    [[SHR_I54:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT:    [[ADD10:%.*]] = add i64 [[SHR_I54]], [[MUL]]
-; CHECK-NEXT:    [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT:    [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT:    [[SHR_I51:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT:    [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I51]]
-; CHECK-NEXT:    store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
-; CHECK-NEXT:    [[CONV24:%.*]] = shl i64 [[ADD15]], 32
-; CHECK-NEXT:    [[CONV26:%.*]] = and i64 [[MUL7]], 4294967295
-; CHECK-NEXT:    [[ADD27:%.*]] = or i64 [[CONV24]], [[CONV26]]
-; CHECK-NEXT:    ret i64 [[ADD27]]
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT:    [[P:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[P]], 64
+; CHECK-NEXT:    [[P_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[P_HI]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT:    [[P_LO:%.*]] = trunc i128 [[P]] to i64
+; CHECK-NEXT:    ret i64 [[P_LO]]
 ;
   %conv = and i64 %a, 4294967295
   %shr.i58 = lshr i64 %a, 32
@@ -179,23 +142,19 @@
 
 define i64 @mul_full_64_variant3(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant3(
-; CHECK-NEXT:    [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
+; CHECK-NEXT:    [[TMP1:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT:    [[TMP2:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT:    [[P:%.*]] = mul nuw i128 [[TMP1]], [[TMP2]]
+; CHECK-NEXT:    [[CONV:%.*]] = and i64 [[A]], 4294967295
 ; CHECK-NEXT:    [[SHR_I45:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT:    [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
+; CHECK-NEXT:    [[CONV3:%.*]] = and i64 [[B]], 4294967295
 ; CHECK-NEXT:    [[SHR_I43:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT:    [[MUL:%.*]] = mul nuw i64 [[SHR_I43]], [[SHR_I45]]
 ; CHECK-NEXT:    [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I45]]
 ; CHECK-NEXT:    [[MUL6:%.*]] = mul nuw i64 [[SHR_I43]], [[CONV]]
 ; CHECK-NEXT:    [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT:    [[SHR_I42:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT:    [[ADD:%.*]] = add i64 [[SHR_I42]], [[MUL5]]
-; CHECK-NEXT:    [[SHR_I41:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT:    [[ADD10:%.*]] = add i64 [[SHR_I41]], [[MUL]]
-; CHECK-NEXT:    [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT:    [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT:    [[SHR_I:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT:    [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I]]
-; CHECK-NEXT:    store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT:    [[TMP3:%.*]] = lshr i128 [[P]], 64
+; CHECK-NEXT:    [[P_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT:    store i64 [[P_HI]], i64* [[RHI:%.*]], align 8
 ; CHECK-NEXT:    [[ADD18:%.*]] = add i64 [[MUL6]], [[MUL5]]
 ; CHECK-NEXT:    [[SHL:%.*]] = shl i64 [[ADD18]], 32
 ; CHECK-NEXT:    [[ADD19:%.*]] = add i64 [[SHL]], [[MUL7]]
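Note: the RUN lines of these two test files are not shown in this excerpt. Assuming the usual setup for this directory (an assumption, not visible here), the fold can be exercised locally with something like

  opt -aggressive-instcombine -S < test/Transforms/AggressiveInstCombine/mul_full_64.ll | FileCheck test/Transforms/AggressiveInstCombine/mul_full_64.ll

and the CHECK blocks above have the shape produced by utils/update_test_checks.py.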