Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -252,6 +252,129 @@
   return true;
 }
 
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::And>
+m_LowPart(const LHS &L) {
+  return m_And(L, m_SpecificInt(0xffffffff));
+}
+
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::LShr>
+m_HighPart(const LHS &L) {
+  return m_LShr(L, m_SpecificInt(32));
+}
+
+/// Matches the following pattern, which computes the full 128-bit product of
+/// the i64 values %x and %y as a pair of i64 halves (%lo, %hi):
+///
+/// %xl = and i64 %x, 4294967295
+/// %xh = lshr i64 %x, 32
+/// %yl = and i64 %y, 4294967295
+/// %yh = lshr i64 %y, 32
+///
+/// %t0 = mul nuw i64 %yl, %xl
+/// %t1 = mul nuw i64 %yl, %xh
+/// %t2 = mul nuw i64 %yh, %xl
+/// %t3 = mul nuw i64 %yh, %xh
+///
+/// %t0l = and i64 %t0, 4294967295
+/// %t0h = lshr i64 %t0, 32
+///
+/// %u0 = add i64 %t0h, %t1
+/// %u0l = and i64 %u0, 4294967295
+/// %u0h = lshr i64 %u0, 32
+///
+/// %u1 = add i64 %u0l, %t2
+/// %u1ls = shl i64 %u1, 32
+/// %u1h = lshr i64 %u1, 32
+///
+/// %u2 = add i64 %u0h, %t3
+///
+/// %lo = or i64 %u1ls, %t0l
+/// %hi = add i64 %u2, %u1h
+static bool foldMul(Instruction &I) {
+
+  Value *x = nullptr;
+  Value *y = nullptr;
+
+  Value *t0 = nullptr;
+  Value *t0a = nullptr;
+  Value *t1 = nullptr;
+  Value *t2 = nullptr;
+
+  Value *t3 = nullptr;
+  Value *u0 = nullptr;
+  Value *u0a = nullptr;
+
+  // Match lo:
+
+  // We first match the pattern down to the multiplications t0, t1, t2.
+  if (match(&I, m_c_Or(m_LowPart(m_Value(t0)),
+                       m_Shl(m_c_Add(m_LowPart(m_c_Add(m_HighPart(m_Value(t0a)),
+                                                       m_Value(t1))),
+                                     m_Value(t2)),
+                             m_SpecificInt(32))))) {
+
+    LLVM_DEBUG(dbgs() << "Lo found up to muls\n");
+    LLVM_DEBUG(dbgs() << *t0 << "\n");
+    LLVM_DEBUG(dbgs() << *t0a << "\n");
+    LLVM_DEBUG(dbgs() << *t1 << "\n");
+    LLVM_DEBUG(dbgs() << *t2 << "\n");
+
+    // 1. t0 is reachable by two edges; require both edges to see the same
+    //    value.
+    // TODO: It does not need to be the same value; we could instead duplicate
+    // the t0 match pattern.
+    // 2. Match t1 and remember its arguments. We start with t1 rather than t0
+    //    because t1 is asymmetric.
+    // 3. Require t2 to be a swapped version of t1.
+    // 4. Require t0 to have the same arguments as t1.
+    if (t0 == t0a &&
+        match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+        match(t2,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+        match(t0,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y))))) {
+      LLVM_DEBUG(dbgs() << "Lo muls are ok\n");
+
+      // The whole pattern can be replaced with a single multiplication.
+      // FIXME: We must also search for the umulh pattern and replace both
+      // with a single multiplication.
+      IRBuilder<> Builder(&I);
+      auto lo = Builder.CreateMul(x, y, "p.lo");
+      I.replaceAllUsesWith(lo);
+      return true;
+    }
+  }
+
+  // Match hi:
+  if (match(&I,
+            m_c_Add(m_HighPart(m_c_Add(m_LowPart(m_Value(u0)), m_Value(t2))),
+                    m_Add(m_HighPart(m_Value(u0a)), m_Value(t3))))) {
+    if (u0 == u0a && match(u0, m_c_Add(m_HighPart(m_Value(t0)), m_Value(t1)))) {
+      if (match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+          match(t2,
+                m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+          match(t0,
+                m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y)))) &&
+          match(t3, m_c_Mul(m_HighPart(m_Specific(x)),
+                            m_HighPart(m_Specific(y))))) {
+        LLVM_DEBUG(dbgs() << "Hi found!!!\n" << *x << "\n" << *y << "\n");
+
+        IRBuilder<> Builder(&I);
+        auto ex = Builder.CreateZExt(x, Builder.getInt128Ty());
+        auto ey = Builder.CreateZExt(y, Builder.getInt128Ty());
+        auto p = Builder.CreateNUWMul(ex, ey, "p");
+        auto hi =
+            Builder.CreateTrunc(Builder.CreateLShr(p, 64), I.getType(), "p.hi");
+        I.replaceAllUsesWith(hi);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -270,6 +393,7 @@
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldMul(I);
     }
   }
Index: test/Transforms/AggressiveInstCombine/mul128.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/mul128.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @mul_full_64(
+; CHECK-NOT: mul nuw i64
+; CHECK: mul nuw i128
+define { i64, i64 } @mul_full_64(i64 %x, i64 %y) {
+  %xl = and i64 %x, 4294967295
+  %xh = lshr i64 %x, 32
+  %yl = and i64 %y, 4294967295
+  %yh = lshr i64 %y, 32
+
+  %t0 = mul nuw i64 %yl, %xl
+  %t1 = mul nuw i64 %yl, %xh
+  %t2 = mul nuw i64 %yh, %xl
+  %t3 = mul nuw i64 %yh, %xh
+
+  %t0l = and i64 %t0, 4294967295
+  %t0h = lshr i64 %t0, 32
+
+  %u0 = add i64 %t0h, %t1
+  %u0l = and i64 %u0, 4294967295
+  %u0h = lshr i64 %u0, 32
+
+  %u1 = add i64 %u0l, %t2
+  %u1ls = shl i64 %u1, 32
+  %u1h = lshr i64 %u1, 32
+
+  %u2 = add i64 %u0h, %t3
+
+  %lo = or i64 %u1ls, %t0l
+  %hi = add i64 %u2, %u1h
+
+  %res_lo = insertvalue { i64, i64 } undef, i64 %lo, 0
+  %res = insertvalue { i64, i64 } %res_lo, i64 %hi, 1
+  ret { i64, i64 } %res
+}
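
For reference, here is a small standalone C++ sketch (not part of the patch) of the scalar computation the matcher is meant to recognize. The local names mirror the IR values from the pattern comment (xl/xh, t0..t3, u0..u2); the type U128, the function name mulFull64, the sample constants, and the driver in main are made up for illustration only.

#include <cstdint>
#include <cstdio>

struct U128 {
  uint64_t Lo, Hi;
};

// Full 64 x 64 -> 128-bit unsigned multiply built from four
// 32 x 32 -> 64-bit partial products, mirroring @mul_full_64 above.
static U128 mulFull64(uint64_t X, uint64_t Y) {
  uint64_t XL = X & 0xffffffff, XH = X >> 32;
  uint64_t YL = Y & 0xffffffff, YH = Y >> 32;

  // Partial products; each fits in 64 bits because both factors are < 2^32.
  uint64_t T0 = YL * XL;
  uint64_t T1 = YL * XH;
  uint64_t T2 = YH * XL;
  uint64_t T3 = YH * XH;

  uint64_t T0L = T0 & 0xffffffff, T0H = T0 >> 32;

  uint64_t U0 = T0H + T1; // cannot wrap: (2^32-1) + (2^32-1)^2 < 2^64
  uint64_t U0L = U0 & 0xffffffff, U0H = U0 >> 32;

  uint64_t U1 = U0L + T2; // cannot wrap, same bound as above
  uint64_t U2 = U0H + T3;

  return {(U1 << 32) | T0L, U2 + (U1 >> 32)};
}

int main() {
  U128 P = mulFull64(0xDEADBEEFCAFEBABEULL, 0x0123456789ABCDEFULL);
  std::printf("hi = %016llx  lo = %016llx\n",
              (unsigned long long)P.Hi, (unsigned long long)P.Lo);
  return 0;
}

Because every partial product is at most (2^32 - 1)^2 and every carry term is below 2^32, none of the intermediate 64-bit additions can overflow, which is why the all-i64 IR pattern is equivalent to the zero-extended i128 multiply the fold emits for the high half.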