Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -252,6 +252,129 @@
   return true;
 }
 
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::And>
+m_LowPart(const LHS &L) {
+  return m_And(L, m_SpecificInt(0xffffffff));
+}
+
+template <typename LHS>
+inline BinaryOp_match<LHS, specific_intval, Instruction::LShr>
+m_HighPart(const LHS &L) {
+  return m_LShr(L, m_SpecificInt(32));
+}
+
+/// Matches the following pattern, which computes the full 128-bit product of
+/// the i64 values %x and %y as a pair of i64 halves (%lo, %hi):
+///
+/// %xl = and i64 %x, 4294967295
+/// %xh = lshr i64 %x, 32
+/// %yl = and i64 %y, 4294967295
+/// %yh = lshr i64 %y, 32
+///
+/// %t0 = mul nuw i64 %yl, %xl
+/// %t1 = mul nuw i64 %yl, %xh
+/// %t2 = mul nuw i64 %yh, %xl
+/// %t3 = mul nuw i64 %yh, %xh
+///
+/// %t0l = and i64 %t0, 4294967295
+/// %t0h = lshr i64 %t0, 32
+///
+/// %u0 = add i64 %t0h, %t1
+/// %u0l = and i64 %u0, 4294967295
+/// %u0h = lshr i64 %u0, 32
+///
+/// %u1 = add i64 %u0l, %t2
+/// %u1ls = shl i64 %u1, 32
+/// %u1h = lshr i64 %u1, 32
+///
+/// %u2 = add i64 %u0h, %t3
+///
+/// %lo = or i64 %u1ls, %t0l
+/// %hi = add i64 %u2, %u1h
+static bool foldMul(Instruction &I) {
+
+  Value *x = nullptr;
+  Value *y = nullptr;
+
+  Value *t0 = nullptr;
+  Value *t0a = nullptr;
+  Value *t1 = nullptr;
+  Value *t2 = nullptr;
+
+  Value *t3 = nullptr;
+  Value *u0 = nullptr;
+  Value *u0a = nullptr;
+
+  // Match lo:
+
+  // We first match the pattern down to the multiplications t0, t1, t2.
+  if (match(&I, m_c_Or(m_LowPart(m_Value(t0)),
+                       m_Shl(m_c_Add(m_LowPart(m_c_Add(m_HighPart(m_Value(t0a)),
+                                                       m_Value(t1))),
+                                     m_Value(t2)),
+                             m_SpecificInt(32))))) {
+
+    LLVM_DEBUG(dbgs() << "Lo found up to muls\n");
+    LLVM_DEBUG(dbgs() << *t0 << "\n");
+    LLVM_DEBUG(dbgs() << *t0a << "\n");
+    LLVM_DEBUG(dbgs() << *t1 << "\n");
+    LLVM_DEBUG(dbgs() << *t2 << "\n");
+
+    // 1. t0 is reachable by two edges; require both edges to see the same
+    //    value.
+    // TODO: It does not need to be the same value; we could instead duplicate
+    // the t0 match pattern.
+    // 2. Match t1 and remember its arguments. We start with t1 rather than t0
+    //    because t1 is asymmetric.
+    // 3. Require t2 to be a swapped version of t1.
+    // 4. Require t0 to have the same arguments as t1.
+    if (t0 == t0a &&
+        match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+        match(t2,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+        match(t0,
+              m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y))))) {
+      LLVM_DEBUG(dbgs() << "Lo muls are ok\n");
+
+      // The whole pattern can be replaced with a single multiplication.
+      // FIXME: We must also search for the umulh pattern and replace both
+      // with a single multiplication.
+      IRBuilder<> Builder(&I);
+      auto lo = Builder.CreateMul(x, y, "p.lo");
+      I.replaceAllUsesWith(lo);
+      return true;
+    }
+  }
+
+  // Match hi:
+  if (match(&I,
+            m_c_Add(m_HighPart(m_c_Add(m_LowPart(m_Value(u0)), m_Value(t2))),
+                    m_Add(m_HighPart(m_Value(u0a)), m_Value(t3))))) {
+    if (u0 == u0a && match(u0, m_c_Add(m_HighPart(m_Value(t0)), m_Value(t1)))) {
+      if (match(t1, m_c_Mul(m_HighPart(m_Value(x)), m_LowPart(m_Value(y)))) &&
+          match(t2,
+                m_c_Mul(m_LowPart(m_Specific(x)), m_HighPart(m_Specific(y)))) &&
+          match(t0,
+                m_c_Mul(m_LowPart(m_Specific(x)), m_LowPart(m_Specific(y)))) &&
+          match(t3, m_c_Mul(m_HighPart(m_Specific(x)),
+                            m_HighPart(m_Specific(y))))) {
+        LLVM_DEBUG(dbgs() << "Hi found!!!\n" << *x << "\n" << *y << "\n");
+
+        IRBuilder<> Builder(&I);
+        auto ex = Builder.CreateZExt(x, Builder.getInt128Ty());
+        auto ey = Builder.CreateZExt(y, Builder.getInt128Ty());
+        auto p = Builder.CreateNUWMul(ex, ey, "p");
+        auto hi =
+            Builder.CreateTrunc(Builder.CreateLShr(p, 64), I.getType(), "p.hi");
+        I.replaceAllUsesWith(hi);
+        return true;
+      }
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
@@ -270,6 +393,7 @@
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldMul(I);
     }
   }
Index: test/Transforms/AggressiveInstCombine/mul128.ll
===================================================================
--- /dev/null
+++ test/Transforms/AggressiveInstCombine/mul128.ll
@@ -0,0 +1,39 @@
+; RUN: opt < %s -aggressive-instcombine -S | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+; CHECK-LABEL: @mul_full_64(
+; CHECK-NOT: mul nuw i64
+; CHECK: mul nuw i128
+define { i64, i64 } @mul_full_64(i64 %x, i64 %y) {
+  %xl = and i64 %x, 4294967295
+  %xh = lshr i64 %x, 32
+  %yl = and i64 %y, 4294967295
+  %yh = lshr i64 %y, 32
+
+  %t0 = mul nuw i64 %yl, %xl
+  %t1 = mul nuw i64 %yl, %xh
+  %t2 = mul nuw i64 %yh, %xl
+  %t3 = mul nuw i64 %yh, %xh
+
+  %t0l = and i64 %t0, 4294967295
+  %t0h = lshr i64 %t0, 32
+
+  %u0 = add i64 %t0h, %t1
+  %u0l = and i64 %u0, 4294967295
+  %u0h = lshr i64 %u0, 32
+
+  %u1 = add i64 %u0l, %t2
+  %u1ls = shl i64 %u1, 32
+  %u1h = lshr i64 %u1, 32
+
+  %u2 = add i64 %u0h, %t3
+
+  %lo = or i64 %u1ls, %t0l
+  %hi = add i64 %u2, %u1h
+
+  %res_lo = insertvalue { i64, i64 } undef, i64 %lo, 0
+  %res = insertvalue { i64, i64 } %res_lo, i64 %hi, 1
+  ret { i64, i64 } %res
+}
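
For reference, here is a small standalone C++ sketch (not part of the patch) of the scalar computation the matcher is meant to recognize. The local names mirror the IR values from the pattern comment (xl/xh, t0..t3, u0..u2); the type U128, the function name mulFull64, the sample constants, and the driver in main are made up for illustration only.

#include <cstdint>
#include <cstdio>

struct U128 {
  uint64_t Lo, Hi;
};

// Full 64 x 64 -> 128-bit unsigned multiply built from four
// 32 x 32 -> 64-bit partial products, mirroring @mul_full_64 above.
static U128 mulFull64(uint64_t X, uint64_t Y) {
  uint64_t XL = X & 0xffffffff, XH = X >> 32;
  uint64_t YL = Y & 0xffffffff, YH = Y >> 32;

  // Partial products; each fits in 64 bits because both factors are < 2^32.
  uint64_t T0 = YL * XL;
  uint64_t T1 = YL * XH;
  uint64_t T2 = YH * XL;
  uint64_t T3 = YH * XH;

  uint64_t T0L = T0 & 0xffffffff, T0H = T0 >> 32;

  uint64_t U0 = T0H + T1; // cannot wrap: (2^32-1) + (2^32-1)^2 < 2^64
  uint64_t U0L = U0 & 0xffffffff, U0H = U0 >> 32;

  uint64_t U1 = U0L + T2; // cannot wrap, same bound as above
  uint64_t U2 = U0H + T3;

  return {(U1 << 32) | T0L, U2 + (U1 >> 32)};
}

int main() {
  U128 P = mulFull64(0xDEADBEEFCAFEBABEULL, 0x0123456789ABCDEFULL);
  std::printf("hi = %016llx  lo = %016llx\n",
              (unsigned long long)P.Hi, (unsigned long long)P.Lo);
  return 0;
}

Because every partial product is at most (2^32 - 1)^2 and every carry term is below 2^32, none of the intermediate 64-bit additions can overflow, which is why the all-i64 IR pattern is equivalent to the zero-extended i128 multiply the fold emits for the high half.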