Index: lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
===================================================================
--- lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -251,10 +251,227 @@
   return true;
 }
 
+/// Finds the first instruction after both A and B.
+/// A and B are assumed to be either Instruction or Argument.
+static Instruction *getInstructionAfter(Value *A, Value *B,
+                                        DominatorTree &DT) {
+  // TODO: Is there a better way to achieve this?
+  Instruction *I = nullptr;
+
+  if (auto *AI = dyn_cast<Instruction>(A))
+    I = AI->getNextNode();
+  else // If A is an Argument, use the first instruction of the entry block.
+    I = &cast<Argument>(A)->getParent()->front().front();
+
+  auto *BI = dyn_cast<Instruction>(B);
+  if (BI && DT.dominates(I, BI))
+    I = BI->getNextNode(); // After B.
+
+  return I;
+}
+
+/// Tries to find the full multiplication instruction pattern:
+/// mul(zext(X), zext(Y)).
+static Value *findFullMul(Value *X, Value *Y) {
+  auto *FullTy = IntegerType::get(X->getContext(),
+                                  X->getType()->getPrimitiveSizeInBits() * 2);
+  for (auto *U : X->users()) {
+    if (U->getType() == FullTy && match(U, m_ZExt(m_Specific(X)))) {
+      for (auto *V : U->users()) {
+        if (match(V, m_c_Mul(m_Specific(U), m_ZExt(m_Specific(Y)))))
+          return V;
+      }
+    }
+  }
+  return nullptr;
+}
+
+/// Tries to find the instruction mul(X, Y).
+static Value *findLowMul(Value *X, Value *Y) {
+  for (auto *U : X->users()) {
+    if (match(U, m_c_Mul(m_Specific(X), m_Specific(Y))))
+      return U;
+  }
+  return nullptr;
+}
+
+/// Tries to find a mul with X, Y as arguments. Creates a new one if not found.
+static Value *findOrCreateLowMul(Instruction &I, Value *X, Value *Y,
+                                 DominatorTree &DT) {
+  if (auto *Mul = findLowMul(X, Y))
+    return Mul;
+
+  if (auto *FullMul = findFullMul(X, Y)) {
+    IRBuilder<> Builder{&I};
+    return Builder.CreateTrunc(FullMul, X->getType(), "fullmul.lo");
+  }
+
+  // Create the low multiplication instruction and place it just after its
+  // operands. This position is the highest possible, so the result can
+  // safely replace any pattern matched later.
+  IRBuilder<> Builder{getInstructionAfter(X, Y, DT)};
+  return Builder.CreateMul(X, Y, "mul");
+}
+
+/// Tries to find the full mul with X, Y as arguments. If not found, creates
+/// a new one. Also replaces a matching low mul, if present.
+static Value *findOrCreateFullMul(Value *X, Value *Y, DominatorTree &DT) {
+  if (auto *Mul = findFullMul(X, Y))
+    return Mul;
+
+  auto *MulTy = IntegerType::get(X->getContext(),
+                                 X->getType()->getPrimitiveSizeInBits() * 2);
+  IRBuilder<> Builder{getInstructionAfter(X, Y, DT)};
+  auto *FullMul = Builder.CreateNUWMul(
+      Builder.CreateZExt(X, MulTy, {"fullmul.", X->getName()}),
+      Builder.CreateZExt(Y, MulTy, {"fullmul.", Y->getName()}), "fullmul");
+
+  // If a low mul with the same operands exists, replace it with a truncation
+  // of the full mul as well.
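+  // A sketch of that rewrite (operand names and the i64/i128 widths are
+  // hypothetical, assuming 64-bit X and Y): an existing
+  //   %lowmul = mul i64 %x, %y
+  // becomes
+  //   %fullmul.lo = trunc i128 %fullmul to i64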
+  if (auto *LowMul = findLowMul(X, Y)) {
+    auto *FullMulLo =
+        Builder.CreateTrunc(FullMul, LowMul->getType(), "fullmul.lo");
+    LowMul->replaceAllUsesWith(FullMulLo);
+  }
+
+  return FullMul;
+}
+
+/// Matches the following pattern producing a full multiplication:
+///
+/// %xl = and i64 %x, 4294967295
+/// %xh = lshr i64 %x, 32
+/// %yl = and i64 %y, 4294967295
+/// %yh = lshr i64 %y, 32
+///
+/// %t0 = mul nuw i64 %yl, %xl
+/// %t1 = mul nuw i64 %yl, %xh
+/// %t2 = mul nuw i64 %yh, %xl
+/// %t3 = mul nuw i64 %yh, %xh
+///
+/// %t0l = and i64 %t0, 4294967295
+/// %t0h = lshr i64 %t0, 32
+///
+/// %u0 = add i64 %t0h, %t1
+/// %u0l = and i64 %u0, 4294967295
+/// %u0h = lshr i64 %u0, 32
+///
+/// %u1 = add i64 %u0l, %t2
+/// %u1ls = shl i64 %u1, 32
+/// %u1h = lshr i64 %u1, 32
+///
+/// %u2 = add i64 %u0h, %t3
+///
+/// %lo = or i64 %u1ls, %t0l
+/// %hi = add i64 %u2, %u1h
+///
+static bool foldFullMul(Instruction &I, const DataLayout &DL,
+                        DominatorTree &DT) {
+  // We limit this to 128 bits so that the low-part mask is at most 64 bits
+  // wide (a m_SpecificInt() matcher limitation).
+  static constexpr unsigned maxSizeInBits = 128;
+
+  auto *ty = I.getType();
+  if (!ty->isIntegerTy())
+    return false;
+
+  // Check the integer type size.
+  // Also make sure the size in bits is even so the low/high split is trivial.
+  const auto sizeInBits = ty->getPrimitiveSizeInBits();
+  if (sizeInBits > maxSizeInBits || sizeInBits % 2 != 0)
+    return false;
+
+  // Skip integers wider than the largest native integer type.
+  if (sizeInBits > DL.getLargestLegalIntTypeSizeInBits())
+    return false;
+
+  const auto halfSizeInBits = sizeInBits / 2; // Max 64.
+  const auto Half = m_SpecificInt(halfSizeInBits);
+  const auto lowMask =
+      m_SpecificInt(~uint64_t{0} >> ((maxSizeInBits / 2) - halfSizeInBits));
+
+  Value *x = nullptr;
+  Value *y = nullptr;
+  Value *t0 = nullptr;
+  Value *t1 = nullptr;
+  Value *t2 = nullptr;
+  Value *t3 = nullptr;
+  Value *u0 = nullptr;
+
+  // Match the low part of the full multiplication.
+  //
+  // First we match up to the multiplications t0, t1, t2.
+  // t0 is reachable by two edges and we _assume_ it is the same node;
+  // in general it does not have to be.
+  //
+  // The long pattern is: ((t2 + lo(t1 + hi(t0))) << Half) | lo(t0).
+  bool LowLongPattern =
+      match(&I, m_c_Or(m_And(m_Value(t0), lowMask),
+                       m_Shl(m_c_Add(m_And(m_c_Add(m_LShr(m_Deferred(t0), Half),
+                                                   m_Value(t1)),
+                                           lowMask),
+                                     m_Value(t2)),
+                             Half)));
+
+  // The short pattern is: ((t2 + t1) << Half) + t0.
+  bool LowShortPattern = match(
+      &I, m_c_Add(m_Value(t0), m_Shl(m_c_Add(m_Value(t1), m_Value(t2)), Half)));
+
+  if (LowLongPattern || LowShortPattern) {
+    // 1. Match t1 and remember its arguments. We start with t1 because its
+    //    pattern is asymmetric.
+    // 2. Require t2 to be a swapped version of t1.
+    // 3. Require t0 to have the same arguments as t1.
+    if (match(t1,
+              m_c_Mul(m_LShr(m_Value(x), Half), m_And(m_Value(y), lowMask))) &&
+        match(t2, m_c_Mul(m_And(m_Specific(x), lowMask),
+                          m_LShr(m_Specific(y), Half))) &&
+        match(t0, m_c_Mul(m_And(m_Specific(x), lowMask),
+                          m_And(m_Specific(y), lowMask)))) {
+      // Replace with a single multiplication.
+      auto *Mul = findOrCreateLowMul(I, x, y, DT);
+      IRBuilder<> Builder{&I};
+      auto *Low = Builder.CreateTrunc(Mul, ty, "fullmul.lo");
+      I.replaceAllUsesWith(Low);
+      return true;
+    }
+  }
+
+  // Match the high part of the full multiplication.
+  //
+  // First we match up to the multiplications t2 and t3 and the u0 node.
+  // Then check the u0 node.
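+  // (u0 must itself be the first partial sum hi(t0) + t1, matched below.)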
+  // Finally, check all 4 multiplications, starting from the asymmetric
+  // ones, the same way as when matching the low part.
+  if (match(&I,
+            m_c_Add(
+                m_LShr(m_c_Add(m_And(m_Value(u0), lowMask), m_Value(t2)), Half),
+                m_c_Add(m_LShr(m_Deferred(u0), Half), m_Value(t3)))) &&
+      match(u0, m_c_Add(m_LShr(m_Value(t0), Half), m_Value(t1)))) {
+    if (match(t1,
+              m_c_Mul(m_LShr(m_Value(x), Half), m_And(m_Value(y), lowMask))) &&
+        match(t2, m_c_Mul(m_And(m_Specific(x), lowMask),
+                          m_LShr(m_Specific(y), Half))) &&
+        match(t0, m_c_Mul(m_And(m_Specific(x), lowMask),
+                          m_And(m_Specific(y), lowMask))) &&
+        match(t3, m_c_Mul(m_LShr(m_Specific(x), Half),
+                          m_LShr(m_Specific(y), Half)))) {
+      auto *Mul = findOrCreateFullMul(x, y, DT);
+      IRBuilder<> Builder{&I};
+      // The high part is the upper half of the double-width product, so
+      // shift by the full source width, not by halfSizeInBits.
+      auto *Hi = Builder.CreateTrunc(Builder.CreateLShr(Mul, sizeInBits), ty,
+                                     "fullmul.hi");
+      I.replaceAllUsesWith(Hi);
+      return true;
+    }
+  }
+
+  return false;
+}
+
 /// This is the entry point for folds that could be implemented in regular
 /// InstCombine, but they are separated because they are not expected to
 /// occur frequently and/or have more than a constant-length pattern match.
-static bool foldUnusualPatterns(Function &F, DominatorTree &DT) {
+static bool foldUnusualPatterns(Function &F, const DataLayout &DL,
+                                DominatorTree &DT) {
   bool MadeChange = false;
   for (BasicBlock &BB : F) {
     // Ignore unreachable basic blocks.
@@ -269,6 +486,7 @@
     for (Instruction &I : make_range(BB.rbegin(), BB.rend())) {
       MadeChange |= foldAnyOrAllBitsSet(I);
       MadeChange |= foldGuardedRotateToFunnelShift(I);
+      MadeChange |= foldFullMul(I, DL, DT);
     }
   }
 
@@ -287,7 +505,7 @@
   const DataLayout &DL = F.getParent()->getDataLayout();
   TruncInstCombine TIC(TLI, DL, DT);
   MadeChange |= TIC.run(F);
-  MadeChange |= foldUnusualPatterns(F, DT);
+  MadeChange |= foldUnusualPatterns(F, DL, DT);
 
   return MadeChange;
 }
Index: test/Transforms/AggressiveInstCombine/mul_full_32.ll
===================================================================
--- test/Transforms/AggressiveInstCombine/mul_full_32.ll
+++ test/Transforms/AggressiveInstCombine/mul_full_32.ll
@@ -4,6 +4,8 @@
 target datalayout = "e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128"
 target triple = "i386-unknown-linux-gnu"
 
+; This one should not be optimized. We don't want to produce a mul i128 when
+; the widest native integer is 32 bits.
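+; (foldFullMul skips this case because the source width, 64, exceeds
+; DL.getLargestLegalIntTypeSizeInBits(), which is 32 for this datalayout.)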
 define { i64, i64 } @mul_full_64(i64 %x, i64 %y) {
 ; CHECK-LABEL: @mul_full_64(
 ; CHECK-NEXT: [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
 ; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
 ; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
 ; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
@@ -62,27 +64,14 @@
 define { i32, i32 } @mul_full_32(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_full_32(
-; CHECK-NEXT: [[XL:%.*]] = and i32 [[X:%.*]], 65535
-; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y:%.*]], 65535
-; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[YH]], [[XL]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[YH]], [[XH]]
-; CHECK-NEXT: [[T0L:%.*]] = and i32 [[T0]], 65535
-; CHECK-NEXT: [[T0H:%.*]] = lshr i32 [[T0]], 16
-; CHECK-NEXT: [[U0:%.*]] = add i32 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i32 [[U0]], 65535
-; CHECK-NEXT: [[U0H:%.*]] = lshr i32 [[U0]], 16
-; CHECK-NEXT: [[U1:%.*]] = add i32 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i32 [[U1]], 16
-; CHECK-NEXT: [[U1H:%.*]] = lshr i32 [[U1]], 16
-; CHECK-NEXT: [[U2:%.*]] = add i32 [[U0H]], [[T3]]
-; CHECK-NEXT: [[LO:%.*]] = or i32 [[U1LS]], [[T0L]]
-; CHECK-NEXT: [[HI:%.*]] = add i32 [[U2]], [[U1H]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i32, i32 } undef, i32 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i32, i32 } [[RES_LO]], i32 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_X:%.*]] = zext i32 [[X:%.*]] to i64
+; CHECK-NEXT: [[FULLMUL_Y:%.*]] = zext i32 [[Y:%.*]] to i64
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i64 [[FULLMUL_X]], [[FULLMUL_Y]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i64 [[FULLMUL]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[FULLMUL]], 32
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i32, i32 } undef, i32 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i32, i32 } [[RES_LO]], i32 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i32, i32 } [[RES]]
 ;
   %xl = and i32 %x, 65535
Index: test/Transforms/AggressiveInstCombine/mul_full_64.ll
===================================================================
--- test/Transforms/AggressiveInstCombine/mul_full_64.ll
+++ test/Transforms/AggressiveInstCombine/mul_full_64.ll
@@ -6,27 +6,14 @@
 define { i64, i64 } @mul_full_64_variant0(i64 %x, i64 %y) {
 ; CHECK-LABEL: @mul_full_64_variant0(
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT: [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_X:%.*]] = zext i64 [[X:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL_Y:%.*]] = zext i64 [[Y:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_X]], [[FULLMUL_Y]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i64, i64 } [[RES]]
 ;
   %xl = and i64 %x, 4294967295
@@ -89,25 +76,14 @@
 define i64 @mul_full_64_variant1(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant1(
-; CHECK-NEXT: [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I43:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT: [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I41:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i64 [[SHR_I41]], [[SHR_I43]]
-; CHECK-NEXT: [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I43]]
-; CHECK-NEXT: [[MUL6:%.*]] = mul nuw i64 [[SHR_I41]], [[CONV]]
-; CHECK-NEXT: [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT: [[SHR_I40:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[SHR_I40]], [[MUL5]]
-; CHECK-NEXT: [[SHR_I39:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT: [[ADD10:%.*]] = add i64 [[SHR_I39]], [[MUL]]
-; CHECK-NEXT: [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT: [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT: [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I]]
-; CHECK-NEXT: store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
-; CHECK-NEXT: [[MULLO:%.*]] = mul i64 [[B]], [[A]]
-; CHECK-NEXT: ret i64 [[MULLO]]
+; CHECK-NEXT: [[FULLMUL_A:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL_B:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_A]], [[FULLMUL_B]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: store i64 [[FULLMUL_HI]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT: ret i64 [[FULLMUL_LO]]
 ;
   %conv = and i64 %a, 4294967295
   %shr.i43 = lshr i64 %a, 32
@@ -132,27 +108,14 @@
 define i64 @mul_full_64_variant2(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant2(
-; CHECK-NEXT: [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I58:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT: [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I56:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i64 [[SHR_I56]], [[SHR_I58]]
-; CHECK-NEXT: [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I58]]
-; CHECK-NEXT: [[MUL6:%.*]] = mul nuw i64 [[SHR_I56]], [[CONV]]
-; CHECK-NEXT: [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT: [[SHR_I55:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[SHR_I55]], [[MUL5]]
-; CHECK-NEXT: [[SHR_I54:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT: [[ADD10:%.*]] = add i64 [[SHR_I54]], [[MUL]]
-; CHECK-NEXT: [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT: [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT: [[SHR_I51:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT: [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I51]]
-; CHECK-NEXT: store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
-; CHECK-NEXT: [[CONV24:%.*]] = shl i64 [[ADD15]], 32
-; CHECK-NEXT: [[CONV26:%.*]] = and i64 [[MUL7]], 4294967295
-; CHECK-NEXT: [[ADD27:%.*]] = or i64 [[CONV24]], [[CONV26]]
-; CHECK-NEXT: ret i64 [[ADD27]]
+; CHECK-NEXT: [[FULLMUL_A:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL_B:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_A]], [[FULLMUL_B]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: store i64 [[FULLMUL_HI]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT: ret i64 [[FULLMUL_LO]]
 ;
   %conv = and i64 %a, 4294967295
   %shr.i58 = lshr i64 %a, 32
@@ -179,27 +142,14 @@
 define i64 @mul_full_64_variant3(i64 %a, i64 %b, i64* nocapture %rhi) {
 ; CHECK-LABEL: @mul_full_64_variant3(
-; CHECK-NEXT: [[CONV:%.*]] = and i64 [[A:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I45:%.*]] = lshr i64 [[A]], 32
-; CHECK-NEXT: [[CONV3:%.*]] = and i64 [[B:%.*]], 4294967295
-; CHECK-NEXT: [[SHR_I43:%.*]] = lshr i64 [[B]], 32
-; CHECK-NEXT: [[MUL:%.*]] = mul nuw i64 [[SHR_I43]], [[SHR_I45]]
-; CHECK-NEXT: [[MUL5:%.*]] = mul nuw i64 [[CONV3]], [[SHR_I45]]
-; CHECK-NEXT: [[MUL6:%.*]] = mul nuw i64 [[SHR_I43]], [[CONV]]
-; CHECK-NEXT: [[MUL7:%.*]] = mul nuw i64 [[CONV3]], [[CONV]]
-; CHECK-NEXT: [[SHR_I42:%.*]] = lshr i64 [[MUL7]], 32
-; CHECK-NEXT: [[ADD:%.*]] = add i64 [[SHR_I42]], [[MUL5]]
-; CHECK-NEXT: [[SHR_I41:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT: [[ADD10:%.*]] = add i64 [[SHR_I41]], [[MUL]]
-; CHECK-NEXT: [[CONV14:%.*]] = and i64 [[ADD]], 4294967295
-; CHECK-NEXT: [[ADD15:%.*]] = add i64 [[CONV14]], [[MUL6]]
-; CHECK-NEXT: [[SHR_I:%.*]] = lshr i64 [[ADD15]], 32
-; CHECK-NEXT: [[ADD17:%.*]] = add i64 [[ADD10]], [[SHR_I]]
-; CHECK-NEXT: store i64 [[ADD17]], i64* [[RHI:%.*]], align 8
-; CHECK-NEXT: [[ADD18:%.*]] = add i64 [[MUL6]], [[MUL5]]
-; CHECK-NEXT: [[SHL:%.*]] = shl i64 [[ADD18]], 32
-; CHECK-NEXT: [[ADD19:%.*]] = add i64 [[SHL]], [[MUL7]]
-; CHECK-NEXT: ret i64 [[ADD19]]
+; CHECK-NEXT: [[FULLMUL_A:%.*]] = zext i64 [[A:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL_B:%.*]] = zext i64 [[B:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_A]], [[FULLMUL_B]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: store i64 [[FULLMUL_HI]], i64* [[RHI:%.*]], align 8
+; CHECK-NEXT: ret i64 [[FULLMUL_LO]]
 ;
   %conv = and i64 %a, 4294967295
   %shr.i45 = lshr i64 %a, 32
@@ -227,27 +177,14 @@
 define { i32, i32 } @mul_full_32(i32 %x, i32 %y) {
 ; CHECK-LABEL: @mul_full_32(
-; CHECK-NEXT: [[XL:%.*]] = and i32 [[X:%.*]], 65535
-; CHECK-NEXT: [[XH:%.*]] = lshr i32 [[X]], 16
-; CHECK-NEXT: [[YL:%.*]] = and i32 [[Y:%.*]], 65535
-; CHECK-NEXT: [[YH:%.*]] = lshr i32 [[Y]], 16
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i32 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i32 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i32 [[YH]], [[XL]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i32 [[YH]], [[XH]]
-; CHECK-NEXT: [[T0L:%.*]] = and i32 [[T0]], 65535
-; CHECK-NEXT: [[T0H:%.*]] = lshr i32 [[T0]], 16
-; CHECK-NEXT: [[U0:%.*]] = add i32 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i32 [[U0]], 65535
-; CHECK-NEXT: [[U0H:%.*]] = lshr i32 [[U0]], 16
-; CHECK-NEXT: [[U1:%.*]] = add i32 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i32 [[U1]], 16
-; CHECK-NEXT: [[U1H:%.*]] = lshr i32 [[U1]], 16
-; CHECK-NEXT: [[U2:%.*]] = add i32 [[U0H]], [[T3]]
-; CHECK-NEXT: [[LO:%.*]] = or i32 [[U1LS]], [[T0L]]
-; CHECK-NEXT: [[HI:%.*]] = add i32 [[U2]], [[U1H]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i32, i32 } undef, i32 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i32, i32 } [[RES_LO]], i32 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_X:%.*]] = zext i32 [[X:%.*]] to i64
+; CHECK-NEXT: [[FULLMUL_Y:%.*]] = zext i32 [[Y:%.*]] to i64
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i64 [[FULLMUL_X]], [[FULLMUL_Y]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i64 [[FULLMUL]] to i32
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[FULLMUL]], 32
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i64 [[TMP1]] to i32
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i32, i32 } undef, i32 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i32, i32 } [[RES_LO]], i32 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i32, i32 } [[RES]]
 ;
   %xl = and i32 %x, 65535
@@ -291,28 +228,15 @@
 define { i64, i64 } @mul_full_64_variant0_1() {
 ; CHECK-LABEL: @mul_full_64_variant0_1(
 ; CHECK-NEXT: [[TMP1:%.*]] = call i64 @get_number()
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[TMP1]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[TMP1]], 32
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @get_number()
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[TMP2]], 32
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[TMP2]], 4294967295
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT: [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_:%.*]] = zext i64 [[TMP2]] to i128
+; CHECK-NEXT: [[FULLMUL_1:%.*]] = zext i64 [[TMP1]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_]], [[FULLMUL_1]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP3:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP3]] to i64
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i64, i64 } [[RES]]
 ;
   %1 = call i64 @get_number()
@@ -350,27 +274,14 @@
 ; CHECK-LABEL: @mul_full_64_variant0_2(
 ; CHECK-NEXT: [[X:%.*]] = call i64 @get_number()
 ; CHECK-NEXT: [[Y:%.*]] = call i64 @get_number()
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[XH]], [[YH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[XL]], [[YH]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[XH]], [[YL]]
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[XL]], [[YL]]
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T1]], [[T0H]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[T2]], [[U0L]]
-; CHECK-NEXT: [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT: [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[HI:%.*]] = add i64 [[U1H]], [[U2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[T0L]], [[U1LS]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_X:%.*]] = zext i64 [[X]] to i128
+; CHECK-NEXT: [[FULLMUL_Y:%.*]] = zext i64 [[Y]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_X]], [[FULLMUL_Y]]
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[FULLMUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i64, i64 } [[RES]]
 ;
   %x = call i64 @get_number()
@@ -407,23 +318,12 @@
 define i64 @umulh_64(i64 %x, i64 %y) {
 ; CHECK-LABEL: @umulh_64(
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT: [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT: ret i64 [[HI]]
+; CHECK-NEXT: [[FULLMUL_X:%.*]] = zext i64 [[X:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL_Y:%.*]] = zext i64 [[Y:%.*]] to i128
+; CHECK-NEXT: [[FULLMUL:%.*]] = mul nuw i128 [[FULLMUL_X]], [[FULLMUL_Y]]
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[FULLMUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: ret i64 [[FULLMUL_HI]]
 ;
   %xl = and i64 %x, 4294967295
   %xh = lshr i64 %x, 32
@@ -453,21 +353,8 @@
 define i64 @mullo(i64 %x, i64 %y) {
 ; CHECK-LABEL: @mullo(
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X:%.*]], 4294967295
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y:%.*]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT: ret i64 [[LO]]
+; CHECK-NEXT: [[MUL:%.*]] = mul i64 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: ret i64 [[MUL]]
 ;
   %xl = and i64 %x, 4294967295
   %xh = lshr i64 %x, 32
@@ -499,21 +386,7 @@
 ; CHECK-LABEL: @mullo_duplicate(
 ; CHECK-NEXT: [[DUPLICATED_MUL:%.*]] = mul i64 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT: call void @eat_i64(i64 [[DUPLICATED_MUL]])
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT: ret i64 [[LO]]
+; CHECK-NEXT: ret i64 [[DUPLICATED_MUL]]
 ;
   %duplicated_mul = mul i64 %x, %y
   call void @eat_i64(i64 %duplicated_mul)
@@ -546,27 +419,11 @@
 ; CHECK-NEXT: [[YY:%.*]] = zext i64 [[Y:%.*]] to i128
 ; CHECK-NEXT: [[DUPLICATED_MUL:%.*]] = mul i128 [[XX]], [[YY]]
 ; CHECK-NEXT: call void @eat_i128(i128 [[DUPLICATED_MUL]])
-; CHECK-NEXT: [[XL:%.*]] = and i64 [[X]], 4294967295
-; CHECK-NEXT: [[XH:%.*]] = lshr i64 [[X]], 32
-; CHECK-NEXT: [[YL:%.*]] = and i64 [[Y]], 4294967295
-; CHECK-NEXT: [[YH:%.*]] = lshr i64 [[Y]], 32
-; CHECK-NEXT: [[T0:%.*]] = mul nuw i64 [[YL]], [[XL]]
-; CHECK-NEXT: [[T1:%.*]] = mul nuw i64 [[YL]], [[XH]]
-; CHECK-NEXT: [[T2:%.*]] = mul nuw i64 [[YH]], [[XL]]
-; CHECK-NEXT: [[T3:%.*]] = mul nuw i64 [[YH]], [[XH]]
-; CHECK-NEXT: [[T0L:%.*]] = and i64 [[T0]], 4294967295
-; CHECK-NEXT: [[T0H:%.*]] = lshr i64 [[T0]], 32
-; CHECK-NEXT: [[U0:%.*]] = add i64 [[T0H]], [[T1]]
-; CHECK-NEXT: [[U0L:%.*]] = and i64 [[U0]], 4294967295
-; CHECK-NEXT: [[U0H:%.*]] = lshr i64 [[U0]], 32
-; CHECK-NEXT: [[U1:%.*]] = add i64 [[U0L]], [[T2]]
-; CHECK-NEXT: [[U1LS:%.*]] = shl i64 [[U1]], 32
-; CHECK-NEXT: [[U1H:%.*]] = lshr i64 [[U1]], 32
-; CHECK-NEXT: [[U2:%.*]] = add i64 [[U0H]], [[T3]]
-; CHECK-NEXT: [[LO:%.*]] = or i64 [[U1LS]], [[T0L]]
-; CHECK-NEXT: [[HI:%.*]] = add i64 [[U2]], [[U1H]]
-; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[LO]], 0
-; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[HI]], 1
+; CHECK-NEXT: [[FULLMUL_LO:%.*]] = trunc i128 [[DUPLICATED_MUL]] to i64
+; CHECK-NEXT: [[TMP1:%.*]] = lshr i128 [[DUPLICATED_MUL]], 64
+; CHECK-NEXT: [[FULLMUL_HI:%.*]] = trunc i128 [[TMP1]] to i64
+; CHECK-NEXT: [[RES_LO:%.*]] = insertvalue { i64, i64 } undef, i64 [[FULLMUL_LO]], 0
+; CHECK-NEXT: [[RES:%.*]] = insertvalue { i64, i64 } [[RES_LO]], i64 [[FULLMUL_HI]], 1
 ; CHECK-NEXT: ret { i64, i64 } [[RES]]
 ;
   %xx = zext i64 %x to i128