diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -375,6 +375,7 @@
                                   bool InvertFalseVal = false);
   Value *getSelectCondition(Value *A, Value *B, bool ABIsTheSame);
 
+  Instruction *foldLShrOverflowBit(BinaryOperator &I);
   Instruction *foldExtractOfOverflowIntrinsic(ExtractValueInst &EV);
   Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
   Instruction *foldFPSignBitOps(BinaryOperator &I);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -839,6 +839,73 @@
   return nullptr;
 }
 
+// Tries to perform
+//    (lshr (add (zext X), (zext Y)), K)
+//  -> (icmp ult (add X, Y), X)
+//    where
+//  - The add's operands are zexts from a K-bit integer to a bigger type.
+//  - The add is only used by the shr, or by iK (or narrower) truncates.
+//  - The lshr type has more than 2 bits (other types are boolean math).
+//  - K > 1
+//    note that
+//  - The resulting add cannot have nuw/nsw, else on overflow we get a
+//    poison value and the transform isn't legal anymore.
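+//
+// An illustrative sketch (hypothetical IR, not taken from the tests below):
+//    %za = zext i16 %x to i32
+//    %zb = zext i16 %y to i32
+//    %a  = add i32 %za, %zb
+//    %r  = lshr i32 %a, 16
+// becomes
+//    %n  = add i16 %x, %y
+//    %o  = icmp ult i16 %n, %x
+//    %r  = zext i1 %o to i32
+// since bit 16 of the wide add is exactly the carry out of the i16 add.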
+Instruction *InstCombinerImpl::foldLShrOverflowBit(BinaryOperator &I) {
+  assert(I.getOpcode() == Instruction::LShr);
+
+  Value *Add = I.getOperand(0);
+  Value *ShiftAmt = I.getOperand(1);
+  Type *Ty = I.getType();
+
+  if (Ty->getScalarSizeInBits() < 3)
+    return nullptr;
+
+  const APInt *ShAmtAPInt = nullptr;
+  Value *X = nullptr, *Y = nullptr;
+  if (!match(ShiftAmt, m_APInt(ShAmtAPInt)) ||
+      !match(Add, m_Add(m_ZExt(m_Value(X)), m_ZExt(m_Value(Y)))))
+    return nullptr;
+
+  const unsigned ShAmt = ShAmtAPInt->getZExtValue();
+  if (ShAmt == 1)
+    return nullptr;
+
+  // X/Y are zexts from `ShAmt`-sized ints.
+  if (X->getType()->getScalarSizeInBits() != ShAmt ||
+      Y->getType()->getScalarSizeInBits() != ShAmt)
+    return nullptr;
+
+  // Make sure that `Add` is only used by `I` and `ShAmt`-truncates.
+  if (!Add->hasOneUse()) {
+    for (User *U : Add->users()) {
+      if (U == &I)
+        continue;
+
+      TruncInst *Trunc = dyn_cast<TruncInst>(U);
+      if (!Trunc || Trunc->getType()->getScalarSizeInBits() > ShAmt)
+        return nullptr;
+    }
+  }
+
+  // Insert at Add so that the newly created `NarrowAdd` will dominate its
+  // users (i.e. `Add`'s users).
+  Instruction *AddInst = cast<Instruction>(Add);
+  Builder.SetInsertPoint(AddInst);
+
+  Value *NarrowAdd = Builder.CreateAdd(X, Y, "add.narrowed");
+  Value *Overflow =
+      Builder.CreateICmpULT(NarrowAdd, X, "add.narrowed.overflow");
+
+  // Replace the uses of the original add with a zext of the
+  // NarrowAdd's result. Note that all users at this stage are known to
+  // be ShAmt-sized truncs, or the lshr itself.
+  if (!Add->hasOneUse())
+    replaceInstUsesWith(*AddInst, Builder.CreateZExt(NarrowAdd, Ty));
+
+  // Replace the LShr with a zext of the overflow check.
+  return new ZExtInst(Overflow, Ty);
+}
+
 Instruction *InstCombinerImpl::visitShl(BinaryOperator &I) {
   const SimplifyQuery Q = SQ.getWithInstruction(&I);
 
@@ -1327,6 +1394,9 @@
     return BinaryOperator::CreateAnd(Mask, X);
   }
 
+  if (Instruction *Overflow = foldLShrOverflowBit(I))
+    return Overflow;
+
   return nullptr;
 }
diff --git a/llvm/test/Transforms/InstCombine/lshr.ll b/llvm/test/Transforms/InstCombine/lshr.ll
--- a/llvm/test/Transforms/InstCombine/lshr.ll
+++ b/llvm/test/Transforms/InstCombine/lshr.ll
@@ -1045,10 +1045,9 @@
 
 define i4 @not_bool_add_lshr(i2 %a, i2 %b) {
 ; CHECK-LABEL: @not_bool_add_lshr(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i2 [[A:%.*]] to i4
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i2 [[B:%.*]] to i4
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i4 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i4 [[ADD]], 2
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i2 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i2 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i4
 ; CHECK-NEXT:    ret i4 [[LSHR]]
 ;
   %zext.a = zext i2 %a to i4
diff --git a/llvm/test/Transforms/InstCombine/shift-add.ll b/llvm/test/Transforms/InstCombine/shift-add.ll
--- a/llvm/test/Transforms/InstCombine/shift-add.ll
+++ b/llvm/test/Transforms/InstCombine/shift-add.ll
@@ -462,10 +462,9 @@
 
 define i32 @lshr_16_add_zext_basic(i16 %a, i16 %b) {
 ; CHECK-LABEL: @lshr_16_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i16 [[A:%.*]] to i32
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i16 [[B:%.*]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[ADD]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i32
 ; CHECK-NEXT:    ret i32 [[LSHR]]
 ;
   %zext.a = zext i16 %a to i32
@@ -507,10 +506,9 @@
 
 define i64 @lshr_32_add_zext_basic(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lshr_32_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i32 [[A:%.*]] to i64
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i32 [[B:%.*]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ADD]], 32
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64
 ; CHECK-NEXT:    ret i64 [[LSHR]]
 ;
   %zext.a = zext i32 %a to i64
@@ -548,10 +546,9 @@
 
 define i64 @lshr_16_to_64_add_zext_basic(i16 %a, i16 %b) {
 ; CHECK-LABEL: @lshr_16_to_64_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i16 [[A:%.*]] to i64
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i16 [[B:%.*]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ADD]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64
 ; CHECK-NEXT:    ret i64 [[LSHR]]
 ;
   %zext.a = zext i16 %a to i64
@@ -594,10 +591,9 @@
 
 define i32 @ashr_16_add_zext_basic(i16 %a, i16 %b) {
 ; CHECK-LABEL: @ashr_16_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i16 [[A:%.*]] to i32
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i16 [[B:%.*]] to i32
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i32 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i32 [[ADD]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i32
 ; CHECK-NEXT:    ret i32 [[LSHR]]
 ;
   %zext.a = zext i16 %a to i32
@@ -609,10 +605,9 @@
 
 define i64 @ashr_32_add_zext_basic(i32 %a, i32 %b) {
 ; CHECK-LABEL: @ashr_32_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i32 [[A:%.*]] to i64
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i32 [[B:%.*]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ADD]], 32
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i32 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64
 ; CHECK-NEXT:    ret i64 [[LSHR]]
 ;
   %zext.a = zext i32 %a to i64
@@ -624,10 +619,9 @@
 
 define i64 @ashr_16_to_64_add_zext_basic(i16 %a, i16 %b) {
 ; CHECK-LABEL: @ashr_16_to_64_add_zext_basic(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i16 [[A:%.*]] to i64
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i16 [[B:%.*]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[LSHR:%.*]] = lshr i64 [[ADD]], 16
+; CHECK-NEXT:    [[TMP1:%.*]] = xor i16 [[A:%.*]], -1
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i16 [[TMP1]], [[B:%.*]]
+; CHECK-NEXT:    [[LSHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64
 ; CHECK-NEXT:    ret i64 [[LSHR]]
 ;
   %zext.a = zext i16 %a to i64
@@ -639,13 +633,10 @@
 
 define i32 @lshr_32_add_zext_trunc(i32 %a, i32 %b) {
 ; CHECK-LABEL: @lshr_32_add_zext_trunc(
-; CHECK-NEXT:    [[ZEXT_A:%.*]] = zext i32 [[A:%.*]] to i64
-; CHECK-NEXT:    [[ZEXT_B:%.*]] = zext i32 [[B:%.*]] to i64
-; CHECK-NEXT:    [[ADD:%.*]] = add nuw nsw i64 [[ZEXT_A]], [[ZEXT_B]]
-; CHECK-NEXT:    [[TRUNC_ADD:%.*]] = trunc i64 [[ADD]] to i32
-; CHECK-NEXT:    [[SHR:%.*]] = lshr i64 [[ADD]], 32
-; CHECK-NEXT:    [[TRUNC_SHR:%.*]] = trunc i64 [[SHR]] to i32
-; CHECK-NEXT:    [[RET:%.*]] = add i32 [[TRUNC_ADD]], [[TRUNC_SHR]]
+; CHECK-NEXT:    [[ADD_NARROWED:%.*]] = add i32 [[A:%.*]], [[B:%.*]]
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[ADD_NARROWED]], [[A]]
+; CHECK-NEXT:    [[TRUNC_SHR:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i32
+; CHECK-NEXT:    [[RET:%.*]] = add i32 [[ADD_NARROWED]], [[TRUNC_SHR]]
 ; CHECK-NEXT:    ret i32 [[RET]]
 ;
   %zext.a = zext i32 %a to i64
@@ -661,29 +652,27 @@
 
 define <3 x i32> @add3_i96(<3 x i32> %0, <3 x i32> %1) {
 ; CHECK-LABEL: @add3_i96(
 ; CHECK-NEXT:    [[TMP3:%.*]] = extractelement <3 x i32> [[TMP0:%.*]], i64 0
-; CHECK-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <3 x i32> [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[TMP4:%.*]] = extractelement <3 x i32> [[TMP1:%.*]], i64 0
+; CHECK-NEXT:    [[ADD_NARROWED:%.*]] = add i32 [[TMP4]], [[TMP3]]
+; CHECK-NEXT:    [[ADD_NARROWED_OVERFLOW:%.*]] = icmp ult i32 [[ADD_NARROWED]], [[TMP4]]
+; CHECK-NEXT:    [[TMP5:%.*]] = extractelement <3 x i32> [[TMP0]], i64 1
 ; CHECK-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP5]] to i64
-; CHECK-NEXT:    [[TMP7:%.*]] = add nuw nsw i64 [[TMP6]], [[TMP4]]
-; CHECK-NEXT:    [[TMP8:%.*]] = extractelement <3 x i32> [[TMP0]], i64 1
-; CHECK-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP8]] to i64
-; CHECK-NEXT:    [[TMP10:%.*]] = extractelement <3 x i32> [[TMP1]], i64 1
-; CHECK-NEXT:    [[TMP11:%.*]] = zext i32 [[TMP10]] to i64
-; CHECK-NEXT:    [[TMP12:%.*]] = add nuw nsw i64 [[TMP11]], [[TMP9]]
-; CHECK-NEXT:    [[TMP13:%.*]] = lshr i64 [[TMP7]], 32
-; CHECK-NEXT:    [[TMP14:%.*]] = add nuw nsw i64 [[TMP12]], [[TMP13]]
-; CHECK-NEXT:    [[TMP15:%.*]] = extractelement <3 x i32> [[TMP0]], i64 2
-; CHECK-NEXT:    [[TMP16:%.*]] = extractelement <3 x i32> [[TMP1]], i64 2
-; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP16]], [[TMP15]]
-; CHECK-NEXT:    [[TMP18:%.*]] = lshr i64 [[TMP14]], 32
-; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
-; CHECK-NEXT:    [[TMP20:%.*]] = add i32 [[TMP17]], [[TMP19]]
-; CHECK-NEXT:    [[TMP21:%.*]] = trunc i64 [[TMP7]] to i32
-; CHECK-NEXT:    [[TMP22:%.*]] = insertelement <3 x i32> undef, i32 [[TMP21]], i64 0
-; CHECK-NEXT:    [[TMP23:%.*]] = trunc i64 [[TMP14]] to i32
-; CHECK-NEXT:    [[TMP24:%.*]] = insertelement <3 x i32> [[TMP22]], i32 [[TMP23]], i64 1
-; CHECK-NEXT:    [[TMP25:%.*]] = insertelement <3 x i32> [[TMP24]], i32 [[TMP20]], i64 2
-; CHECK-NEXT:    ret <3 x i32> [[TMP25]]
+; CHECK-NEXT:    [[TMP7:%.*]] = extractelement <3 x i32> [[TMP1]], i64 1
+; CHECK-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP7]] to i64
+; CHECK-NEXT:    [[TMP9:%.*]] = add nuw nsw i64 [[TMP8]], [[TMP6]]
+; CHECK-NEXT:    [[TMP10:%.*]] = zext i1 [[ADD_NARROWED_OVERFLOW]] to i64
+; CHECK-NEXT:    [[TMP11:%.*]] = add nuw nsw i64 [[TMP9]], [[TMP10]]
+; CHECK-NEXT:    [[TMP12:%.*]] = extractelement <3 x i32> [[TMP0]], i64 2
+; CHECK-NEXT:    [[TMP13:%.*]] = extractelement <3 x i32> [[TMP1]], i64 2
+; CHECK-NEXT:    [[TMP14:%.*]] = add i32 [[TMP13]], [[TMP12]]
+; CHECK-NEXT:    [[TMP15:%.*]] = lshr i64 [[TMP11]], 32
+; CHECK-NEXT:    [[TMP16:%.*]] = trunc i64 [[TMP15]] to i32
+; CHECK-NEXT:    [[TMP17:%.*]] = add i32 [[TMP14]], [[TMP16]]
+; CHECK-NEXT:    [[TMP18:%.*]] = insertelement <3 x i32> undef, i32 [[ADD_NARROWED]], i64 0
+; CHECK-NEXT:    [[TMP19:%.*]] = trunc i64 [[TMP11]] to i32
+; CHECK-NEXT:    [[TMP20:%.*]] = insertelement <3 x i32> [[TMP18]], i32 [[TMP19]], i64 1
+; CHECK-NEXT:    [[TMP21:%.*]] = insertelement <3 x i32> [[TMP20]], i32 [[TMP17]], i64 2
+; CHECK-NEXT:    ret <3 x i32> [[TMP21]]
 ;
   %3 = extractelement <3 x i32> %0, i64 0
   %4 = zext i32 %3 to i64