Index: llvm/lib/Support/KnownBits.cpp
===================================================================
--- llvm/lib/Support/KnownBits.cpp
+++ llvm/lib/Support/KnownBits.cpp
@@ -420,18 +420,19 @@
   assert((!SelfMultiply || (LHS.One == RHS.One && LHS.Zero == RHS.Zero)) &&
          "Self multiplication knownbits mismatch");
 
-  // Compute a conservative estimate for high known-0 bits.
+  // Compute the high known-0 bits by multiplying the unsigned max of each side.
+  // Conservatively, M active bits * N active bits results in M + N bits in the
+  // result. But if we know a value is a power-of-2 for example, then this
+  // computes one more leading zero.
   // TODO: This could be generalized to number of sign bits (negative numbers).
-  unsigned LHSLeadZ = LHS.countMinLeadingZeros();
-  unsigned RHSLeadZ = RHS.countMinLeadingZeros();
-
-  // If either operand is a power-of-2, the multiply is only shifting bits in
-  // the other operand (there can't be a carry into the M+N bit of the result).
-  // Note: if we know that a value is entirely 0, that should simplify below.
-  bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1;
-
-  unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth;
-  assert(LeadZ <= BitWidth && "More zeros than bits?");
+  APInt UMaxLHS = LHS.getMaxValue();
+  APInt UMaxRHS = RHS.getMaxValue();
+
+  // For leading zeros in the result to be valid, the unsigned max product must
+  // fit in the bitwidth (it must not overflow).
+  bool HasOverflow;
+  APInt UMaxResult = UMaxLHS.umul_ov(UMaxRHS, HasOverflow);
+  unsigned LeadZ = HasOverflow ? 0 : UMaxResult.countLeadingZeros();
 
   // The result of the bottom bits of an integer multiply can be
   // inferred by looking at the bottom bits of both operands and
Index: llvm/test/CodeGen/X86/mul128.ll
===================================================================
--- llvm/test/CodeGen/X86/mul128.ll
+++ llvm/test/CodeGen/X86/mul128.ll
@@ -107,15 +107,12 @@
 define void @PR13897() nounwind {
 ; X64-LABEL: PR13897:
 ; X64:       # %bb.0: # %"0x0"
-; X64-NEXT:    movl bbb(%rip), %ecx
-; X64-NEXT:    movabsq $4294967297, %rdx # imm = 0x100000001
-; X64-NEXT:    movq %rcx, %rax
-; X64-NEXT:    mulq %rdx
-; X64-NEXT:    addq %rcx, %rdx
+; X64-NEXT:    movl bbb(%rip), %eax
+; X64-NEXT:    movq %rax, %rcx
 ; X64-NEXT:    shlq $32, %rcx
-; X64-NEXT:    addq %rcx, %rdx
-; X64-NEXT:    movq %rax, aaa(%rip)
-; X64-NEXT:    movq %rdx, aaa+8(%rip)
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movq %rcx, aaa+8(%rip)
+; X64-NEXT:    movq %rcx, aaa(%rip)
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: PR13897:
Index: llvm/test/Transforms/InstCombine/icmp-mul.ll
===================================================================
--- llvm/test/Transforms/InstCombine/icmp-mul.ll
+++ llvm/test/Transforms/InstCombine/icmp-mul.ll
@@ -858,12 +858,11 @@
   ret i1 %r
 }
 
+; The top 32-bits must be zero.
+
 define i1 @splat_mul_known_lz(i32 %x) {
 ; CHECK-LABEL: @splat_mul_known_lz(
-; CHECK-NEXT:    [[Z:%.*]] = zext i32 [[X:%.*]] to i128
-; CHECK-NEXT:    [[M:%.*]] = mul nuw nsw i128 [[Z]], 18446744078004518913
-; CHECK-NEXT:    [[R:%.*]] = icmp ult i128 [[M]], 79228162514264337593543950336
-; CHECK-NEXT:    ret i1 [[R]]
+; CHECK-NEXT:    ret i1 true
 ;
   %z = zext i32 %x to i128
   %m = mul i128 %z, 18446744078004518913 ; 0x00000000_00000001_00000001_00000001
@@ -872,6 +871,8 @@
   ret i1 %r
 }
 
+; Negative test - the 33rd bit could be set.
+
 define i1 @splat_mul_unknown_lz(i32 %x) {
 ; CHECK-LABEL: @splat_mul_unknown_lz(
 ; CHECK-NEXT:    [[Z:%.*]] = zext i32 [[X:%.*]] to i128
Index: llvm/test/Transforms/InstCombine/narrow-switch.ll
===================================================================
--- llvm/test/Transforms/InstCombine/narrow-switch.ll
+++ llvm/test/Transforms/InstCombine/narrow-switch.ll
@@ -99,14 +99,14 @@
 ; Make sure to avoid assertion crashes and use the type before
 ; truncation to generate the sub constant expressions that leads
 ; to the recomputed condition.
-; We allow to truncate from i64 to i59 if in 32-bit mode,
+; We allow truncate from i64 to i58 if in 32-bit mode,
 ; because both are illegal.
 
-define void @trunc64to59(i64 %a) {
-; ALL-LABEL: @trunc64to59(
-; CHECK32: switch i59
-; CHECK32-NEXT: i59 0, label %sw.bb1
-; CHECK32-NEXT: i59 18717182647723699, label %sw.bb2
+define void @trunc64to58(i64 %a) {
+; ALL-LABEL: @trunc64to58(
+; CHECK32: switch i58
+; CHECK32-NEXT: i58 0, label %sw.bb1
+; CHECK32-NEXT: i58 18717182647723699, label %sw.bb2
 ; CHECK32-NEXT: ]
 ; CHECK64: switch i64
 ; CHECK64-NEXT: i64 0, label %sw.bb1
Index: llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
===================================================================
--- llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
+++ llvm/test/Transforms/PhaseOrdering/X86/pixel-splat.ll
@@ -40,21 +40,19 @@
 ; CHECK-NEXT:    [[WIDE_LOAD4:%.*]] = load <4 x i8>, <4 x i8>* [[TMP3]], align 1
 ; CHECK-NEXT:    [[TMP4:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i32>
 ; CHECK-NEXT:    [[TMP5:%.*]] = zext <4 x i8> [[WIDE_LOAD4]] to <4 x i32>
-; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], <i32 65792, i32 65792, i32 65792, i32 65792>
-; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], <i32 65792, i32 65792, i32 65792, i32 65792>
-; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP4]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
-; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP5]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
-; CHECK-NEXT:    [[TMP10:%.*]] = add nsw <4 x i32> [[TMP8]], [[TMP6]]
-; CHECK-NEXT:    [[TMP11:%.*]] = add nsw <4 x i32> [[TMP9]], [[TMP7]]
-; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[POUT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP6:%.*]] = mul nuw nsw <4 x i32> [[TMP4]], <i32 65793, i32 65793, i32 65793, i32 65793>
+; CHECK-NEXT:    [[TMP7:%.*]] = mul nuw nsw <4 x i32> [[TMP5]], <i32 65793, i32 65793, i32 65793, i32 65793>
+; CHECK-NEXT:    [[TMP8:%.*]] = or <4 x i32> [[TMP6]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+; CHECK-NEXT:    [[TMP9:%.*]] = or <4 x i32> [[TMP7]], <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+; CHECK-NEXT:    [[TMP10:%.*]] = getelementptr inbounds i32, i32* [[POUT:%.*]], i64 [[INDEX]]
+; CHECK-NEXT:    [[TMP11:%.*]] = bitcast i32* [[TMP10]] to <4 x i32>*
+; CHECK-NEXT:    store <4 x i32> [[TMP8]], <4 x i32>* [[TMP11]], align 4
+; CHECK-NEXT:    [[TMP12:%.*]] = getelementptr inbounds i32, i32* [[TMP10]], i64 4
 ; CHECK-NEXT:    [[TMP13:%.*]] = bitcast i32* [[TMP12]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP10]], <4 x i32>* [[TMP13]], align 4
-; CHECK-NEXT:    [[TMP14:%.*]] = getelementptr inbounds i32, i32* [[TMP12]], i64 4
-; CHECK-NEXT:    [[TMP15:%.*]] = bitcast i32* [[TMP14]] to <4 x i32>*
-; CHECK-NEXT:    store <4 x i32> [[TMP11]], <4 x i32>* [[TMP15]], align 4
+; CHECK-NEXT:    store <4 x i32> [[TMP9]], <4 x i32>* [[TMP13]], align 4
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT:    br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEXT:    [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT:    br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
 ; CHECK:       middle.block:
 ; CHECK-NEXT:    [[CMP_N:%.*]] = icmp eq i64 [[N_VEC]], [[WIDE_TRIP_COUNT]]
 ; CHECK-NEXT:    br i1 [[CMP_N]], label [[FOR_END]], label [[FOR_BODY_PREHEADER5]]
@@ -64,11 +62,10 @@
 ; CHECK:       for.body:
 ; CHECK-NEXT:    [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY_PREHEADER5]] ]
 ; CHECK-NEXT:    [[ARRAYIDX:%.*]] = getelementptr inbounds i8, i8* [[PIN]], i64 [[INDVARS_IV]]
-; CHECK-NEXT:    [[TMP17:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
-; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP17]] to i32
-; CHECK-NEXT:    [[REASS_MUL:%.*]] = mul nuw nsw i32 [[CONV]], 65792
-; CHECK-NEXT:    [[OR2:%.*]] = or i32 [[CONV]], -16777216
-; CHECK-NEXT:    [[OR3:%.*]] = add nsw i32 [[OR2]], [[REASS_MUL]]
+; CHECK-NEXT:    [[TMP15:%.*]] = load i8, i8* [[ARRAYIDX]], align 1
+; CHECK-NEXT:    [[CONV:%.*]] = zext i8 [[TMP15]] to i32
+; CHECK-NEXT:    [[OR2:%.*]] = mul nuw nsw i32 [[CONV]], 65793
+; CHECK-NEXT:    [[OR3:%.*]] = or i32 [[OR2]], -16777216
 ; CHECK-NEXT:    [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, i32* [[POUT]], i64 [[INDVARS_IV]]
 ; CHECK-NEXT:    store i32 [[OR3]], i32* [[ARRAYIDX5]], align 4
 ; CHECK-NEXT:    [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1
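
Reviewer note (illustration only, not part of the patch): the sketch below replays the @splat_mul_known_lz case with plain integers to show why the exact max-product computation proves one more leading zero than the old estimate. The unsigned __int128 type and the countLeadingZeros128 helper stand in for APInt and are assumptions of this example, not LLVM APIs.

// Standalone sketch of the new leading-zero logic in KnownBits::mul, using
// the GCC/Clang unsigned __int128 extension instead of APInt. For
// illustration only; names and helpers here are hypothetical.
#include <cassert>
#include <cstdio>

static unsigned countLeadingZeros128(unsigned __int128 V) {
  // Scan from the top bit down, counting zeros until the first set bit.
  unsigned N = 0;
  for (int Bit = 127; Bit >= 0 && !((V >> Bit) & 1); --Bit)
    ++N;
  return N;
}

int main() {
  // Mirror @splat_mul_known_lz: a zext'd i32 has unsigned max 0xFFFFFFFF.
  unsigned __int128 UMaxLHS = 0xFFFFFFFFu;
  // The splat constant 0x00000000_00000001_00000001_00000001.
  unsigned __int128 UMaxRHS =
      ((unsigned __int128)1 << 64) | ((unsigned __int128)1 << 32) | 1;

  // The product of the unsigned maxima fits in 128 bits (32 + 65 active
  // bits), so its leading zeros are a valid bound for any in-range product.
  unsigned __int128 UMaxResult = UMaxLHS * UMaxRHS;
  unsigned LeadZ = countLeadingZeros128(UMaxResult);

  // UMaxResult is exactly 2^96 - 1, so 32 leading zeros are known. The old
  // code only proved 31 (96 + 63 known leading zeros minus the 128-bit
  // width), which was not enough to fold the icmp against 2^96 above.
  std::printf("known leading zeros: %u\n", LeadZ);
  assert(LeadZ == 32);
  return 0;
}

Compile with clang++ or g++; unsigned __int128 is a compiler extension, not standard C++.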