diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp --- a/llvm/lib/Support/KnownBits.cpp +++ b/llvm/lib/Support/KnownBits.cpp @@ -421,9 +421,16 @@ "Self multiplication knownbits mismatch"); // Compute a conservative estimate for high known-0 bits. + // TODO: This could be generalized to number of sign bits (negative numbers). unsigned LHSLeadZ = LHS.countMinLeadingZeros(); unsigned RHSLeadZ = RHS.countMinLeadingZeros(); - unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ, BitWidth) - BitWidth; + + // If either operand is a power-of-2, the multiply is only shifting bits in + // the other operand (there can't be a carry into the M+N bit of the result). + // Note: if we know that a value is entirely 0, that should simplify below. + bool BonusLZ = LHS.countMaxPopulation() == 1 || RHS.countMaxPopulation() == 1; + + unsigned LeadZ = std::max(LHSLeadZ + RHSLeadZ + BonusLZ, BitWidth) - BitWidth; assert(LeadZ <= BitWidth && "More zeros than bits?"); // The result of the bottom bits of an integer multiply can be diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -1528,7 +1528,6 @@ ; GCN-NEXT: v_mac_f32_e32 v3, 0x4f800000, v4 ; GCN-NEXT: v_rcp_f32_e32 v3, v3 ; GCN-NEXT: v_mov_b32_e32 v12, 0 -; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v3, 0x5f7ffffc, v3 ; GCN-NEXT: v_mul_f32_e32 v4, 0x2f800000, v3 ; GCN-NEXT: v_trunc_f32_e32 v4, v4 @@ -1578,18 +1577,14 @@ ; GCN-NEXT: v_add_i32_e32 v5, vcc, v5, v6 ; GCN-NEXT: v_addc_u32_e32 v6, vcc, 0, v7, vcc ; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v5 -; GCN-NEXT: v_addc_u32_e32 v4, vcc, v4, v6, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v5, 17, v4 -; GCN-NEXT: v_lshlrev_b32_e32 v4, 15, v4 +; GCN-NEXT: v_addc_u32_e32 v3, vcc, v4, v6, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v3, 17, v3 -; GCN-NEXT: v_add_i32_e32 v3, vcc, v3, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, 0, v5, vcc ; GCN-NEXT: v_mul_lo_u32 v4, v1, v3 ; GCN-NEXT: v_mul_hi_u32 v5, v0, v3 ; GCN-NEXT: v_add_i32_e32 v4, vcc, v5, v4 ; GCN-NEXT: v_mul_lo_u32 v5, v0, v3 ; GCN-NEXT: v_sub_i32_e32 v6, vcc, 0, v4 -; GCN-NEXT: v_sub_i32_e32 v5, vcc, s4, v5 +; GCN-NEXT: v_sub_i32_e32 v5, vcc, 0x8000, v5 ; GCN-NEXT: v_subb_u32_e64 v6, s[4:5], v6, v1, vcc ; GCN-NEXT: v_sub_i32_e64 v7, s[4:5], v5, v0 ; GCN-NEXT: v_subbrev_u32_e64 v6, s[4:5], 0, v6, s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -1701,7 +1701,6 @@ ; GCN-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3 ; GCN-NEXT: v_rcp_f32_e32 v2, v2 ; GCN-NEXT: v_mov_b32_e32 v11, 0 -; GCN-NEXT: s_mov_b32 s4, 0x8000 ; GCN-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2 ; GCN-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2 ; GCN-NEXT: v_trunc_f32_e32 v3, v3 @@ -1751,18 +1750,14 @@ ; GCN-NEXT: v_add_i32_e32 v4, vcc, v4, v5 ; GCN-NEXT: v_addc_u32_e32 v5, vcc, 0, v6, vcc ; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v4 -; GCN-NEXT: v_addc_u32_e32 v3, vcc, v3, v5, vcc -; GCN-NEXT: v_lshrrev_b32_e32 v4, 17, v3 -; GCN-NEXT: v_lshlrev_b32_e32 v3, 15, v3 +; GCN-NEXT: v_addc_u32_e32 v2, vcc, v3, v5, vcc ; GCN-NEXT: v_lshrrev_b32_e32 v2, 17, v2 -; GCN-NEXT: v_add_i32_e32 v2, vcc, v2, v3 -; GCN-NEXT: v_addc_u32_e32 v2, vcc, 0, v4, vcc ; GCN-NEXT: v_mul_lo_u32 v3, v1, v2 ; GCN-NEXT: v_mul_hi_u32 v4, v0, v2 ; GCN-NEXT: v_mul_lo_u32 v2, v0, v2 ; GCN-NEXT: v_add_i32_e32 v3, vcc, v4, v3 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, 0, v3 -; GCN-NEXT: v_sub_i32_e32 v2, vcc, s4, v2 +; GCN-NEXT: v_sub_i32_e32 v2, vcc, 0x8000, v2 ; GCN-NEXT: v_subb_u32_e64 v4, s[4:5], v4, v1, vcc ; GCN-NEXT: v_sub_i32_e64 v5, s[4:5], v2, v0 ; GCN-NEXT: v_subbrev_u32_e64 v6, s[6:7], 0, v4, s[4:5] diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll --- a/llvm/test/Transforms/InstCombine/icmp-mul.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll @@ -684,11 +684,7 @@ define i1 @mul_of_bool(i32 %x, i8 %y) { ; CHECK-LABEL: @mul_of_bool( -; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1 -; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %b = and i32 %x, 1 %z = zext i8 %y to i32 @@ -699,11 +695,7 @@ define i1 @mul_of_bool_commute(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_of_bool_commute( -; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1 -; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X1]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %x1 = and i32 %x, 1 %y8 = and i32 %y, 255 @@ -714,11 +706,7 @@ define i1 @mul_of_bools(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_of_bools( -; CHECK-NEXT: [[X1:%.*]] = and i32 [[X:%.*]], 1 -; CHECK-NEXT: [[Y1:%.*]] = and i32 [[Y:%.*]], 1 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X1]], [[Y1]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[M]], 2 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 true ; %x1 = and i32 %x, 1 %y1 = and i32 %y, 1 @@ -727,6 +715,8 @@ ret i1 %r } +; negative test - not a mask of low bit + define i1 @not_mul_of_bool(i32 %x, i8 %y) { ; CHECK-LABEL: @not_mul_of_bool( ; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 3 @@ -742,6 +732,8 @@ ret i1 %r } +; negative test - not a single low bit + define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) { ; CHECK-LABEL: @not_mul_of_bool_commute( ; CHECK-NEXT: [[X30:%.*]] = lshr i32 [[X:%.*]], 30 @@ -757,6 +749,9 @@ ret i1 %r } +; negative test - no leading zeros for 's' +; TODO: If analysis was generalized for sign bits, we could reduce this to false. + define i1 @mul_of_bool_no_lz_other_op(i32 %x, i8 %y) { ; CHECK-LABEL: @mul_of_bool_no_lz_other_op( ; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 1 @@ -772,13 +767,11 @@ ret i1 %r } +; high and low bits are known 0 + define i1 @mul_of_pow2(i32 %x, i8 %y) { ; CHECK-LABEL: @mul_of_pow2( -; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2 -; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[B]], [[Z]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 510 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %b = and i32 %x, 2 %z = zext i8 %y to i32 @@ -787,13 +780,11 @@ ret i1 %r } +; high and low bits are known 0 + define i1 @mul_of_pow2_commute(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_of_pow2_commute( -; CHECK-NEXT: [[X4:%.*]] = and i32 [[X:%.*]], 4 -; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X4]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 1020 -; CHECK-NEXT: ret i1 [[R]] +; CHECK-NEXT: ret i1 false ; %x4 = and i32 %x, 4 %y8 = and i32 %y, 255 @@ -802,13 +793,11 @@ ret i1 %r } +; only bit 7 can be set by the multiply + define i32 @mul_of_pow2s(i32 %x, i32 %y) { ; CHECK-LABEL: @mul_of_pow2s( -; CHECK-NEXT: [[X8:%.*]] = and i32 [[X:%.*]], 8 -; CHECK-NEXT: [[Y16:%.*]] = and i32 [[Y:%.*]], 16 -; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[X8]], [[Y16]] -; CHECK-NEXT: [[BIT7:%.*]] = or i32 [[M]], 128 -; CHECK-NEXT: ret i32 [[BIT7]] +; CHECK-NEXT: ret i32 128 ; %x8 = and i32 %x, 8 %y16 = and i32 %y, 16 @@ -817,6 +806,8 @@ ret i32 %bit7 } +; negative test - 6 * 255 = 1530 (but constant range analysis can get this) + define i1 @not_mul_of_pow2(i32 %x, i8 %y) { ; CHECK-LABEL: @not_mul_of_pow2( ; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 6 @@ -832,6 +823,8 @@ ret i1 %r } +; negative test - 12 * 255 = 3060 (but constant range analysis can get this) + define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) { ; CHECK-LABEL: @not_mul_of_pow2_commute( ; CHECK-NEXT: [[X30:%.*]] = and i32 [[X:%.*]], 12 @@ -847,6 +840,9 @@ ret i1 %r } +; negative test - no leading zeros for 's' +; TODO: If analysis was generalized for sign bits, we could reduce this to false. + define i1 @mul_of_pow2_no_lz_other_op(i32 %x, i8 %y) { ; CHECK-LABEL: @mul_of_pow2_no_lz_other_op( ; CHECK-NEXT: [[B:%.*]] = and i32 [[X:%.*]], 2