diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp @@ -509,10 +509,10 @@ Intrinsic::ID IntrID = Intrinsic::not_intrinsic; - // TODO: Should this try to match mulhi24? if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) { - // The 24-bit mul intrinsics yields the low-order 32 bits. The result's bit - // width should not exceed 32 if `Size` > 32. + // The mul24 instruction yields the low-order 32 bits. If the original + // result and the destination are wider than 32 bits, the mul24 would + // truncate the result. if (Size > 32 && numBitsUnsigned(LHS, Size) + numBitsUnsigned(RHS, Size) > 32) { return false; @@ -520,7 +520,10 @@ IntrID = Intrinsic::amdgcn_mul_u24; } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) { - if (Size > 32 && numBitsSigned(LHS, Size) + numBitsSigned(RHS, Size) > 31) { + // The original result is positive if the width is wider than 32 and the + // highest set bit of the original result is at bit 31. Generating mul24 and + // sign-extending it would yield a negative value. 
+ if (Size > 32 && numBitsSigned(LHS, Size) + numBitsSigned(RHS, Size) > 30) { return false; } diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll --- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll @@ -239,6 +239,39 @@ ret i64 %mul } +define i64 @smul24_i64_3(i64 %lhs, i64 %rhs) { +; SI-LABEL: @smul24_i64_3( +; SI-NEXT: [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i16 +; SI-NEXT: [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64 +; SI-NEXT: [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17 +; SI-NEXT: [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64 +; SI-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]] +; SI-NEXT: ret i64 [[MUL]] +; +; VI-LABEL: @smul24_i64_3( +; VI-NEXT: [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i16 +; VI-NEXT: [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64 +; VI-NEXT: [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17 +; VI-NEXT: [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64 +; VI-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]] +; VI-NEXT: ret i64 [[MUL]] +; +; DISABLED-LABEL: @smul24_i64_3( +; DISABLED-NEXT: [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i16 +; DISABLED-NEXT: [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64 +; DISABLED-NEXT: [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17 +; DISABLED-NEXT: [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64 +; DISABLED-NEXT: [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]] +; DISABLED-NEXT: ret i64 [[MUL]] +; + %lhs.trunc = trunc i64 %lhs to i16 + %lhs24 = sext i16 %lhs.trunc to i64 + %rhs.trunc = trunc i64 %rhs to i17 + %rhs24 = sext i17 %rhs.trunc to i64 + %mul = mul i64 %lhs24, %rhs24 + ret i64 %mul +} + define i64 @umul24_i64(i64 %lhs, i64 %rhs) { ; SI-LABEL: @umul24_i64( ; SI-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215