diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -477,6 +477,36 @@
   return NewVal;
 }
 
+// Returns 24-bit or 48-bit (as per `NumBits`) mul of `LHS` and `RHS`.
+static Value *getMul24(IRBuilder<> &Builder, Value *LHS, Value *RHS,
+                       unsigned NumBits, bool IsSigned) {
+  Module *Mod = Builder.GetInsertBlock()->getModule();
+
+  if ((!IsSigned && NumBits <= 32) || (IsSigned && NumBits <= 30)) {
+    Intrinsic::ID ID =
+        IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+    return Builder.CreateCall(Intrinsic::getDeclaration(Mod, ID), {LHS, RHS});
+  }
+
+  assert((!IsSigned && NumBits <= 48) || (IsSigned && NumBits <= 46));
+
+  Intrinsic::ID LoID =
+      IsSigned ? Intrinsic::amdgcn_mul_i24 : Intrinsic::amdgcn_mul_u24;
+  Intrinsic::ID HiID =
+      IsSigned ? Intrinsic::amdgcn_mulhi_i24 : Intrinsic::amdgcn_mulhi_u24;
+
+  Value *Lo =
+      Builder.CreateCall(Intrinsic::getDeclaration(Mod, LoID), {LHS, RHS});
+  Value *Hi =
+      Builder.CreateCall(Intrinsic::getDeclaration(Mod, HiID), {LHS, RHS});
+
+  IntegerType *I64Ty = Builder.getInt64Ty();
+  Lo = Builder.CreateZExtOrTrunc(Lo, I64Ty);
+  Hi = Builder.CreateZExtOrTrunc(Hi, I64Ty);
+
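+  // mul24 yields the low 32 bits of the full product and mulhi24 the
+  // remaining high bits, so the 48-bit result is Lo | (Hi << 32).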
+  return Builder.CreateOr(Lo, Builder.CreateShl(Hi, 32));
+}
+
 bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
   if (I.getOpcode() != Instruction::Mul)
     return false;
@@ -495,29 +525,28 @@
   IRBuilder<> Builder(&I);
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
 
-  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;
-
   unsigned LHSBits = 0, RHSBits = 0;
+  bool IsSigned = false;
 
   if (ST->hasMulU24() && (LHSBits = numBitsUnsigned(LHS, Size)) <= 24 &&
       (RHSBits = numBitsUnsigned(RHS, Size)) <= 24) {
-    // The mul24 instruction yields the low-order 32 bits. If the original
-    // result and the destination is wider than 32 bits, the mul24 would
-    // truncate the result.
-    if (Size > 32 && LHSBits + RHSBits > 32)
+    // If the destination is wider than 48 bits and the product may need more
+    // than 48 bits, the mul48 (i.e. the mul24, mulhi24 pair) would truncate it.
+    if (Size > 48 && LHSBits + RHSBits > 48)
       return false;
 
-    IntrID = Intrinsic::amdgcn_mul_u24;
+    IsSigned = false;
+
   } else if (ST->hasMulI24() && (LHSBits = numBitsSigned(LHS, Size)) < 24 &&
              (RHSBits = numBitsSigned(RHS, Size)) < 24) {
-    // The original result is positive if its destination is wider than 32 bits
-    // and its highest set bit is at bit 31. Generating mul24 and sign-extending
+    // The original result is positive if its destination is wider than 48 bits
+    // and its highest set bit is at bit 47. Generating mul48 and sign-extending
     // it would yield a negative value.
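+    // (numBitsSigned counts the bits below the sign bit, so LHSBits + RHSBits
+    // <= 46 bounds the product's magnitude by 2^46, keeping bit 47 clear.)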
-    if (Size > 32 && LHSBits + RHSBits > 30)
+    if (Size > 48 && LHSBits + RHSBits > 46)
      return false;
 
-    IntrID = Intrinsic::amdgcn_mul_i24;
+    IsSigned = true;
   } else
     return false;
 
@@ -527,12 +556,10 @@
   extractValues(Builder, LHSVals, LHS);
   extractValues(Builder, RHSVals, RHS);
-
   IntegerType *I32Ty = Builder.getInt32Ty();
-  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
   for (int I = 0, E = LHSVals.size(); I != E; ++I) {
     Value *LHS, *RHS;
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+    if (!IsSigned) {
       LHS = Builder.CreateZExtOrTrunc(LHSVals[I], I32Ty);
       RHS = Builder.CreateZExtOrTrunc(RHSVals[I], I32Ty);
     } else {
@@ -540,9 +567,9 @@
       RHS = Builder.CreateSExtOrTrunc(RHSVals[I], I32Ty);
     }
 
-    Value *Result = Builder.CreateCall(Intrin, {LHS, RHS});
+    Value *Result = getMul24(Builder, LHS, RHS, LHSBits + RHSBits, IsSigned);
 
-    if (IntrID == Intrinsic::amdgcn_mul_u24) {
+    if (!IsSigned) {
       ResultVals.push_back(Builder.CreateZExtOrTrunc(Result,
                                                      LHSVals[I]->getType()));
     } else {
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll
@@ -29,7 +29,13 @@
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 8
-; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP1]] to i64
+; SI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP2]] to i64
+; SI-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+; SI-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP6]] to i32
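+; Note: even this 32-bit destination takes the 48-bit path (LHSBits + RHSBits
+; is 46 here); the unused high half should be foldable by later passes.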
 ; SI-NEXT:    ret i32 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i32(
@@ -37,7 +43,13 @@
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i32 [[LHS]], 8
-; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP1]] to i64
+; VI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP2]] to i64
+; VI-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+; VI-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP6]] to i32
 ; VI-NEXT:    ret i32 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i32(
@@ -67,9 +79,21 @@
 ; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
 ; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
-; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
-; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
-; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; SI-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; SI-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; SI-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; SI-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
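+; The second vector element is expanded the same way, independently.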
+; SI-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; SI-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
+; SI-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 32
+; SI-NEXT:    [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP16]]
+; SI-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; SI-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i64 1
 ; SI-NEXT:    ret <2 x i32> [[MUL]]
 ;
 ; VI-LABEL: @smul24_v2i32(
@@ -82,9 +106,21 @@
 ; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
 ; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
-; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
-; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
-; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; VI-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; VI-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; VI-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; VI-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+; VI-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; VI-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
+; VI-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 32
+; VI-NEXT:    [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP16]]
+; VI-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; VI-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i64 1
 ; VI-NEXT:    ret <2 x i32> [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_v2i32(
@@ -107,13 +143,25 @@
 ; SI-LABEL: @umul24_i32(
 ; SI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[LHS24]], i32 [[RHS24]])
+; SI-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP1]] to i64
+; SI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP2]] to i64
+; SI-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+; SI-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP6]] to i32
 ; SI-NEXT:    ret i32 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i32(
 ; VI-NEXT:    [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP2:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[LHS24]], i32 [[RHS24]])
+; VI-NEXT:    [[TMP3:%.*]] = zext i32 [[TMP1]] to i64
+; VI-NEXT:    [[TMP4:%.*]] = zext i32 [[TMP2]] to i64
+; VI-NEXT:    [[TMP5:%.*]] = shl i64 [[TMP4]], 32
+; VI-NEXT:    [[TMP6:%.*]] = or i64 [[TMP3]], [[TMP5]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP6]] to i32
 ; VI-NEXT:    ret i32 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i32(
@@ -137,9 +185,21 @@
 ; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
 ; SI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
-; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
-; SI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
-; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; SI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP3]])
+; SI-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; SI-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; SI-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; SI-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; SI-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+; SI-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP2]], i32 [[TMP4]])
+; SI-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; SI-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
+; SI-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 32
+; SI-NEXT:    [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP16]]
+; SI-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; SI-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i64 1
 ; SI-NEXT:    ret <2 x i32> [[MUL]]
 ;
 ; VI-LABEL: @umul24_v2i32(
@@ -150,9 +210,21 @@
 ; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
 ; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
 ; VI-NEXT:    [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
-; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
-; VI-NEXT:    [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
-; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
+; VI-NEXT:    [[TMP6:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP3]])
+; VI-NEXT:    [[TMP7:%.*]] = zext i32 [[TMP5]] to i64
+; VI-NEXT:    [[TMP8:%.*]] = zext i32 [[TMP6]] to i64
+; VI-NEXT:    [[TMP9:%.*]] = shl i64 [[TMP8]], 32
+; VI-NEXT:    [[TMP10:%.*]] = or i64 [[TMP7]], [[TMP9]]
+; VI-NEXT:    [[TMP11:%.*]] = trunc i64 [[TMP10]] to i32
+; VI-NEXT:    [[TMP12:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP13:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP2]], i32 [[TMP4]])
+; VI-NEXT:    [[TMP14:%.*]] = zext i32 [[TMP12]] to i64
+; VI-NEXT:    [[TMP15:%.*]] = zext i32 [[TMP13]] to i64
+; VI-NEXT:    [[TMP16:%.*]] = shl i64 [[TMP15]], 32
+; VI-NEXT:    [[TMP17:%.*]] = or i64 [[TMP14]], [[TMP16]]
+; VI-NEXT:    [[TMP18:%.*]] = trunc i64 [[TMP17]] to i32
+; VI-NEXT:    [[TMP19:%.*]] = insertelement <2 x i32> undef, i32 [[TMP11]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i32> [[TMP19]], i32 [[TMP18]], i64 1
 ; VI-NEXT:    ret <2 x i32> [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_v2i32(
@@ -173,7 +245,14 @@
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
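+; The low and high halves recombine below into the full product; with an i64
+; destination the or-result is used directly, with no trunc.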
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64(
@@ -181,7 +260,14 @@
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i64 [[LHS]], 40
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64(
@@ -245,7 +331,14 @@
 ; SI-NEXT:    [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64
 ; SI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17
 ; SI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i64_3(
@@ -253,7 +346,14 @@
 ; VI-NEXT:    [[LHS24:%.*]] = sext i16 [[LHS_TRUNC]] to i64
 ; VI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i17
 ; VI-NEXT:    [[RHS24:%.*]] = sext i17 [[RHS_TRUNC]] to i64
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i64_3(
@@ -272,17 +372,64 @@
   ret i64 %mul
 }
 
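+; An i25 sign-extended value can need 25 significant bits, one more than
+; mul24's 24-bit operands, so this multiply is expected to stay a plain mul.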
+define i64 @smul24_i64_4(i64 %lhs, i64 %rhs) {
+; SI-LABEL: @smul24_i64_4(
+; SI-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; SI-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; SI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; SI-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    ret i64 [[MUL]]
+;
+; VI-LABEL: @smul24_i64_4(
+; VI-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; VI-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; VI-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; VI-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    ret i64 [[MUL]]
+;
+; DISABLED-LABEL: @smul24_i64_4(
+; DISABLED-NEXT:    [[LHS_TRUNC:%.*]] = trunc i64 [[LHS:%.*]] to i24
+; DISABLED-NEXT:    [[LHS24:%.*]] = sext i24 [[LHS_TRUNC]] to i64
+; DISABLED-NEXT:    [[RHS_TRUNC:%.*]] = trunc i64 [[RHS:%.*]] to i25
+; DISABLED-NEXT:    [[RHS24:%.*]] = sext i25 [[RHS_TRUNC]] to i64
+; DISABLED-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; DISABLED-NEXT:    ret i64 [[MUL]]
+;
+  %lhs.trunc = trunc i64 %lhs to i24
+  %lhs24 = sext i24 %lhs.trunc to i64
+  %rhs.trunc = trunc i64 %rhs to i25
+  %rhs24 = sext i25 %rhs.trunc to i64
+  %mul = mul i64 %lhs24, %rhs24
+  ret i64 %mul
+}
+
 define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
 ; SI-LABEL: @umul24_i64(
 ; SI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
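+; Unsigned 24x24 products fit in 48 bits, so i64 destinations are now covered.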
 ; SI-NEXT:    ret i64 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i64(
 ; VI-NEXT:    [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = mul i64 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[MUL:%.*]] = or i64 [[TMP5]], [[TMP7]]
 ; VI-NEXT:    ret i64 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i64(
@@ -337,7 +484,12 @@
 ; SI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
 ; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i31
 ; SI-NEXT:    ret i31 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i31(
@@ -348,7 +500,12 @@
 ; VI-NEXT:    [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i31
 ; VI-NEXT:    ret i31 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i31(
@@ -374,7 +531,12 @@
 ; SI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
 ; SI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
 ; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; SI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i31
 ; SI-NEXT:    ret i31 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i31(
@@ -383,7 +545,12 @@
 ; VI-NEXT:    [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
 ; VI-NEXT:    [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
 ; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
-; VI-NEXT:    [[MUL:%.*]] = trunc i32 [[TMP3]] to i31
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i31
 ; VI-NEXT:    ret i31 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i31(
@@ -409,13 +576,23 @@
 ; SI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
 ; SI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
 ; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
-; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
-; SI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
-; SI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
-; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
-; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
-; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
-; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; SI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; SI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; SI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; SI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; SI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i31
+; SI-NEXT:    [[TMP14:%.*]] = zext i31 [[TMP2]] to i32
+; SI-NEXT:    [[TMP15:%.*]] = zext i31 [[TMP4]] to i32
+; SI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; SI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; SI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; SI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; SI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i31
+; SI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i31> undef, i31 [[TMP13]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP23]], i31 [[TMP22]], i64 1
 ; SI-NEXT:    ret <2 x i31> [[MUL]]
 ;
 ; VI-LABEL: @umul24_v2i31(
@@ -428,13 +605,23 @@
 ; VI-NEXT:    [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
 ; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
-; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
-; VI-NEXT:    [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
-; VI-NEXT:    [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
-; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
-; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
-; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
-; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; VI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; VI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; VI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; VI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; VI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i31
+; VI-NEXT:    [[TMP14:%.*]] = zext i31 [[TMP2]] to i32
+; VI-NEXT:    [[TMP15:%.*]] = zext i31 [[TMP4]] to i32
+; VI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; VI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; VI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; VI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; VI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i31
+; VI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i31> undef, i31 [[TMP13]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP23]], i31 [[TMP22]], i64 1
 ; VI-NEXT:    ret <2 x i31> [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_v2i31(
@@ -462,13 +649,23 @@
 ; SI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
 ; SI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
 ; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
-; SI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
-; SI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
-; SI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
-; SI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
-; SI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
-; SI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
-; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; SI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; SI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; SI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; SI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; SI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i31
+; SI-NEXT:    [[TMP14:%.*]] = sext i31 [[TMP2]] to i32
+; SI-NEXT:    [[TMP15:%.*]] = sext i31 [[TMP4]] to i32
+; SI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; SI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; SI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; SI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; SI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i31
+; SI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i31> undef, i31 [[TMP13]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP23]], i31 [[TMP22]], i64 1
 ; SI-NEXT:    ret <2 x i31> [[MUL]]
 ;
 ; VI-LABEL: @smul24_v2i31(
@@ -483,13 +680,23 @@
 ; VI-NEXT:    [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
 ; VI-NEXT:    [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
 ; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
-; VI-NEXT:    [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
-; VI-NEXT:    [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
-; VI-NEXT:    [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
-; VI-NEXT:    [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
-; VI-NEXT:    [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
-; VI-NEXT:    [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
-; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
+; VI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; VI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; VI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; VI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; VI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i31
+; VI-NEXT:    [[TMP14:%.*]] = sext i31 [[TMP2]] to i32
+; VI-NEXT:    [[TMP15:%.*]] = sext i31 [[TMP4]] to i32
+; VI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; VI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; VI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; VI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; VI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i31
+; VI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i31> undef, i31 [[TMP13]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i31> [[TMP23]], i31 [[TMP22]], i64 1
 ; VI-NEXT:    ret <2 x i31> [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_v2i31(
@@ -514,7 +721,15 @@
 ; SI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
 ; SI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
-; SI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
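+; Destinations wider than 32 but within 48 bits were previously left as plain
+; mul; the 48-bit expansion now covers them, truncating back to i33.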
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @smul24_i33(
@@ -522,7 +737,15 @@
 ; VI-NEXT:    [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
 ; VI-NEXT:    [[RHS24:%.*]] = ashr i33 [[LHS]], 9
-; VI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_i33(
@@ -545,13 +768,29 @@
 ; SI-LABEL: @umul24_i33(
 ; SI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
 ; SI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
-; SI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; SI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; SI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; SI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; SI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; SI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; SI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; SI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; SI-NEXT:    ret i33 [[MUL]]
 ;
 ; VI-LABEL: @umul24_i33(
 ; VI-NEXT:    [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
 ; VI-NEXT:    [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
-; VI-NEXT:    [[MUL:%.*]] = mul i33 [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
+; VI-NEXT:    [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
+; VI-NEXT:    [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP4:%.*]] = call i32 @llvm.amdgcn.mulhi.u24(i32 [[TMP1]], i32 [[TMP2]])
+; VI-NEXT:    [[TMP5:%.*]] = zext i32 [[TMP3]] to i64
+; VI-NEXT:    [[TMP6:%.*]] = zext i32 [[TMP4]] to i64
+; VI-NEXT:    [[TMP7:%.*]] = shl i64 [[TMP6]], 32
+; VI-NEXT:    [[TMP8:%.*]] = or i64 [[TMP5]], [[TMP7]]
+; VI-NEXT:    [[MUL:%.*]] = trunc i64 [[TMP8]] to i33
 ; VI-NEXT:    ret i33 [[MUL]]
 ;
 ; DISABLED-LABEL: @umul24_i33(
@@ -630,7 +869,30 @@
 ; SI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
 ; SI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
 ; SI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
-; SI-NEXT:    [[MUL:%.*]] = mul <2 x i33> [[LHS24]], [[RHS24]]
+; SI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; SI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; SI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; SI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; SI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; SI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; SI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; SI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; SI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; SI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; SI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; SI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
+; SI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
+; SI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
+; SI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; SI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; SI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; SI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; SI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; SI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
+; SI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> undef, i33 [[TMP13]], i64 0
+; SI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
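+; Wide vector multiplies are scalarized: each i33 element gets its own
+; mul24/mulhi24 expansion and the results are rebuilt with insertelement.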
 ; SI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; VI-LABEL: @smul24_v2i33(
@@ -638,7 +900,30 @@
 ; VI-NEXT:    [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
 ; VI-NEXT:    [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
 ; VI-NEXT:    [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
-; VI-NEXT:    [[MUL:%.*]] = mul <2 x i33> [[LHS24]], [[RHS24]]
+; VI-NEXT:    [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
+; VI-NEXT:    [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
+; VI-NEXT:    [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
+; VI-NEXT:    [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
+; VI-NEXT:    [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
+; VI-NEXT:    [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
+; VI-NEXT:    [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP8:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP5]], i32 [[TMP6]])
+; VI-NEXT:    [[TMP9:%.*]] = zext i32 [[TMP7]] to i64
+; VI-NEXT:    [[TMP10:%.*]] = zext i32 [[TMP8]] to i64
+; VI-NEXT:    [[TMP11:%.*]] = shl i64 [[TMP10]], 32
+; VI-NEXT:    [[TMP12:%.*]] = or i64 [[TMP9]], [[TMP11]]
+; VI-NEXT:    [[TMP13:%.*]] = trunc i64 [[TMP12]] to i33
+; VI-NEXT:    [[TMP14:%.*]] = trunc i33 [[TMP2]] to i32
+; VI-NEXT:    [[TMP15:%.*]] = trunc i33 [[TMP4]] to i32
+; VI-NEXT:    [[TMP16:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP17:%.*]] = call i32 @llvm.amdgcn.mulhi.i24(i32 [[TMP14]], i32 [[TMP15]])
+; VI-NEXT:    [[TMP18:%.*]] = zext i32 [[TMP16]] to i64
+; VI-NEXT:    [[TMP19:%.*]] = zext i32 [[TMP17]] to i64
+; VI-NEXT:    [[TMP20:%.*]] = shl i64 [[TMP19]], 32
+; VI-NEXT:    [[TMP21:%.*]] = or i64 [[TMP18]], [[TMP20]]
+; VI-NEXT:    [[TMP22:%.*]] = trunc i64 [[TMP21]] to i33
+; VI-NEXT:    [[TMP23:%.*]] = insertelement <2 x i33> undef, i33 [[TMP13]], i64 0
+; VI-NEXT:    [[MUL:%.*]] = insertelement <2 x i33> [[TMP23]], i33 [[TMP22]], i64 1
 ; VI-NEXT:    ret <2 x i33> [[MUL]]
 ;
 ; DISABLED-LABEL: @smul24_v2i33(