This is an archive of the discontinued LLVM Phabricator instance.

Differential D64460

AMDGPU: Add 24-bit mul intrinsics
ClosedPublic

Authored by arsenm on Jul 9 2019, 5:32 PM.

Download Raw Diff

Details

Reviewers

rampitec

Summary

Insert these during codegenprepare.

This works around a DAG issue where generic combines eliminate the 'and'
asserting the high bits are zero, which then exposes an unknown read
source to the mul combine. It isn't worth the hassle of trying to
insert an AssertZext or something similar to deal with it.

Diff Detail

Event Timeline

arsenm created this revision.Jul 9 2019, 5:32 PM

Herald added subscribers: t-tye, tpr, dstuttard and 5 others. · View Herald TranscriptJul 9 2019, 5:32 PM

Replacing it so early means we could miss other optimizations, especially on the DAG. Was any performance evaluation performed?

In D64460#1579467, @rampitec wrote:

Replacing it so early means we could miss other optimizations, especially on the DAG. Was any performance evaluation performed?

It's about as late as possible for the IR. This isn't really moving it that far from where this happens already, which is typically combine 1. The library function call is still using the standard IR operations, and I would expect the useful optimizations to have happened by this point. We already understand the known bits for these in the DAG, so I don't think they should hurt too much (although they still may need ComputeNumSignBitsForTargetNode). It's possible combine opportunities will appear after lowering, and they will need to be implemented for the mul24 nodes.

I don't know what I would try this on, besides Jeff's benchmark that this is intended to solve. The only lit tests that broke were improvements.

LGTM

This revision is now accepted and ready to land.Jul 10 2019, 4:44 PM

r366094

Revision Contents

Path

Size

include/

llvm/

IR/

IntrinsicsAMDGPU.td

10 lines

lib/

Target/

AMDGPU/

AMDGPUCodeGenPrepare.cpp

127 lines

SIISelLowering.cpp

5 lines

test/

CodeGen/

AMDGPU/

amdgpu-codegenprepare-mul24.ll

494 lines

llvm.amdgcn.mul.i24.ll

14 lines

llvm.amdgcn.mul.u24.ll

14 lines

mad_uint24.ll

76 lines

mul.i16.ll

18 lines

mul_uint24-amdgcn.ll

4 lines

Diff 208853

include/llvm/IR/IntrinsicsAMDGPU.td

Show First 20 Lines • Show All 1,344 Lines • ▼ Show 20 Lines	def int_amdgcn_alignbit : Intrinsic<[llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]		[IntrNoMem, IntrSpeculatable]
>;		>;

def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],		def int_amdgcn_alignbyte : Intrinsic<[llvm_i32_ty],
[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],		[llvm_i32_ty, llvm_i32_ty, llvm_i32_ty],
[IntrNoMem, IntrSpeculatable]		[IntrNoMem, IntrSpeculatable]
>;		>;

		// llvm.amdgcn.mul.i24(i32 lhs, i32 rhs)
		//
		// Takes two i32 operands and returns an i32; declared as not accessing
		// memory and as speculatable.
		// NOTE(review): presumably only the low 24 bits of each operand are
		// multiplied, as signed values -- confirm against the ISA documentation.
		def int_amdgcn_mul_i24 : Intrinsic<[llvm_i32_ty],
		[llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrSpeculatable]
		>;

		// llvm.amdgcn.mul.u24(i32 lhs, i32 rhs)
		//
		// Takes two i32 operands and returns an i32; declared as not accessing
		// memory and as speculatable.
		// NOTE(review): presumably only the low 24 bits of each operand are
		// multiplied, as unsigned values -- confirm against the ISA documentation.
		def int_amdgcn_mul_u24 : Intrinsic<[llvm_i32_ty],
		[llvm_i32_ty, llvm_i32_ty],
		[IntrNoMem, IntrSpeculatable]
		>;

// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)		// llvm.amdgcn.ds.gws.init(i32 bar_val, i32 resource_id)
//		//
// bar_val is the total number of waves that will wait on this		// bar_val is the total number of waves that will wait on this
// barrier, minus 1.		// barrier, minus 1.
def int_amdgcn_ds_gws_init :		def int_amdgcn_ds_gws_init :
GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,		GCCBuiltin<"__builtin_amdgcn_ds_gws_init">,
Intrinsic<[],		Intrinsic<[],
[llvm_i32_ty, llvm_i32_ty],		[llvm_i32_ty, llvm_i32_ty],
▲ Show 20 Lines • Show All 397 Lines • Show Last 20 Lines

lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp

Show First 20 Lines • Show All 55 Lines • ▼ Show 20 Lines	static cl::opt<bool> WidenLoads(
cl::init(true));		cl::init(true));

class AMDGPUCodeGenPrepare : public FunctionPass,		class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {		public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;		const GCNSubtarget *ST = nullptr;
AssumptionCache *AC = nullptr;		AssumptionCache *AC = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;		LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;		Module *Mod = nullptr;
		const DataLayout *DL = nullptr;
bool HasUnsafeFPMath = false;		bool HasUnsafeFPMath = false;

/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to		/// Copies exact/nsw/nuw flags (if any) from binary operation \p I to
/// binary operation \p V.		/// binary operation \p V.
///		///
/// \returns Binary operation \p V.		/// \returns Binary operation \p V.
/// \returns \p T's base element bit width.		/// \returns \p T's base element bit width.
unsigned getBaseElementBitWidth(const Type *T) const;		unsigned getBaseElementBitWidth(const Type *T) const;
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	class AMDGPUCodeGenPrepare : public FunctionPass,
/// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the		/// bits, replacing \p I with 32 bit 'bitreverse' intrinsic, shifting the
/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the		/// result of 32 bit 'bitreverse' intrinsic to the right with zero fill (the
/// shift amount is 32 minus \p I's base element bit width), and truncating		/// shift amount is 32 minus \p I's base element bit width), and truncating
/// the result of the shift operation back to \p I's original type.		/// the result of the shift operation back to \p I's original type.
///		///
/// \returns True.		/// \returns True.
bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;		bool promoteUniformBitreverseToI32(IntrinsicInst &I) const;


		/// \returns \p ScalarSize minus the minimum number of known leading zero
		/// bits of \p Op.
		unsigned numBitsUnsigned(Value *Op, unsigned ScalarSize) const;
		/// \returns \p ScalarSize minus the number of sign bits computed for \p Op.
		unsigned numBitsSigned(Value *Op, unsigned ScalarSize) const;
		/// \returns True if \p ScalarSize is at least 24 and numBitsSigned() of
		/// \p V is below 24.
		bool isI24(Value *V, unsigned ScalarSize) const;
		/// \returns True if numBitsUnsigned() of \p V is at most 24.
		bool isU24(Value *V, unsigned ScalarSize) const;

		/// Replace mul instructions with llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
		/// SelectionDAG has an issue where generic combines eliminate an 'and'
		/// asserting the high bits are known zero, which then exposes an unknown
		/// source to the mul combine, so the replacement is done on the IR instead.
		///
		/// \returns True if \p I was replaced.
		bool replaceMulWithMul24(BinaryOperator &I) const;

/// Expands 24 bit div or rem.		/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,		Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value Num, Value Den,		Value Num, Value Den,
bool IsDiv, bool IsSigned) const;		bool IsDiv, bool IsSigned) const;

/// Expands 32 bit div or rem.		/// Expands 32 bit div or rem.
Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,		Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
Value Num, Value Den) const;		Value Num, Value Den) const;
▲ Show 20 Lines • Show All 243 Lines • ▼ Show 20 Lines	Value *TruncRes =
Builder.CreateTrunc(LShrOp, I.getType());		Builder.CreateTrunc(LShrOp, I.getType());

I.replaceAllUsesWith(TruncRes);		I.replaceAllUsesWith(TruncRes);
I.eraseFromParent();		I.eraseFromParent();

return true;		return true;
}		}

		/// Computes how many low bits of \p Op may be non-zero.
		///
		/// \returns \p ScalarSize minus the minimum number of leading zero bits that
		/// known-bits analysis reports for \p Op.
		unsigned AMDGPUCodeGenPrepare::numBitsUnsigned(Value *Op,
		                                               unsigned ScalarSize) const {
		  const unsigned LeadingZeros =
		      computeKnownBits(Op, *DL, 0, AC).countMinLeadingZeros();
		  return ScalarSize - LeadingZeros;
		}

		/// Computes how many bits of \p Op remain once its redundant sign bits are
		/// discounted.
		///
		/// \returns \p ScalarSize minus the number of sign bits computed for \p Op.
		unsigned AMDGPUCodeGenPrepare::numBitsSigned(Value *Op,
		                                             unsigned ScalarSize) const {
		  // For a value to qualify as signed 24-bit, bit 23 has to be a sign bit.
		  const unsigned SignBits = ComputeNumSignBits(Op, *DL, 0, AC);
		  return ScalarSize - SignBits;
		}

		/// \returns True if \p V, of scalar width \p ScalarSize, can be treated as a
		/// signed 24-bit value.
		bool AMDGPUCodeGenPrepare::isI24(Value *V, unsigned ScalarSize) const {
		  // Types narrower than 24 bits are left to the unsigned 24-bit handling.
		  if (ScalarSize < 24)
		    return false;

		  return numBitsSigned(V, ScalarSize) < 24;
		}

		/// \returns True if \p V, of scalar width \p ScalarSize, needs at most 24
		/// bits when interpreted as an unsigned value.
		bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
		  const unsigned SignificantBits = numBitsUnsigned(V, ScalarSize);
		  return SignificantBits <= 24;
		}

		static void extractValues(IRBuilder<> &Builder,
		SmallVectorImpl<Value > &Values, Value V) {
		VectorType *VT = dyn_cast<VectorType>(V->getType());
		if (!VT) {
		Values.push_back(V);
		return;
		}

		for (int I = 0, E = VT->getNumElements(); I != E; ++I)
		Values.push_back(Builder.CreateExtractElement(V, I));
		}

		/// Reassembles \p Values into a single value of type \p Ty.
		///
		/// When \p Ty is not a vector type the first (and expected only) entry of
		/// \p Values is returned as-is. Otherwise the entries are inserted, in index
		/// order, into an initially undef vector of type \p Ty.
		///
		/// The decision is made on \p Ty rather than on the number of values so that
		/// a single-element vector is rebuilt as a vector instead of being returned
		/// as a bare scalar of the wrong type.
		///
		/// \p Values must not be empty.
		static Value *insertValues(IRBuilder<> &Builder,
		                           Type *Ty,
		                           SmallVectorImpl<Value *> &Values) {
		  if (!Ty->isVectorTy())
		    return Values[0];

		  Value *NewVal = UndefValue::get(Ty);
		  for (unsigned Idx = 0, NumVals = Values.size(); Idx != NumVals; ++Idx)
		    NewVal = Builder.CreateInsertElement(NewVal, Values[Idx], Idx);

		  return NewVal;
		}

		/// Replaces a divergent integer 'mul' whose operands are known to fit in 24
		/// bits with calls to llvm.amdgcn.mul.u24 or llvm.amdgcn.mul.i24.
		///
		/// Vector multiplies are split into one intrinsic call per element and the
		/// results are reassembled into a vector. Each element is extended or
		/// truncated to i32 for the call, and the i32 result is converted back to the
		/// element type.
		///
		/// The multiply is left alone when it is not a 'mul', when its scalar width
		/// is at most 16 bits on a subtarget with 16-bit instructions, when its scalar
		/// width exceeds 32 bits, when it is uniform, or when neither the unsigned
		/// nor the signed 24-bit form applies.
		///
		/// \returns True if \p I was replaced and erased, false otherwise.
		bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
		  if (I.getOpcode() != Instruction::Mul)
		    return false;

		  Type *Ty = I.getType();
		  unsigned Size = Ty->getScalarSizeInBits();
		  if (Size <= 16 && ST->has16BitInsts())
		    return false;

		  // The intrinsics return an i32, but the product of two 24-bit operands can
		  // need up to 48 bits. Extending the i32 result back to a wider type would
		  // silently drop bits 32 and above, so wider multiplies are not replaced.
		  if (Size > 32)
		    return false;

		  // Prefer scalar if this could be s_mul_i32
		  if (DA->isUniform(&I))
		    return false;

		  Value *LHS = I.getOperand(0);
		  Value *RHS = I.getOperand(1);
		  IRBuilder<> Builder(&I);
		  Builder.SetCurrentDebugLocation(I.getDebugLoc());

		  Intrinsic::ID IntrID = Intrinsic::not_intrinsic;

		  // TODO: Should this try to match mulhi24?
		  if (ST->hasMulU24() && isU24(LHS, Size) && isU24(RHS, Size)) {
		    IntrID = Intrinsic::amdgcn_mul_u24;
		  } else if (ST->hasMulI24() && isI24(LHS, Size) && isI24(RHS, Size)) {
		    IntrID = Intrinsic::amdgcn_mul_i24;
		  } else
		    return false;

		  const bool IsUnsigned = IntrID == Intrinsic::amdgcn_mul_u24;

		  SmallVector<Value *, 4> LHSVals;
		  SmallVector<Value *, 4> RHSVals;
		  SmallVector<Value *, 4> ResultVals;
		  extractValues(Builder, LHSVals, LHS);
		  extractValues(Builder, RHSVals, RHS);

		  IntegerType *I32Ty = Builder.getInt32Ty();
		  FunctionCallee Intrin = Intrinsic::getDeclaration(Mod, IntrID);
		  for (unsigned Idx = 0, NumElts = LHSVals.size(); Idx != NumElts; ++Idx) {
		    Type *EltTy = LHSVals[Idx]->getType();

		    // Bring both operands to i32 using the extension that matches the
		    // signedness of the chosen intrinsic.
		    Value *LHS32;
		    Value *RHS32;
		    if (IsUnsigned) {
		      LHS32 = Builder.CreateZExtOrTrunc(LHSVals[Idx], I32Ty);
		      RHS32 = Builder.CreateZExtOrTrunc(RHSVals[Idx], I32Ty);
		    } else {
		      LHS32 = Builder.CreateSExtOrTrunc(LHSVals[Idx], I32Ty);
		      RHS32 = Builder.CreateSExtOrTrunc(RHSVals[Idx], I32Ty);
		    }

		    Value *Result = Builder.CreateCall(Intrin, {LHS32, RHS32});

		    // Convert the i32 result back to the original element type.
		    if (IsUnsigned)
		      ResultVals.push_back(Builder.CreateZExtOrTrunc(Result, EltTy));
		    else
		      ResultVals.push_back(Builder.CreateSExtOrTrunc(Result, EltTy));
		  }

		  I.replaceAllUsesWith(insertValues(Builder, Ty, ResultVals));
		  I.eraseFromParent();

		  return true;
		}

static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {		static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);		const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
if (!CNum)		if (!CNum)
return HasDenormals;		return HasDenormals;

if (UnsafeDiv)		if (UnsafeDiv)
return true;		return true;

▲ Show 20 Lines • Show All 348 Lines • ▼ Show 20 Lines	Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
return Res;		return Res;
}		}

bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {		bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&		if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I) && promoteUniformOpToI32(I))		DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;		return true;

		if (replaceMulWithMul24(I))
		return true;

bool Changed = false;		bool Changed = false;
Instruction::BinaryOps Opc = I.getOpcode();		Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();		Type *Ty = I.getType();
Value *NewDiv = nullptr;		Value *NewDiv = nullptr;
if ((Opc == Instruction::URem \|\| Opc == Instruction::UDiv \|\|		if ((Opc == Instruction::URem \|\| Opc == Instruction::UDiv \|\|
Opc == Instruction::SRem \|\| Opc == Instruction::SDiv) &&		Opc == Instruction::SRem \|\| Opc == Instruction::SDiv) &&
Ty->getScalarSizeInBits() <= 32) {		Ty->getScalarSizeInBits() <= 32) {
Value *Num = I.getOperand(0);		Value *Num = I.getOperand(0);
▲ Show 20 Lines • Show All 110 Lines • ▼ Show 20 Lines	if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I))		DA->isUniform(&I))
Changed \|= promoteUniformBitreverseToI32(I);		Changed \|= promoteUniformBitreverseToI32(I);

return Changed;		return Changed;
}		}

bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {		bool AMDGPUCodeGenPrepare::doInitialization(Module &M) {
Mod = &M;		Mod = &M;
		DL = &Mod->getDataLayout();
return false;		return false;
}		}

bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {		bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
if (skipFunction(F))		if (skipFunction(F))
return false;		return false;

auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();		auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
Show All 34 Lines

lib/Target/AMDGPU/SIISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 5,798 Lines • ▼ Show 20 Lines	case Intrinsic::amdgcn_interp_p2_f16: {
return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);		return DAG.getNode(AMDGPUISD::INTERP_P2_F16, DL, MVT::f16, Ops);
}		}
case Intrinsic::amdgcn_sin:		case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));		return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));

case Intrinsic::amdgcn_cos:		case Intrinsic::amdgcn_cos:
return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));		return DAG.getNode(AMDGPUISD::COS_HW, DL, VT, Op.getOperand(1));

		case Intrinsic::amdgcn_mul_u24:
		return DAG.getNode(AMDGPUISD::MUL_U24, DL, VT, Op.getOperand(1), Op.getOperand(2));
		case Intrinsic::amdgcn_mul_i24:
		return DAG.getNode(AMDGPUISD::MUL_I24, DL, VT, Op.getOperand(1), Op.getOperand(2));

case Intrinsic::amdgcn_log_clamp: {		case Intrinsic::amdgcn_log_clamp: {
if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)		if (Subtarget->getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS)
return SDValue();		return SDValue();

DiagnosticInfoUnsupported BadIntrin(		DiagnosticInfoUnsupported BadIntrin(
MF.getFunction(), "intrinsic not supported on subtarget",		MF.getFunction(), "intrinsic not supported on subtarget",
DL.getDebugLoc());		DL.getDebugLoc());
DAG.getContext()->diagnose(BadIntrin);		DAG.getContext()->diagnose(BadIntrin);
▲ Show 20 Lines • Show All 4,826 Lines • Show Last 20 Lines

test/CodeGen/AMDGPU/amdgpu-codegenprepare-mul24.ll

This file was added.

				; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
				; RUN: opt -S -mtriple=amdgcn-- -mcpu=tahiti -amdgpu-codegenprepare %s | FileCheck -check-prefix=SI %s
				; RUN: opt -S -mtriple=amdgcn-- -mcpu=fiji -amdgpu-codegenprepare %s | FileCheck -check-prefix=VI %s

				; Plain i16 multiply. The tahiti run expects a zext / mul.u24 / trunc
				; sequence; the fiji run expects the mul to be left untouched.
				define i16 @mul_i16(i16 %lhs, i16 %rhs) {
				; SI-LABEL: @mul_i16(
				; SI-NEXT: [[TMP1:%.*]] = zext i16 [[LHS:%.*]] to i32
				; SI-NEXT: [[TMP2:%.*]] = zext i16 [[RHS:%.*]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i16
				; SI-NEXT: ret i16 [[TMP4]]
				;
				; VI-LABEL: @mul_i16(
				; VI-NEXT: [[MUL:%.*]] = mul i16 [[LHS:%.*]], [[RHS:%.*]]
				; VI-NEXT: ret i16 [[MUL]]
				;
				%mul = mul i16 %lhs, %rhs
				ret i16 %mul
				}

				; i32 multiply of operands produced by ashr-by-8; both runs expect
				; llvm.amdgcn.mul.i24.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				define i32 @smul24_i32(i32 %lhs, i32 %rhs) {
				; SI-LABEL: @smul24_i32(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
				; SI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
				; SI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 8
				; SI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
				; SI-NEXT: ret i32 [[TMP1]]
				;
				; VI-LABEL: @smul24_i32(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 8
				; VI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 8
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 8
				; VI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 8
				; VI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[LHS24]], i32 [[RHS24]])
				; VI-NEXT: ret i32 [[TMP1]]
				;
				%shl.lhs = shl i32 %lhs, 8
				%lhs24 = ashr i32 %shl.lhs, 8
				%lshr.rhs = shl i32 %rhs, 8
				%rhs24 = ashr i32 %lhs, 8
				%mul = mul i32 %lhs24, %rhs24
				ret i32 %mul
				}

				; <2 x i32> multiply of operands produced by ashr-by-8; both runs expect
				; one llvm.amdgcn.mul.i24 call per element.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				define <2 x i32> @smul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
				; SI-LABEL: @smul24_v2i32(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
				; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
				; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i32> [[LHS]], <i32 8, i32 8>
				; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
				; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
				; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
				; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
				; SI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
				; SI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
				; SI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
				; SI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
				; SI-NEXT: ret <2 x i32> [[TMP8]]
				;
				; VI-LABEL: @smul24_v2i32(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i32> [[LHS:%.*]], <i32 8, i32 8>
				; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i32> [[SHL_LHS]], <i32 8, i32 8>
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i32> [[RHS:%.*]], <i32 8, i32 8>
				; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i32> [[LHS]], <i32 8, i32 8>
				; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
				; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
				; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
				; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
				; VI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP3]])
				; VI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP2]], i32 [[TMP4]])
				; VI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
				; VI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
				; VI-NEXT: ret <2 x i32> [[TMP8]]
				;
				%shl.lhs = shl <2 x i32> %lhs, <i32 8, i32 8>
				%lhs24 = ashr <2 x i32> %shl.lhs, <i32 8, i32 8>
				%lshr.rhs = shl <2 x i32> %rhs, <i32 8, i32 8>
				%rhs24 = ashr <2 x i32> %lhs, <i32 8, i32 8>
				%mul = mul <2 x i32> %lhs24, %rhs24
				ret <2 x i32> %mul
				}

				; i32 multiply of operands masked with 16777215 (0xffffff); both runs
				; expect llvm.amdgcn.mul.u24.
				define i32 @umul24_i32(i32 %lhs, i32 %rhs) {
				; SI-LABEL: @umul24_i32(
				; SI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
				; SI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
				; SI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
				; SI-NEXT: ret i32 [[TMP1]]
				;
				; VI-LABEL: @umul24_i32(
				; VI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 16777215
				; VI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 16777215
				; VI-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[LHS24]], i32 [[RHS24]])
				; VI-NEXT: ret i32 [[TMP1]]
				;
				%lhs24 = and i32 %lhs, 16777215
				%rhs24 = and i32 %rhs, 16777215
				%mul = mul i32 %lhs24, %rhs24
				ret i32 %mul
				}

				; <2 x i32> multiply of operands masked with 16777215 (0xffffff); both
				; runs expect one llvm.amdgcn.mul.u24 call per element.
				define <2 x i32> @umul24_v2i32(<2 x i32> %lhs, <2 x i32> %rhs) {
				; SI-LABEL: @umul24_v2i32(
				; SI-NEXT: [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
				; SI-NEXT: [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
				; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
				; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
				; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
				; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
				; SI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
				; SI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
				; SI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
				; SI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
				; SI-NEXT: ret <2 x i32> [[TMP8]]
				;
				; VI-LABEL: @umul24_v2i32(
				; VI-NEXT: [[LHS24:%.*]] = and <2 x i32> [[LHS:%.*]], <i32 16777215, i32 16777215>
				; VI-NEXT: [[RHS24:%.*]] = and <2 x i32> [[RHS:%.*]], <i32 16777215, i32 16777215>
				; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i32> [[LHS24]], i64 0
				; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i32> [[LHS24]], i64 1
				; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i32> [[RHS24]], i64 0
				; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i32> [[RHS24]], i64 1
				; VI-NEXT: [[TMP5:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP3]])
				; VI-NEXT: [[TMP6:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP2]], i32 [[TMP4]])
				; VI-NEXT: [[TMP7:%.*]] = insertelement <2 x i32> undef, i32 [[TMP5]], i64 0
				; VI-NEXT: [[TMP8:%.*]] = insertelement <2 x i32> [[TMP7]], i32 [[TMP6]], i64 1
				; VI-NEXT: ret <2 x i32> [[TMP8]]
				;
				%lhs24 = and <2 x i32> %lhs, <i32 16777215, i32 16777215>
				%rhs24 = and <2 x i32> %rhs, <i32 16777215, i32 16777215>
				%mul = mul <2 x i32> %lhs24, %rhs24
				ret <2 x i32> %mul
				}

				; i64 multiply of operands produced by ashr-by-40; both runs expect a
				; trunc / mul.i24 / sext sequence.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				; NOTE(review): the i32 intrinsic result is sign-extended back to i64,
				; but the product of two 24-bit operands can need more than 32 bits --
				; confirm this expectation does not encode a miscompile.
				define i64 @smul24_i64(i64 %lhs, i64 %rhs) {
				; SI-LABEL: @smul24_i64(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
				; SI-NEXT: [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
				; SI-NEXT: [[RHS24:%.*]] = ashr i64 [[LHS]], 40
				; SI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
				; SI-NEXT: ret i64 [[TMP4]]
				;
				; VI-LABEL: @smul24_i64(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl i64 [[LHS:%.*]], 40
				; VI-NEXT: [[LHS24:%.*]] = ashr i64 [[SHL_LHS]], 40
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl i64 [[RHS:%.*]], 40
				; VI-NEXT: [[RHS24:%.*]] = ashr i64 [[LHS]], 40
				; VI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i64
				; VI-NEXT: ret i64 [[TMP4]]
				;
				%shl.lhs = shl i64 %lhs, 40
				%lhs24 = ashr i64 %shl.lhs, 40
				%lshr.rhs = shl i64 %rhs, 40
				%rhs24 = ashr i64 %lhs, 40
				%mul = mul i64 %lhs24, %rhs24
				ret i64 %mul
				}

				; i64 multiply of operands masked with 16777215 (0xffffff); both runs
				; expect a trunc / mul.u24 / zext sequence.
				; NOTE(review): the i32 intrinsic result is zero-extended back to i64,
				; but the product of two 24-bit operands can need up to 48 bits --
				; confirm this expectation does not encode a miscompile.
				define i64 @umul24_i64(i64 %lhs, i64 %rhs) {
				; SI-LABEL: @umul24_i64(
				; SI-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
				; SI-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
				; SI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
				; SI-NEXT: ret i64 [[TMP4]]
				;
				; VI-LABEL: @umul24_i64(
				; VI-NEXT: [[LHS24:%.*]] = and i64 [[LHS:%.*]], 16777215
				; VI-NEXT: [[RHS24:%.*]] = and i64 [[RHS:%.*]], 16777215
				; VI-NEXT: [[TMP1:%.*]] = trunc i64 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = trunc i64 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i64
				; VI-NEXT: ret i64 [[TMP4]]
				;
				%lhs24 = and i64 %lhs, 16777215
				%rhs24 = and i64 %rhs, 16777215
				%mul = mul i64 %lhs24, %rhs24
				ret i64 %mul
				}

				; i31 multiply of operands produced by ashr-by-7; both runs expect a
				; sext / mul.i24 / trunc sequence.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				define i31 @smul24_i31(i31 %lhs, i31 %rhs) {
				; SI-LABEL: @smul24_i31(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
				; SI-NEXT: [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
				; SI-NEXT: [[RHS24:%.*]] = ashr i31 [[LHS]], 7
				; SI-NEXT: [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
				; SI-NEXT: ret i31 [[TMP4]]
				;
				; VI-LABEL: @smul24_i31(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl i31 [[LHS:%.*]], 7
				; VI-NEXT: [[LHS24:%.*]] = ashr i31 [[SHL_LHS]], 7
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl i31 [[RHS:%.*]], 7
				; VI-NEXT: [[RHS24:%.*]] = ashr i31 [[LHS]], 7
				; VI-NEXT: [[TMP1:%.*]] = sext i31 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = sext i31 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
				; VI-NEXT: ret i31 [[TMP4]]
				;
				%shl.lhs = shl i31 %lhs, 7
				%lhs24 = ashr i31 %shl.lhs, 7
				%lshr.rhs = shl i31 %rhs, 7
				%rhs24 = ashr i31 %lhs, 7
				%mul = mul i31 %lhs24, %rhs24
				ret i31 %mul
				}

				; i31 multiply of operands masked with 16777215 (0xffffff); both runs
				; expect a zext / mul.u24 / trunc sequence.
				define i31 @umul24_i31(i31 %lhs, i31 %rhs) {
				; SI-LABEL: @umul24_i31(
				; SI-NEXT: [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
				; SI-NEXT: [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
				; SI-NEXT: [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
				; SI-NEXT: ret i31 [[TMP4]]
				;
				; VI-LABEL: @umul24_i31(
				; VI-NEXT: [[LHS24:%.*]] = and i31 [[LHS:%.*]], 16777215
				; VI-NEXT: [[RHS24:%.*]] = and i31 [[RHS:%.*]], 16777215
				; VI-NEXT: [[TMP1:%.*]] = zext i31 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = zext i31 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = trunc i32 [[TMP3]] to i31
				; VI-NEXT: ret i31 [[TMP4]]
				;
				%lhs24 = and i31 %lhs, 16777215
				%rhs24 = and i31 %rhs, 16777215
				%mul = mul i31 %lhs24, %rhs24
				ret i31 %mul
				}

				; <2 x i31> multiply of operands masked with 16777215 (0xffffff); both
				; runs expect a zext / mul.u24 / trunc sequence per element.
				define <2 x i31> @umul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
				; SI-LABEL: @umul24_v2i31(
				; SI-NEXT: [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]], <i31 16777215, i31 16777215>
				; SI-NEXT: [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]], <i31 16777215, i31 16777215>
				; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
				; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
				; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
				; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
				; SI-NEXT: [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
				; SI-NEXT: [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
				; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
				; SI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
				; SI-NEXT: [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
				; SI-NEXT: [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
				; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
				; SI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
				; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
				; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
				; SI-NEXT: ret <2 x i31> [[TMP14]]
				;
				; VI-LABEL: @umul24_v2i31(
				; VI-NEXT: [[LHS24:%.*]] = and <2 x i31> [[LHS:%.*]], <i31 16777215, i31 16777215>
				; VI-NEXT: [[RHS24:%.*]] = and <2 x i31> [[RHS:%.*]], <i31 16777215, i31 16777215>
				; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
				; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
				; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
				; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
				; VI-NEXT: [[TMP5:%.*]] = zext i31 [[TMP1]] to i32
				; VI-NEXT: [[TMP6:%.*]] = zext i31 [[TMP3]] to i32
				; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP5]], i32 [[TMP6]])
				; VI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
				; VI-NEXT: [[TMP9:%.*]] = zext i31 [[TMP2]] to i32
				; VI-NEXT: [[TMP10:%.*]] = zext i31 [[TMP4]] to i32
				; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP9]], i32 [[TMP10]])
				; VI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
				; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
				; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
				; VI-NEXT: ret <2 x i31> [[TMP14]]
				;
				%lhs24 = and <2 x i31> %lhs, <i31 16777215, i31 16777215>
				%rhs24 = and <2 x i31> %rhs, <i31 16777215, i31 16777215>
				%mul = mul <2 x i31> %lhs24, %rhs24
				ret <2 x i31> %mul
				}

				; <2 x i31> multiply of operands produced by ashr-by-8; both runs expect
				; a sext / mul.i24 / trunc sequence per element.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				define <2 x i31> @smul24_v2i31(<2 x i31> %lhs, <2 x i31> %rhs) {
				; SI-LABEL: @smul24_v2i31(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]], <i31 8, i31 8>
				; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]], <i31 8, i31 8>
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]], <i31 8, i31 8>
				; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i31> [[LHS]], <i31 8, i31 8>
				; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
				; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
				; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
				; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
				; SI-NEXT: [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
				; SI-NEXT: [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
				; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
				; SI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
				; SI-NEXT: [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
				; SI-NEXT: [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
				; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
				; SI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
				; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
				; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
				; SI-NEXT: ret <2 x i31> [[TMP14]]
				;
				; VI-LABEL: @smul24_v2i31(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i31> [[LHS:%.*]], <i31 8, i31 8>
				; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i31> [[SHL_LHS]], <i31 8, i31 8>
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i31> [[RHS:%.*]], <i31 8, i31 8>
				; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i31> [[LHS]], <i31 8, i31 8>
				; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i31> [[LHS24]], i64 0
				; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i31> [[LHS24]], i64 1
				; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i31> [[RHS24]], i64 0
				; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i31> [[RHS24]], i64 1
				; VI-NEXT: [[TMP5:%.*]] = sext i31 [[TMP1]] to i32
				; VI-NEXT: [[TMP6:%.*]] = sext i31 [[TMP3]] to i32
				; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
				; VI-NEXT: [[TMP8:%.*]] = trunc i32 [[TMP7]] to i31
				; VI-NEXT: [[TMP9:%.*]] = sext i31 [[TMP2]] to i32
				; VI-NEXT: [[TMP10:%.*]] = sext i31 [[TMP4]] to i32
				; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
				; VI-NEXT: [[TMP12:%.*]] = trunc i32 [[TMP11]] to i31
				; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i31> undef, i31 [[TMP8]], i64 0
				; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i31> [[TMP13]], i31 [[TMP12]], i64 1
				; VI-NEXT: ret <2 x i31> [[TMP14]]
				;
				%shl.lhs = shl <2 x i31> %lhs, <i31 8, i31 8>
				%lhs24 = ashr <2 x i31> %shl.lhs, <i31 8, i31 8>
				%lshr.rhs = shl <2 x i31> %rhs, <i31 8, i31 8>
				%rhs24 = ashr <2 x i31> %lhs, <i31 8, i31 8>
				%mul = mul <2 x i31> %lhs24, %rhs24
				ret <2 x i31> %mul
				}

				; i33 multiply of operands produced by ashr-by-9; both runs expect a
				; trunc / mul.i24 / sext sequence.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is derived from %lhs
				; rather than from %lshr.rhs -- confirm this is intended.
				; NOTE(review): the i32 intrinsic result is sign-extended back to i33,
				; but the product of two 24-bit operands can need more than 32 bits --
				; confirm this expectation does not encode a miscompile.
				define i33 @smul24_i33(i33 %lhs, i33 %rhs) {
				; SI-LABEL: @smul24_i33(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
				; SI-NEXT: [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
				; SI-NEXT: [[RHS24:%.*]] = ashr i33 [[LHS]], 9
				; SI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
				; SI-NEXT: ret i33 [[TMP4]]
				;
				; VI-LABEL: @smul24_i33(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl i33 [[LHS:%.*]], 9
				; VI-NEXT: [[LHS24:%.*]] = ashr i33 [[SHL_LHS]], 9
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl i33 [[RHS:%.*]], 9
				; VI-NEXT: [[RHS24:%.*]] = ashr i33 [[LHS]], 9
				; VI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = sext i32 [[TMP3]] to i33
				; VI-NEXT: ret i33 [[TMP4]]
				;
				%shl.lhs = shl i33 %lhs, 9
				%lhs24 = ashr i33 %shl.lhs, 9
				%lshr.rhs = shl i33 %rhs, 9
				%rhs24 = ashr i33 %lhs, 9
				%mul = mul i33 %lhs24, %rhs24
				ret i33 %mul
				}

				; Unsigned 24-bit multiply on a type wider than 32 bits. Both operands are
				; masked to their low 24 bits (16777215 = 0xFFFFFF). The expected output
				; truncates them to i32, multiplies with llvm.amdgcn.mul.u24, and
				; zero-extends the result back to i33. Both targets expect the same IR.
				define i33 @umul24_i33(i33 %lhs, i33 %rhs) {
				; SI-LABEL: @umul24_i33(
				; SI-NEXT: [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
				; SI-NEXT: [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
				; SI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
				; SI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
				; SI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; SI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
				; SI-NEXT: ret i33 [[TMP4]]
				;
				; VI-LABEL: @umul24_i33(
				; VI-NEXT: [[LHS24:%.*]] = and i33 [[LHS:%.*]], 16777215
				; VI-NEXT: [[RHS24:%.*]] = and i33 [[RHS:%.*]], 16777215
				; VI-NEXT: [[TMP1:%.*]] = trunc i33 [[LHS24]] to i32
				; VI-NEXT: [[TMP2:%.*]] = trunc i33 [[RHS24]] to i32
				; VI-NEXT: [[TMP3:%.*]] = call i32 @llvm.amdgcn.mul.u24(i32 [[TMP1]], i32 [[TMP2]])
				; VI-NEXT: [[TMP4:%.*]] = zext i32 [[TMP3]] to i33
				; VI-NEXT: ret i33 [[TMP4]]
				;
				%lhs24 = and i33 %lhs, 16777215
				%rhs24 = and i33 %rhs, 16777215
				%mul = mul i33 %lhs24, %rhs24
				ret i33 %mul
				}

				; Negative test: the operands are narrowed with shifts by 7 on i32, i.e. to
				; 25 bits rather than 24, and the check lines expect the plain mul to remain
				; on both targets.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is computed from %lhs rather
				; than from %lshr.rhs. This looks like a copy-paste slip -- confirm before
				; changing it, since the check lines below encode the current operands.
				define i32 @smul25_i32(i32 %lhs, i32 %rhs) {
				; SI-LABEL: @smul25_i32(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
				; SI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
				; SI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 7
				; SI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
				; SI-NEXT: ret i32 [[MUL]]
				;
				; VI-LABEL: @smul25_i32(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl i32 [[LHS:%.*]], 7
				; VI-NEXT: [[LHS24:%.*]] = ashr i32 [[SHL_LHS]], 7
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl i32 [[RHS:%.*]], 7
				; VI-NEXT: [[RHS24:%.*]] = ashr i32 [[LHS]], 7
				; VI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
				; VI-NEXT: ret i32 [[MUL]]
				;
				%shl.lhs = shl i32 %lhs, 7
				%lhs24 = ashr i32 %shl.lhs, 7
				%lshr.rhs = shl i32 %rhs, 7
				%rhs24 = ashr i32 %lhs, 7
				%mul = mul i32 %lhs24, %rhs24
				ret i32 %mul
				}

				; Negative test: the operands are masked to their low 25 bits
				; (33554431 = 0x1FFFFFF), one bit more than mul.u24 handles, and the check
				; lines expect the plain mul to remain on both targets.
				define i32 @umul25_i32(i32 %lhs, i32 %rhs) {
				; SI-LABEL: @umul25_i32(
				; SI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
				; SI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
				; SI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
				; SI-NEXT: ret i32 [[MUL]]
				;
				; VI-LABEL: @umul25_i32(
				; VI-NEXT: [[LHS24:%.*]] = and i32 [[LHS:%.*]], 33554431
				; VI-NEXT: [[RHS24:%.*]] = and i32 [[RHS:%.*]], 33554431
				; VI-NEXT: [[MUL:%.*]] = mul i32 [[LHS24]], [[RHS24]]
				; VI-NEXT: ret i32 [[MUL]]
				;
				%lhs24 = and i32 %lhs, 33554431
				%rhs24 = and i32 %rhs, 33554431
				%mul = mul i32 %lhs24, %rhs24
				ret i32 %mul
				}

				; Vector form of the signed 24-bit multiply on a type wider than 32 bits.
				; The expected output splits the <2 x i33> operands into elements; each
				; element pair is truncated to i32, multiplied with llvm.amdgcn.mul.i24 and
				; sign-extended back to i33, and the vector is rebuilt with insertelement.
				; Both targets expect the same IR.
				; NOTE(review): %lshr.rhs is never used; %rhs24 is computed from %lhs rather
				; than from %lshr.rhs. This looks like a copy-paste slip -- confirm before
				; changing it, since the check lines below encode the current operands.
				define <2 x i33> @smul24_v2i33(<2 x i33> %lhs, <2 x i33> %rhs) {
				; SI-LABEL: @smul24_v2i33(
				; SI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]], <i33 9, i33 9>
				; SI-NEXT: [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
				; SI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
				; SI-NEXT: [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
				; SI-NEXT: [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
				; SI-NEXT: [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
				; SI-NEXT: [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
				; SI-NEXT: [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
				; SI-NEXT: [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
				; SI-NEXT: [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
				; SI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
				; SI-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
				; SI-NEXT: [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
				; SI-NEXT: [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
				; SI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
				; SI-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
				; SI-NEXT: [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
				; SI-NEXT: [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
				; SI-NEXT: ret <2 x i33> [[TMP14]]
				;
				; VI-LABEL: @smul24_v2i33(
				; VI-NEXT: [[SHL_LHS:%.*]] = shl <2 x i33> [[LHS:%.*]], <i33 9, i33 9>
				; VI-NEXT: [[LHS24:%.*]] = ashr <2 x i33> [[SHL_LHS]], <i33 9, i33 9>
				; VI-NEXT: [[LSHR_RHS:%.*]] = shl <2 x i33> [[RHS:%.*]], <i33 9, i33 9>
				; VI-NEXT: [[RHS24:%.*]] = ashr <2 x i33> [[LHS]], <i33 9, i33 9>
				; VI-NEXT: [[TMP1:%.*]] = extractelement <2 x i33> [[LHS24]], i64 0
				; VI-NEXT: [[TMP2:%.*]] = extractelement <2 x i33> [[LHS24]], i64 1
				; VI-NEXT: [[TMP3:%.*]] = extractelement <2 x i33> [[RHS24]], i64 0
				; VI-NEXT: [[TMP4:%.*]] = extractelement <2 x i33> [[RHS24]], i64 1
				; VI-NEXT: [[TMP5:%.*]] = trunc i33 [[TMP1]] to i32
				; VI-NEXT: [[TMP6:%.*]] = trunc i33 [[TMP3]] to i32
				; VI-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP5]], i32 [[TMP6]])
				; VI-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i33
				; VI-NEXT: [[TMP9:%.*]] = trunc i33 [[TMP2]] to i32
				; VI-NEXT: [[TMP10:%.*]] = trunc i33 [[TMP4]] to i32
				; VI-NEXT: [[TMP11:%.*]] = call i32 @llvm.amdgcn.mul.i24(i32 [[TMP9]], i32 [[TMP10]])
				; VI-NEXT: [[TMP12:%.*]] = sext i32 [[TMP11]] to i33
				; VI-NEXT: [[TMP13:%.*]] = insertelement <2 x i33> undef, i33 [[TMP8]], i64 0
				; VI-NEXT: [[TMP14:%.*]] = insertelement <2 x i33> [[TMP13]], i33 [[TMP12]], i64 1
				; VI-NEXT: ret <2 x i33> [[TMP14]]
				;
				%shl.lhs = shl <2 x i33> %lhs, <i33 9, i33 9>
				%lhs24 = ashr <2 x i33> %shl.lhs, <i33 9, i33 9>
				%lshr.rhs = shl <2 x i33> %rhs, <i33 9, i33 9>
				%rhs24 = ashr <2 x i33> %lhs, <i33 9, i33 9>
				%mul = mul <2 x i33> %lhs24, %rhs24
				ret <2 x i33> %mul
				}

test/CodeGen/AMDGPU/llvm.amdgcn.mul.i24.ll

This file was added.

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

				; Checks that a direct call to the llvm.amdgcn.mul.i24 intrinsic is selected
				; to the v_mul_i32_i24 instruction.

				; GCN-LABEL: {{^}}test_mul_i24:
				; GCN: v_mul_i32_i24
				define amdgpu_kernel void @test_mul_i24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
				%val = call i32 @llvm.amdgcn.mul.i24(i32 %src1, i32 %src2) #0
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				declare i32 @llvm.amdgcn.mul.i24(i32, i32) #0

				attributes #0 = { nounwind readnone speculatable }
				attributes #1 = { nounwind }

test/CodeGen/AMDGPU/llvm.amdgcn.mul.u24.ll

This file was added.

				; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN %s

				; Checks that a direct call to the llvm.amdgcn.mul.u24 intrinsic is selected
				; to the v_mul_u32_u24 instruction.

				; GCN-LABEL: {{^}}test_mul_u24:
				; GCN: v_mul_u32_u24
				define amdgpu_kernel void @test_mul_u24(i32 addrspace(1)* %out, i32 %src1, i32 %src2) #1 {
				%val = call i32 @llvm.amdgcn.mul.u24(i32 %src1, i32 %src2) #0
				store i32 %val, i32 addrspace(1)* %out
				ret void
				}

				declare i32 @llvm.amdgcn.mul.u24(i32, i32) #0

				attributes #0 = { nounwind readnone speculatable }
				attributes #1 = { nounwind }

test/CodeGen/AMDGPU/mad_uint24.ll

Show First 20 Lines • Show All 227 Lines • ▼ Show 20 Lines	entry:
%extb = sext i8 %lb to i16		%extb = sext i8 %lb to i16
%extc = sext i8 %lc to i16		%extc = sext i8 %lc to i16
%mul = mul i16 %exta, %extb		%mul = mul i16 %exta, %extb
%mad = add i16 %mul, %extc		%mad = add i16 %mul, %extc
%mad_ext = sext i16 %mad to i64		%mad_ext = sext i16 %mad to i64
store i64 %mad_ext, i64 addrspace(1)* %out		store i64 %mad_ext, i64 addrspace(1)* %out
ret void		ret void
}		}

		; The ands are asserting the high bits are 0. SimplifyDemandedBits on
		; the adds would remove the ands before the target combine on the mul
		; had a chance to form mul24. The mul combine would then see
		; extractelement with no known bits and fail. All of the mul/add
		; combos in this loop should form v_mad_u32_u24.
		;
		; The loop body contains eight mul+add pairs, one for each v_mad_u32_u24
		; check below.

		; FUNC-LABEL: {{^}}mad24_known_bits_destroyed:
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		; GCN: v_mad_u32_u24
		define void @mad24_known_bits_destroyed(i32 %arg, <4 x i32> %arg1, <4 x i32> %arg2, <4 x i32> %arg3, i32 %arg4, i32 %arg5, i32 %arg6, i32 addrspace(1)* %arg7, <4 x i32> addrspace(1)* %arg8) #0 {
		bb:
		; Loop-invariant operands: %arg4 and elements 1-3 of %arg1 are masked to
		; their low 24 bits (16777215 = 0xFFFFFF) and serve as multipliers; elements
		; 1-3 of %arg3 are extracted unmasked and serve as addends.
		%tmp = and i32 %arg4, 16777215
		%tmp9 = extractelement <4 x i32> %arg1, i64 1
		%tmp10 = extractelement <4 x i32> %arg3, i64 1
		%tmp11 = and i32 %tmp9, 16777215
		%tmp12 = extractelement <4 x i32> %arg1, i64 2
		%tmp13 = extractelement <4 x i32> %arg3, i64 2
		%tmp14 = and i32 %tmp12, 16777215
		%tmp15 = extractelement <4 x i32> %arg1, i64 3
		%tmp16 = extractelement <4 x i32> %arg3, i64 3
		%tmp17 = and i32 %tmp15, 16777215
		br label %bb19

		bb18: ; preds = %bb19
		ret void

		bb19: ; preds = %bb19, %bb
		; %tmp20 and %tmp22 carry the previous iteration's scalar and vector results
		; (%tmp40 and %tmp53); %tmp21 is the iteration counter.
		%tmp20 = phi i32 [ %arg, %bb ], [ %tmp40, %bb19 ]
		%tmp21 = phi i32 [ 0, %bb ], [ %tmp54, %bb19 ]
		%tmp22 = phi <4 x i32> [ %arg2, %bb ], [ %tmp53, %bb19 ]
		; First round: mask each loop-carried value to 24 bits, then multiply by the
		; matching invariant multiplier and add the matching addend.
		%tmp23 = and i32 %tmp20, 16777215
		%tmp24 = mul i32 %tmp23, %tmp
		%tmp25 = add i32 %tmp24, %arg5
		%tmp26 = extractelement <4 x i32> %tmp22, i64 1
		%tmp27 = and i32 %tmp26, 16777215
		%tmp28 = mul i32 %tmp27, %tmp11
		%tmp29 = add i32 %tmp28, %tmp10
		%tmp30 = extractelement <4 x i32> %tmp22, i64 2
		%tmp31 = and i32 %tmp30, 16777215
		%tmp32 = mul i32 %tmp31, %tmp14
		%tmp33 = add i32 %tmp32, %tmp13
		%tmp34 = extractelement <4 x i32> %tmp22, i64 3
		%tmp35 = and i32 %tmp34, 16777215
		%tmp36 = mul i32 %tmp35, %tmp17
		%tmp37 = add i32 %tmp36, %tmp16
		; Second round: re-mask each first-round sum and repeat the multiply-add with
		; the same operands. The scalar result is stored through %arg7 and all four
		; results are packed into %tmp53, which is stored through %arg8.
		%tmp38 = and i32 %tmp25, 16777215
		%tmp39 = mul i32 %tmp38, %tmp
		%tmp40 = add i32 %tmp39, %arg5
		store i32 %tmp40, i32 addrspace(1)* %arg7
		%tmp41 = insertelement <4 x i32> undef, i32 %tmp40, i32 0
		%tmp42 = and i32 %tmp29, 16777215
		%tmp43 = mul i32 %tmp42, %tmp11
		%tmp44 = add i32 %tmp43, %tmp10
		%tmp45 = insertelement <4 x i32> %tmp41, i32 %tmp44, i32 1
		%tmp46 = and i32 %tmp33, 16777215
		%tmp47 = mul i32 %tmp46, %tmp14
		%tmp48 = add i32 %tmp47, %tmp13
		%tmp49 = insertelement <4 x i32> %tmp45, i32 %tmp48, i32 2
		%tmp50 = and i32 %tmp37, 16777215
		%tmp51 = mul i32 %tmp50, %tmp17
		%tmp52 = add i32 %tmp51, %tmp16
		%tmp53 = insertelement <4 x i32> %tmp49, i32 %tmp52, i32 3
		store <4 x i32> %tmp53, <4 x i32> addrspace(1)* %arg8
		; Increment the counter and exit once it equals %arg6.
		%tmp54 = add nuw nsw i32 %tmp21, 1
		%tmp55 = icmp eq i32 %tmp54, %arg6
		br i1 %tmp55, label %bb18, label %bb19
		}

		attributes #0 = { norecurse nounwind }

test/CodeGen/AMDGPU/mul.i16.ll

Show All 35 Lines	entry:
%a.val = load i16, i16 addrspace(1)* %a		%a.val = load i16, i16 addrspace(1)* %a
%b.val = load i16, i16 addrspace(1)* %b		%b.val = load i16, i16 addrspace(1)* %b
%r.val = mul i16 %a.val, %b.val		%r.val = mul i16 %a.val, %b.val
store i16 %r.val, i16 addrspace(1)* %r		store i16 %r.val, i16 addrspace(1)* %r
ret void		ret void
}		}

; GCN-LABEL: {{^}}v_mul_v2i16:		; GCN-LABEL: {{^}}v_mul_v2i16:
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa		; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32		; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32		; VI: v_or_b32_e32


; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1		; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v1
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {		define <2 x i16> @v_mul_v2i16(<2 x i16> %a, <2 x i16> %b) {
%r.val = mul <2 x i16> %a, %b		%r.val = mul <2 x i16> %a, %b
ret <2 x i16> %r.val		ret <2 x i16> %r.val
}		}

; FIXME: Unpack garbage on gfx9		; FIXME: Unpack garbage on gfx9
; GCN-LABEL: {{^}}v_mul_v3i16:		; GCN-LABEL: {{^}}v_mul_v3i16:
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24

; VI: v_mul_lo_u16		; VI: v_mul_lo_u16
; VI: v_mul_lo_u16		; VI: v_mul_lo_u16
; VI: v_mul_lo_u16		; VI: v_mul_lo_u16

; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16		; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: v_pk_mul_lo_u16		; GFX9-NEXT: v_pk_mul_lo_u16
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {		define <3 x i16> @v_mul_v3i16(<3 x i16> %a, <3 x i16> %b) {
%r.val = mul <3 x i16> %a, %b		%r.val = mul <3 x i16> %a, %b
ret <3 x i16> %r.val		ret <3 x i16> %r.val
}		}

; GCN-LABEL: {{^}}v_mul_v4i16:		; GCN-LABEL: {{^}}v_mul_v4i16:
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24
; SI: v_mul_lo_u32		; SI: v_mul_u32_u24

; VI: v_mul_lo_u16_sdwa		; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32		; VI: v_mul_lo_u16_e32
; VI: v_mul_lo_u16_sdwa		; VI: v_mul_lo_u16_sdwa
; VI: v_mul_lo_u16_e32		; VI: v_mul_lo_u16_e32
; VI: v_or_b32_e32		; VI: v_or_b32_e32
; VI: v_or_b32_e32		; VI: v_or_b32_e32

; GFX9: s_waitcnt		; GFX9: s_waitcnt
; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2		; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3		; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
; GFX9-NEXT: s_setpc_b64		; GFX9-NEXT: s_setpc_b64
define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {		define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
%r.val = mul <4 x i16> %a, %b		%r.val = mul <4 x i16> %a, %b
ret <4 x i16> %r.val		ret <4 x i16> %r.val
}		}

test/CodeGen/AMDGPU/mul_uint24-amdgcn.ll

	Show First 20 Lines • Show All 243 Lines • ▼ Show 20 Lines
	}			}

	; GCN-LABEL: {{^}}test_umul24_anyextend_i23_src0_src1:			; GCN-LABEL: {{^}}test_umul24_anyextend_i23_src0_src1:
	; GCN: s_mov_b32 [[U23_MASK:s[0-9]+]], 0x7fffff			; GCN: s_mov_b32 [[U23_MASK:s[0-9]+]], 0x7fffff
	; GCN-DAG: v_and_b32_e32 v0, [[U23_MASK]], v0			; GCN-DAG: v_and_b32_e32 v0, [[U23_MASK]], v0
	; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1			; GCN-DAG: v_and_b32_e32 v1, [[U23_MASK]], v1
	; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0			; GCN-DAG: v_mul_u32_u24_e32 v0, 0xea, v0
	; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1			; GCN-DAG: v_mul_u32_u24_e32 v1, 0x39b, v1
	; GCN: v_and_b32_e32 v1, s4, v1			; GCN-DAG: v_and_b32_e32 v1, s4, v1
	; GCN: v_and_b32_e32 v0, 0x7ffffe, v0			; GCN-DAG: v_and_b32_e32 v0, 0x7ffffe, v0
	; GCN: v_mul_u32_u24_e32 v0, v0, v1			; GCN: v_mul_u32_u24_e32 v0, v0, v1
	; GCN: v_and_b32_e32 v0, 0x1fffe, v0			; GCN: v_and_b32_e32 v0, 0x1fffe, v0
	; GCN: v_mul_u32_u24_e32 v0, 0x63, v0			; GCN: v_mul_u32_u24_e32 v0, 0x63, v0
	; GCN: s_setpc_b64			; GCN: s_setpc_b64
	define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {			define i17 @test_umul24_anyextend_i23_src0_src1(i23 %a, i23 %b) {
	entry:			entry:
	%aa = mul i23 %a, 234			%aa = mul i23 %a, 234
	%bb = mul i23 %b, 923			%bb = mul i23 %b, 923
	%a_32 = zext i23 %aa to i32			%a_32 = zext i23 %aa to i32
	%b_32 = zext i23 %bb to i32			%b_32 = zext i23 %bb to i32
	%mul = mul i32 %a_32, %b_32			%mul = mul i32 %a_32, %b_32
	%trunc = trunc i32 %mul to i17			%trunc = trunc i32 %mul to i17
	%arst = mul i17 %trunc, 99			%arst = mul i17 %trunc, 99
	ret i17 %arst			ret i17 %arst
	}			}