Diff 514935

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 9,933 Lines • ▼ Show 20 Lines	static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();		unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
if (ShiftAmt != NarrowVTSize)		if (ShiftAmt != NarrowVTSize)
return SDValue();		return SDValue();

// If the operation feeding into the MUL is a sign extend (sext),		// If the operation feeding into the MUL is a sign extend (sext),
// we use mulhs. Otherwise, zero extends (zext) use mulhu.		// we use mulhs. Otherwise, zero extends (zext) use mulhu.
unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;		unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;

// Combine to mulh if mulh is legal/custom for the narrow type on the target.		// Combine to mulh if mulh is legal/custom for the narrow type on the target
		// or if it is a vector type then we could transform to an acceptable type and
		// rely on legalization to split/combine the result.
		if (NarrowVT.isVector()) {
		sdesmalenUnsubmitted Not Done Reply Inline Actions this check is now irrelevant, you can remove the if/else entirely. sdesmalen: this check is now irrelevant, you can remove the if/else entirely.
		dtemirbulatovAuthorUnsubmitted Done Reply Inline Actions Hmm, I have got several errors on scalar handling of MULH on RISCV and LoongArch targets without isVector() check. dtemirbulatov: Hmm, I have got several errors on scalar handling of MULH on RISCV and LoongArch targets…
		EVT TransformVT = TLI.getTypeToTransformTo(*DAG.getContext(), NarrowVT);
		if (TransformVT.getVectorElementType() != NarrowVT.getVectorElementType() \|\|
		!TLI.isOperationLegalOrCustom(MulhOpcode, TransformVT))
		RKSimonUnsubmitted Done Reply Inline Actions Is there any chance that odd element count vectors can get here? RKSimon: Is there any chance that odd element count vectors can get here?
		return SDValue();
		} else {
if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))		if (!TLI.isOperationLegalOrCustom(MulhOpcode, NarrowVT))
return SDValue();		return SDValue();
		}

		sdesmalenUnsubmitted Done Reply Inline Actions I guess your point here is that if the vector is too wide (i.e. it has too many elements), then we can rely on legalisation of the UMULH/SMULH to do splitting of the operation/vectors, and this is preferred over legalising the extends themselves. So rather than only testing if the operation is Legal/Custom for NarrowVT or its halved type, we actually want to check this for any legalised type of NarrowVT. That means you can use `getTypeToTransformTo`, e.g. EVT TransformVT = TLI.getTypeToTransformTo(DAG.getContext(), NarrowVT); if (!TLI.isOperationLegalOrCustom(Opc, TransformVT)) return SDValue(); sdesmalen:* I guess your point here is that if the vector is too wide (i.e. it has too many elements), then…
SDValue Result =		SDValue Result =
DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);		DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0), MulhRightOp);
return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT)		return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT)
: DAG.getZExtOrTrunc(Result, DL, WideVT));		: DAG.getZExtOrTrunc(Result, DL, WideVT));
}		}

SDValue DAGCombiner::visitSRA(SDNode *N) {		SDValue DAGCombiner::visitSRA(SDNode *N) {
SDValue N0 = N->getOperand(0);		SDValue N0 = N->getOperand(0);
▲ Show 20 Lines • Show All 17,198 Lines • Show Last 20 Lines

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,550 Lines • ▼ Show 20 Lines	if (Subtarget->useSVEForFixedLengthVectors()) {
// Int operations with no NEON support.		// Int operations with no NEON support.
for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,		for (auto VT : {MVT::v8i8, MVT::v16i8, MVT::v4i16, MVT::v8i16,
MVT::v2i32, MVT::v4i32, MVT::v2i64}) {		MVT::v2i32, MVT::v4i32, MVT::v2i64}) {
setOperationAction(ISD::BITREVERSE, VT, Custom);		setOperationAction(ISD::BITREVERSE, VT, Custom);
setOperationAction(ISD::CTTZ, VT, Custom);		setOperationAction(ISD::CTTZ, VT, Custom);
setOperationAction(ISD::VECREDUCE_AND, VT, Custom);		setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
setOperationAction(ISD::VECREDUCE_OR, VT, Custom);		setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);		setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
		setOperationAction(ISD::MULHS, VT, Custom);
		setOperationAction(ISD::MULHU, VT, Custom);
}		}


// Use SVE for vectors with more than 2 elements.		// Use SVE for vectors with more than 2 elements.
for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})		for (auto VT : {MVT::v4f16, MVT::v8f16, MVT::v4f32})
setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);		setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
}		}

▲ Show 20 Lines • Show All 21,096 Lines • ▼ Show 20 Lines	case AArch64ISD::UMINV:
ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);		ReplaceReductionResults(N, Results, DAG, ISD::UMIN, AArch64ISD::UMINV);
return;		return;
case AArch64ISD::SMAXV:		case AArch64ISD::SMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);		ReplaceReductionResults(N, Results, DAG, ISD::SMAX, AArch64ISD::SMAXV);
return;		return;
case AArch64ISD::UMAXV:		case AArch64ISD::UMAXV:
ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);		ReplaceReductionResults(N, Results, DAG, ISD::UMAX, AArch64ISD::UMAXV);
return;		return;
		case ISD::MULHS:
		if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
		Results.push_back(
		LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHS_PRED));
		return;
		case ISD::MULHU:
		if (useSVEForFixedLengthVectorVT(SDValue(N, 0).getValueType()))
		Results.push_back(
		LowerToPredicatedOp(SDValue(N, 0), DAG, AArch64ISD::MULHU_PRED));
		return;
case ISD::FP_TO_UINT:		case ISD::FP_TO_UINT:
case ISD::FP_TO_SINT:		case ISD::FP_TO_SINT:
case ISD::STRICT_FP_TO_SINT:		case ISD::STRICT_FP_TO_SINT:
case ISD::STRICT_FP_TO_UINT:		case ISD::STRICT_FP_TO_UINT:
assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");		assert(N->getValueType(0) == MVT::i128 && "unexpected illegal conversion");
// Let normal code take care of it by not adding anything to Results.		// Let normal code take care of it by not adding anything to Results.
return;		return;
case ISD::ATOMIC_CMP_SWAP:		case ISD::ATOMIC_CMP_SWAP:
▲ Show 20 Lines • Show All 2,039 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py		; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512		; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512		; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -check-prefixes=CHECK,VBITS_GE_512

; This test only tests the legal types for a given vector width, as mulh nodes		; This test only tests the legal types for a given vector width, as mulh nodes
; do not get generated for non-legal types.		; do not get generated for non-legal types.

target triple = "aarch64-unknown-linux-gnu"		target triple = "aarch64-unknown-linux-gnu"

;		;
; SMULH		; SMULH
;		;

; Don't use SVE for 64-bit vectors.		; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {		define <8 x i8> @smulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
		RKSimonUnsubmitted Done Reply Inline Actions remove the FIXME? RKSimon: remove the FIXME?
; CHECK-LABEL: smulh_v8i8:		; CHECK-LABEL: smulh_v8i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ushr v1.8h, v0.8h, #8		; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: umov w8, v1.h[0]		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umov w9, v1.h[1]		; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: fmov s0, w8		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: umov w8, v1.h[2]
; CHECK-NEXT: mov v0.b[1], w9
; CHECK-NEXT: mov v0.b[2], w8
; CHECK-NEXT: umov w8, v1.h[3]
; CHECK-NEXT: mov v0.b[3], w8
; CHECK-NEXT: umov w8, v1.h[4]
; CHECK-NEXT: mov v0.b[4], w8
; CHECK-NEXT: umov w8, v1.h[5]
; CHECK-NEXT: mov v0.b[5], w8
; CHECK-NEXT: umov w8, v1.h[6]
; CHECK-NEXT: mov v0.b[6], w8
; CHECK-NEXT: umov w8, v1.h[7]
; CHECK-NEXT: mov v0.b[7], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%insert = insertelement <8 x i16> undef, i16 8, i64 0		%insert = insertelement <8 x i16> undef, i16 8, i64 0
%splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer		%splat = shufflevector <8 x i16> %insert, <8 x i16> undef, <8 x i32> zeroinitializer
%1 = sext <8 x i8> %op1 to <8 x i16>		%1 = sext <8 x i8> %op1 to <8 x i16>
%2 = sext <8 x i8> %op2 to <8 x i16>		%2 = sext <8 x i8> %op2 to <8 x i16>
%mul = mul <8 x i16> %1, %2		%mul = mul <8 x i16> %1, %2
%shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <8 x i16> %shr to <8 x i8>		%res = trunc <8 x i16> %shr to <8 x i8>
ret <8 x i8> %res		ret <8 x i8> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {		define <16 x i8> @smulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v16i8:		; CHECK-LABEL: smulh_v16i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.8h, v0.16b, v1.16b		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: smull v0.8h, v0.8b, v1.8b		; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z1.b
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <16 x i8> %op1 to <16 x i16>		%1 = sext <16 x i8> %op1 to <16 x i16>
%2 = sext <16 x i8> %op2 to <16 x i16>		%2 = sext <16 x i8> %op2 to <16 x i16>
%mul = mul <16 x i16> %1, %2		%mul = mul <16 x i16> %1, %2
%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}
Show All 16 Lines	; CHECK-NEXT: ret
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
store <32 x i8> %res, ptr %a		store <32 x i8> %res, ptr %a
ret void		ret void
}		}

define void @smulh_v64i8(ptr %a, ptr %b) #0 {		define void @smulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v64i8:		; VBITS_GE_256-LABEL: smulh_v64i8:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32		; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ptrue p0.b, vl32		; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ptrue p1.h, vl16
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]		; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]		; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: sunpklo z4.h, z0.b		; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z2.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_GE_256-NEXT: smulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT: sunpklo z5.h, z1.b		; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16		; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: sunpklo z6.h, z2.b
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: sunpklo z7.h, z3.b
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: sunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: sunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: sunpklo z2.h, z2.b
; VBITS_GE_256-NEXT: sunpklo z3.h, z3.b
; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h
; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h
; VBITS_GE_256-NEXT: movprfx z2, z5
; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h
; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h
; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8
; VBITS_GE_256-NEXT: lsr z3.h, z4.h, #8
; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8
; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8
; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b
; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: smulh_v64i8:		; VBITS_GE_512-LABEL: smulh_v64i8:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64		; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b		; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b
▲ Show 20 Lines • Show All 47 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%2 = sext <256 x i8> %op2 to <256 x i16>		%2 = sext <256 x i8> %op2 to <256 x i16>
%mul = mul <256 x i16> %1, %2		%mul = mul <256 x i16> %1, %2
%shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <256 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 
8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <256 x i16> %shr to <256 x i8>		%res = trunc <256 x i16> %shr to <256 x i8>
store <256 x i8> %res, ptr %a		store <256 x i8> %res, ptr %a
ret void		ret void
}		}

; Don't use SVE for 64-bit vectors.		; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {		define <4 x i16> @smulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
		RKSimonUnsubmitted Done Reply Inline Actions FIXME? RKSimon: FIXME?
; CHECK-LABEL: smulh_v4i16:		; CHECK-LABEL: smulh_v4i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ushr v1.4s, v0.4s, #16		; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: mov w8, v1.s[1]		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mov w9, v1.s[2]		; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: mov v0.16b, v1.16b		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: mov w8, v1.s[3]
; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <4 x i16> %op1 to <4 x i32>		%1 = sext <4 x i16> %op1 to <4 x i32>
%2 = sext <4 x i16> %op2 to <4 x i32>		%2 = sext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2		%mul = mul <4 x i32> %1, %2
%shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>		%shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
%res = trunc <4 x i32> %shr to <4 x i16>		%res = trunc <4 x i32> %shr to <4 x i16>
ret <4 x i16> %res		ret <4 x i16> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {		define <8 x i16> @smulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v8i16:		; CHECK-LABEL: smulh_v8i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.4s, v0.8h, v1.8h		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: smull v0.4s, v0.4h, v1.4h		; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z1.h
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <8 x i16> %op1 to <8 x i32>		%1 = sext <8 x i16> %op1 to <8 x i32>
%2 = sext <8 x i16> %op2 to <8 x i32>		%2 = sext <8 x i16> %op2 to <8 x i32>
%mul = mul <8 x i32> %1, %2		%mul = mul <8 x i32> %1, %2
%shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>		%shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}
Show All 16 Lines	; CHECK-NEXT: ret
%res = trunc <16 x i32> %shr to <16 x i16>		%res = trunc <16 x i32> %shr to <16 x i16>
store <16 x i16> %res, ptr %a		store <16 x i16> %res, ptr %a
ret void		ret void
}		}

define void @smulh_v32i16(ptr %a, ptr %b) #0 {		define void @smulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v32i16:		; VBITS_GE_256-LABEL: smulh_v32i16:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16		; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ptrue p0.h, vl16		; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]		; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]		; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z7.d, z1.d		; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z2.h
; VBITS_GE_256-NEXT: mov z16.d, z3.d		; VBITS_GE_256-NEXT: smulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16		; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: smull2 v4.4s, v0.8h, v2.8h
; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
; VBITS_GE_256-NEXT: smull v5.4s, v0.4h, v2.4h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: smull2 v6.4s, v1.8h, v3.8h
; VBITS_GE_256-NEXT: smull v1.4s, v1.4h, v3.4h
; VBITS_GE_256-NEXT: smull2 v3.4s, v0.8h, v2.8h
; VBITS_GE_256-NEXT: smull v0.4s, v0.4h, v2.4h
; VBITS_GE_256-NEXT: smull2 v2.4s, v7.8h, v16.8h
; VBITS_GE_256-NEXT: smull v7.4s, v7.4h, v16.4h
; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h
; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h
; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h
; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]		; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: smulh_v32i16:		; VBITS_GE_512-LABEL: smulh_v32i16:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32		; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
store <128 x i16> %res, ptr %a		store <128 x i16> %res, ptr %a
ret void		ret void
}		}

; Vector i64 multiplications are not legal for NEON so use SVE when available.		; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {		define <2 x i32> @smulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v2i32:		; CHECK-LABEL: smulh_v2i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.2d, v0.2s, #0		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl2		; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: sshll v1.2d, v1.2s, #0		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d		; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: shrn v0.2s, v0.2d, #32		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <2 x i32> %op1 to <2 x i64>		%1 = sext <2 x i32> %op1 to <2 x i64>
%2 = sext <2 x i32> %op2 to <2 x i64>		%2 = sext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2		%mul = mul <2 x i64> %1, %2
%shr = lshr <2 x i64> %mul, <i64 32, i64 32>		%shr = lshr <2 x i64> %mul, <i64 32, i64 32>
%res = trunc <2 x i64> %shr to <2 x i32>		%res = trunc <2 x i64> %shr to <2 x i32>
ret <2 x i32> %res		ret <2 x i32> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {		define <4 x i32> @smulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: smulh_v4i32:		; CHECK-LABEL: smulh_v4i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: smull2 v2.2d, v0.4s, v1.4s		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: smull v0.2d, v0.2s, v1.2s		; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z1.s
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <4 x i32> %op1 to <4 x i64>		%1 = sext <4 x i32> %op1 to <4 x i64>
%2 = sext <4 x i32> %op2 to <4 x i64>		%2 = sext <4 x i32> %op2 to <4 x i64>
%mul = mul <4 x i64> %1, %2		%mul = mul <4 x i64> %1, %2
%shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>		%shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}
Show All 16 Lines	; CHECK-NEXT: ret
%res = trunc <8 x i64> %shr to <8 x i32>		%res = trunc <8 x i64> %shr to <8 x i32>
store <8 x i32> %res, ptr %a		store <8 x i32> %res, ptr %a
ret void		ret void
}		}

define void @smulh_v16i32(ptr %a, ptr %b) #0 {		define void @smulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v16i32:		; VBITS_GE_256-LABEL: smulh_v16i32:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8		; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8		; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]		; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]		; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z7.d, z1.d		; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z2.s
; VBITS_GE_256-NEXT: mov z16.d, z3.d		; VBITS_GE_256-NEXT: smulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16		; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: smull2 v4.2d, v0.4s, v2.4s
; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
; VBITS_GE_256-NEXT: smull v5.2d, v0.2s, v2.2s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: smull2 v6.2d, v1.4s, v3.4s
; VBITS_GE_256-NEXT: smull v1.2d, v1.2s, v3.2s
; VBITS_GE_256-NEXT: smull2 v3.2d, v0.4s, v2.4s
; VBITS_GE_256-NEXT: smull v0.2d, v0.2s, v2.2s
; VBITS_GE_256-NEXT: smull2 v2.2d, v7.4s, v16.4s
; VBITS_GE_256-NEXT: smull v7.2d, v7.2s, v16.2s
; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s
; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s
; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s
; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s
; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]		; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: smulh_v16i32:		; VBITS_GE_512-LABEL: smulh_v16i32:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16		; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
▲ Show 20 Lines • Show All 107 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%res = trunc <4 x i128> %shr to <4 x i64>		%res = trunc <4 x i128> %shr to <4 x i64>
store <4 x i64> %res, ptr %a		store <4 x i64> %res, ptr %a
ret void		ret void
}		}

define void @smulh_v8i64(ptr %a, ptr %b) #0 {		define void @smulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: smulh_v8i64:		; VBITS_GE_256-LABEL: smulh_v8i64:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4		; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4		; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ptrue p1.d, vl2
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]		; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]		; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x9, v0.d[1]		; VBITS_GE_256-NEXT: smulh z0.d, p0/m, z0.d, z2.d
; VBITS_GE_256-NEXT: fmov x10, d0		; VBITS_GE_256-NEXT: smulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: fmov x17, d2		; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: mov x13, v2.d[1]
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: mov x14, v0.d[1]
; VBITS_GE_256-NEXT: mov x18, v2.d[1]
; VBITS_GE_256-NEXT: smulh x10, x10, x17
; VBITS_GE_256-NEXT: mov x11, v1.d[1]
; VBITS_GE_256-NEXT: fmov x12, d1
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: mov x2, v3.d[1]
; VBITS_GE_256-NEXT: fmov x3, d3
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: smulh x9, x9, x13
; VBITS_GE_256-NEXT: mov x13, v1.d[1]
; VBITS_GE_256-NEXT: smulh x14, x14, x18
; VBITS_GE_256-NEXT: mov x18, v3.d[1]
; VBITS_GE_256-NEXT: smulh x12, x12, x3
; VBITS_GE_256-NEXT: fmov x15, d0
; VBITS_GE_256-NEXT: fmov x16, d1
; VBITS_GE_256-NEXT: fmov x1, d2
; VBITS_GE_256-NEXT: fmov x17, d3
; VBITS_GE_256-NEXT: fmov d0, x9
; VBITS_GE_256-NEXT: fmov d1, x10
; VBITS_GE_256-NEXT: smulh x9, x11, x2
; VBITS_GE_256-NEXT: smulh x15, x15, x1
; VBITS_GE_256-NEXT: fmov d4, x12
; VBITS_GE_256-NEXT: smulh x16, x16, x17
; VBITS_GE_256-NEXT: smulh x10, x13, x18
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: fmov d0, x14
; VBITS_GE_256-NEXT: fmov d2, x15
; VBITS_GE_256-NEXT: fmov d3, x9
; VBITS_GE_256-NEXT: fmov d6, x16
; VBITS_GE_256-NEXT: fmov d5, x10
; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
; VBITS_GE_256-NEXT: mov v4.d[1], v3.d[0]
; VBITS_GE_256-NEXT: mov v6.d[1], v5.d[0]
; VBITS_GE_256-NEXT: splice z1.d, p1, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z6.d
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: smulh_v8i64:		; VBITS_GE_512-LABEL: smulh_v8i64:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8		; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d		; VBITS_GE_512-NEXT: smulh z0.d, p0/m, z0.d, z1.d
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines
; UMULH		; UMULH
;		;

; Don't use SVE for 64-bit vectors.		; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.		; FIXME: The codegen for the >=256 bits case can be improved.
define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {		define <8 x i8> @umulh_v8i8(<8 x i8> %op1, <8 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i8:		; CHECK-LABEL: umulh_v8i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ushr v1.8h, v0.8h, #8		; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: umov w8, v1.h[0]		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: umov w9, v1.h[1]		; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
; CHECK-NEXT: fmov s0, w8		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: umov w8, v1.h[2]
; CHECK-NEXT: mov v0.b[1], w9
; CHECK-NEXT: mov v0.b[2], w8
; CHECK-NEXT: umov w8, v1.h[3]
; CHECK-NEXT: mov v0.b[3], w8
; CHECK-NEXT: umov w8, v1.h[4]
; CHECK-NEXT: mov v0.b[4], w8
; CHECK-NEXT: umov w8, v1.h[5]
; CHECK-NEXT: mov v0.b[5], w8
; CHECK-NEXT: umov w8, v1.h[6]
; CHECK-NEXT: mov v0.b[6], w8
; CHECK-NEXT: umov w8, v1.h[7]
; CHECK-NEXT: mov v0.b[7], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <8 x i8> %op1 to <8 x i16>		%1 = zext <8 x i8> %op1 to <8 x i16>
%2 = zext <8 x i8> %op2 to <8 x i16>		%2 = zext <8 x i8> %op2 to <8 x i16>
%mul = mul <8 x i16> %1, %2		%mul = mul <8 x i16> %1, %2
%shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <8 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <8 x i16> %shr to <8 x i8>		%res = trunc <8 x i16> %shr to <8 x i8>
ret <8 x i8> %res		ret <8 x i8> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {		define <16 x i8> @umulh_v16i8(<16 x i8> %op1, <16 x i8> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v16i8:		; CHECK-LABEL: umulh_v16i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.8h, v0.16b, v1.16b		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: umull v0.8h, v0.8b, v1.8b		; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uzp2 v0.16b, v0.16b, v2.16b		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z1.b
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <16 x i8> %op1 to <16 x i16>		%1 = zext <16 x i8> %op1 to <16 x i16>
%2 = zext <16 x i8> %op2 to <16 x i16>		%2 = zext <16 x i8> %op2 to <16 x i16>
%mul = mul <16 x i16> %1, %2		%mul = mul <16 x i16> %1, %2
%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}
Show All 16 Lines	; CHECK-NEXT: ret
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
store <32 x i8> %res, ptr %a		store <32 x i8> %res, ptr %a
ret void		ret void
}		}

define void @umulh_v64i8(ptr %a, ptr %b) #0 {		define void @umulh_v64i8(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v64i8:		; VBITS_GE_256-LABEL: umulh_v64i8:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov w8, #32		; VBITS_GE_256-NEXT: mov w8, #32 // =0x20
; VBITS_GE_256-NEXT: ptrue p0.b, vl32		; VBITS_GE_256-NEXT: ptrue p0.b, vl32
; VBITS_GE_256-NEXT: ptrue p1.h, vl16
; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]		; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0, x8]
; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]		; VBITS_GE_256-NEXT: ld1b { z2.b }, p0/z, [x1, x8]
; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1b { z3.b }, p0/z, [x1]
; VBITS_GE_256-NEXT: uunpklo z4.h, z0.b		; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z2.b
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_GE_256-NEXT: umulh z1.b, p0/m, z1.b, z3.b
; VBITS_GE_256-NEXT: uunpklo z5.h, z1.b		; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16		; VBITS_GE_256-NEXT: st1b { z1.b }, p0, [x0]
; VBITS_GE_256-NEXT: uunpklo z6.h, z2.b
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: uunpklo z7.h, z3.b
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: uunpklo z0.h, z0.b
; VBITS_GE_256-NEXT: uunpklo z1.h, z1.b
; VBITS_GE_256-NEXT: uunpklo z2.h, z2.b
; VBITS_GE_256-NEXT: uunpklo z3.h, z3.b
; VBITS_GE_256-NEXT: mul z4.h, p1/m, z4.h, z6.h
; VBITS_GE_256-NEXT: mul z0.h, p1/m, z0.h, z2.h
; VBITS_GE_256-NEXT: movprfx z2, z5
; VBITS_GE_256-NEXT: mul z2.h, p1/m, z2.h, z7.h
; VBITS_GE_256-NEXT: mul z1.h, p1/m, z1.h, z3.h
; VBITS_GE_256-NEXT: lsr z0.h, z0.h, #8
; VBITS_GE_256-NEXT: lsr z3.h, z4.h, #8
; VBITS_GE_256-NEXT: lsr z1.h, z1.h, #8
; VBITS_GE_256-NEXT: lsr z2.h, z2.h, #8
; VBITS_GE_256-NEXT: uzp1 z3.b, z3.b, z3.b
; VBITS_GE_256-NEXT: uzp1 z0.b, z0.b, z0.b
; VBITS_GE_256-NEXT: ptrue p1.b, vl16
; VBITS_GE_256-NEXT: uzp1 z2.b, z2.b, z2.b
; VBITS_GE_256-NEXT: uzp1 z1.b, z1.b, z1.b
; VBITS_GE_256-NEXT: splice z3.b, p1, z3.b, z0.b
; VBITS_GE_256-NEXT: splice z2.b, p1, z2.b, z1.b
; VBITS_GE_256-NEXT: st1b { z3.b }, p0, [x0, x8]
; VBITS_GE_256-NEXT: st1b { z2.b }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: umulh_v64i8:		; VBITS_GE_512-LABEL: umulh_v64i8:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.b, vl64		; VBITS_GE_512-NEXT: ptrue p0.b, vl64
; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b		; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b
▲ Show 20 Lines • Show All 52 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
ret void		ret void
}		}

; Don't use SVE for 64-bit vectors.		; Don't use SVE for 64-bit vectors.
; FIXME: The codegen for the >=256 bits case can be improved.		; FIXME: The codegen for the >=256 bits case can be improved.
define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {		define <4 x i16> @umulh_v4i16(<4 x i16> %op1, <4 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i16:		; CHECK-LABEL: umulh_v4i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ushr v1.4s, v0.4s, #16		; CHECK-NEXT: ptrue p0.h, vl4
; CHECK-NEXT: mov w8, v1.s[1]		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mov w9, v1.s[2]		; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
; CHECK-NEXT: mov v0.16b, v1.16b		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: mov v0.h[1], w8
; CHECK-NEXT: mov w8, v1.s[3]
; CHECK-NEXT: mov v0.h[2], w9
; CHECK-NEXT: mov v0.h[3], w8
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <4 x i16> %op1 to <4 x i32>		%1 = zext <4 x i16> %op1 to <4 x i32>
%2 = zext <4 x i16> %op2 to <4 x i32>		%2 = zext <4 x i16> %op2 to <4 x i32>
%mul = mul <4 x i32> %1, %2		%mul = mul <4 x i32> %1, %2
%shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>		%shr = lshr <4 x i32> %mul, <i32 16, i32 16, i32 16, i32 16>
%res = trunc <4 x i32> %shr to <4 x i16>		%res = trunc <4 x i32> %shr to <4 x i16>
ret <4 x i16> %res		ret <4 x i16> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {		define <8 x i16> @umulh_v8i16(<8 x i16> %op1, <8 x i16> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v8i16:		; CHECK-LABEL: umulh_v8i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.4s, v0.8h, v1.8h		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: umull v0.4s, v0.4h, v1.4h		; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: uzp2 v0.8h, v0.8h, v2.8h		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z1.h
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <8 x i16> %op1 to <8 x i32>		%1 = zext <8 x i16> %op1 to <8 x i32>
%2 = zext <8 x i16> %op2 to <8 x i32>		%2 = zext <8 x i16> %op2 to <8 x i32>
%mul = mul <8 x i32> %1, %2		%mul = mul <8 x i32> %1, %2
%shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>		%shr = lshr <8 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}
Show All 16 Lines	; CHECK-NEXT: ret
%res = trunc <16 x i32> %shr to <16 x i16>		%res = trunc <16 x i32> %shr to <16 x i16>
store <16 x i16> %res, ptr %a		store <16 x i16> %res, ptr %a
ret void		ret void
}		}

define void @umulh_v32i16(ptr %a, ptr %b) #0 {		define void @umulh_v32i16(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v32i16:		; VBITS_GE_256-LABEL: umulh_v32i16:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #16		; VBITS_GE_256-NEXT: mov x8, #16 // =0x10
; VBITS_GE_256-NEXT: ptrue p0.h, vl16		; VBITS_GE_256-NEXT: ptrue p0.h, vl16
; VBITS_GE_256-NEXT: ptrue p1.h, vl8
; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]		; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]		; VBITS_GE_256-NEXT: ld1h { z2.h }, p0/z, [x1, x8, lsl #1]
; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1h { z3.h }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z7.d, z1.d		; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z2.h
; VBITS_GE_256-NEXT: mov z16.d, z3.d		; VBITS_GE_256-NEXT: umulh z1.h, p0/m, z1.h, z3.h
; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16		; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: umull2 v4.4s, v0.8h, v2.8h
; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
; VBITS_GE_256-NEXT: umull v5.4s, v0.4h, v2.4h
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: umull2 v6.4s, v1.8h, v3.8h
; VBITS_GE_256-NEXT: umull v1.4s, v1.4h, v3.4h
; VBITS_GE_256-NEXT: umull2 v3.4s, v0.8h, v2.8h
; VBITS_GE_256-NEXT: umull v0.4s, v0.4h, v2.4h
; VBITS_GE_256-NEXT: umull2 v2.4s, v7.8h, v16.8h
; VBITS_GE_256-NEXT: umull v7.4s, v7.4h, v16.4h
; VBITS_GE_256-NEXT: uzp2 v4.8h, v5.8h, v4.8h
; VBITS_GE_256-NEXT: uzp2 v1.8h, v1.8h, v6.8h
; VBITS_GE_256-NEXT: uzp2 v0.8h, v0.8h, v3.8h
; VBITS_GE_256-NEXT: uzp2 v2.8h, v7.8h, v2.8h
; VBITS_GE_256-NEXT: splice z4.h, p1, z4.h, z0.h
; VBITS_GE_256-NEXT: splice z1.h, p1, z1.h, z2.h
; VBITS_GE_256-NEXT: st1h { z4.h }, p0, [x0, x8, lsl #1]
; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]		; VBITS_GE_256-NEXT: st1h { z1.h }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: umulh_v32i16:		; VBITS_GE_512-LABEL: umulh_v32i16:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.h, vl32		; VBITS_GE_512-NEXT: ptrue p0.h, vl32
; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1]
▲ Show 20 Lines • Show All 50 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
store <128 x i16> %res, ptr %a		store <128 x i16> %res, ptr %a
ret void		ret void
}		}

; Vector i64 multiplications are not legal for NEON so use SVE when available.		; Vector i64 multiplications are not legal for NEON so use SVE when available.
define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {		define <2 x i32> @umulh_v2i32(<2 x i32> %op1, <2 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v2i32:		; CHECK-LABEL: umulh_v2i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ushll v0.2d, v0.2s, #0		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.d, vl2		; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: ushll v1.2d, v1.2s, #0		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: mul z0.d, p0/m, z0.d, z1.d		; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: shrn v0.2s, v0.2d, #32		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <2 x i32> %op1 to <2 x i64>		%1 = zext <2 x i32> %op1 to <2 x i64>
%2 = zext <2 x i32> %op2 to <2 x i64>		%2 = zext <2 x i32> %op2 to <2 x i64>
%mul = mul <2 x i64> %1, %2		%mul = mul <2 x i64> %1, %2
%shr = lshr <2 x i64> %mul, <i64 32, i64 32>		%shr = lshr <2 x i64> %mul, <i64 32, i64 32>
%res = trunc <2 x i64> %shr to <2 x i32>		%res = trunc <2 x i64> %shr to <2 x i32>
ret <2 x i32> %res		ret <2 x i32> %res
}		}

; Don't use SVE for 128-bit vectors.		; Don't use SVE for 128-bit vectors.
define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {		define <4 x i32> @umulh_v4i32(<4 x i32> %op1, <4 x i32> %op2) vscale_range(2,0) #0 {
; CHECK-LABEL: umulh_v4i32:		; CHECK-LABEL: umulh_v4i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: umull2 v2.2d, v0.4s, v1.4s		; CHECK-NEXT: // kill: def $q0 killed $q0 def $z0
; CHECK-NEXT: umull v0.2d, v0.2s, v1.2s		; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: uzp2 v0.4s, v0.4s, v2.4s		; CHECK-NEXT: // kill: def $q1 killed $q1 def $z1
		; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z1.s
		; CHECK-NEXT: // kill: def $q0 killed $q0 killed $z0
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = zext <4 x i32> %op1 to <4 x i64>		%1 = zext <4 x i32> %op1 to <4 x i64>
%2 = zext <4 x i32> %op2 to <4 x i64>		%2 = zext <4 x i32> %op2 to <4 x i64>
%mul = mul <4 x i64> %1, %2		%mul = mul <4 x i64> %1, %2
%shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>		%shr = lshr <4 x i64> %mul, <i64 32, i64 32, i64 32, i64 32>
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}
Show All 18 Lines	; CHECK-NEXT: ret
%res = trunc <8 x i64> %shr to <8 x i32>		%res = trunc <8 x i64> %shr to <8 x i32>
store <8 x i32> %res, ptr %a		store <8 x i32> %res, ptr %a
ret void		ret void
}		}

define void @umulh_v16i32(ptr %a, ptr %b) #0 {		define void @umulh_v16i32(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v16i32:		; VBITS_GE_256-LABEL: umulh_v16i32:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #8		; VBITS_GE_256-NEXT: mov x8, #8 // =0x8
; VBITS_GE_256-NEXT: ptrue p0.s, vl8		; VBITS_GE_256-NEXT: ptrue p0.s, vl8
; VBITS_GE_256-NEXT: ptrue p1.s, vl4
; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]		; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]		; VBITS_GE_256-NEXT: ld1w { z2.s }, p0/z, [x1, x8, lsl #2]
; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1w { z3.s }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov z7.d, z1.d		; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z2.s
; VBITS_GE_256-NEXT: mov z16.d, z3.d		; VBITS_GE_256-NEXT: umulh z1.s, p0/m, z1.s, z3.s
; VBITS_GE_256-NEXT: ext z7.b, z7.b, z7.b, #16		; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: umull2 v4.2d, v0.4s, v2.4s
; VBITS_GE_256-NEXT: ext z16.b, z16.b, z3.b, #16
; VBITS_GE_256-NEXT: umull v5.2d, v0.2s, v2.2s
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: umull2 v6.2d, v1.4s, v3.4s
; VBITS_GE_256-NEXT: umull v1.2d, v1.2s, v3.2s
; VBITS_GE_256-NEXT: umull2 v3.2d, v0.4s, v2.4s
; VBITS_GE_256-NEXT: umull v0.2d, v0.2s, v2.2s
; VBITS_GE_256-NEXT: umull2 v2.2d, v7.4s, v16.4s
; VBITS_GE_256-NEXT: umull v7.2d, v7.2s, v16.2s
; VBITS_GE_256-NEXT: uzp2 v4.4s, v5.4s, v4.4s
; VBITS_GE_256-NEXT: uzp2 v1.4s, v1.4s, v6.4s
; VBITS_GE_256-NEXT: uzp2 v0.4s, v0.4s, v3.4s
; VBITS_GE_256-NEXT: uzp2 v2.4s, v7.4s, v2.4s
; VBITS_GE_256-NEXT: splice z4.s, p1, z4.s, z0.s
; VBITS_GE_256-NEXT: splice z1.s, p1, z1.s, z2.s
; VBITS_GE_256-NEXT: st1w { z4.s }, p0, [x0, x8, lsl #2]
; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]		; VBITS_GE_256-NEXT: st1w { z1.s }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: umulh_v16i32:		; VBITS_GE_512-LABEL: umulh_v16i32:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.s, vl16		; VBITS_GE_512-NEXT: ptrue p0.s, vl16
; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1]
▲ Show 20 Lines • Show All 105 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%res = trunc <4 x i128> %shr to <4 x i64>		%res = trunc <4 x i128> %shr to <4 x i64>
store <4 x i64> %res, ptr %a		store <4 x i64> %res, ptr %a
ret void		ret void
}		}

define void @umulh_v8i64(ptr %a, ptr %b) #0 {		define void @umulh_v8i64(ptr %a, ptr %b) #0 {
; VBITS_GE_256-LABEL: umulh_v8i64:		; VBITS_GE_256-LABEL: umulh_v8i64:
; VBITS_GE_256: // %bb.0:		; VBITS_GE_256: // %bb.0:
; VBITS_GE_256-NEXT: mov x8, #4		; VBITS_GE_256-NEXT: mov x8, #4 // =0x4
; VBITS_GE_256-NEXT: ptrue p0.d, vl4		; VBITS_GE_256-NEXT: ptrue p0.d, vl4
; VBITS_GE_256-NEXT: ptrue p1.d, vl2
; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]		; VBITS_GE_256-NEXT: ld1d { z0.d }, p0/z, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]		; VBITS_GE_256-NEXT: ld1d { z1.d }, p0/z, [x0]
; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]		; VBITS_GE_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3]
; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]		; VBITS_GE_256-NEXT: ld1d { z3.d }, p0/z, [x1]
; VBITS_GE_256-NEXT: mov x9, v0.d[1]		; VBITS_GE_256-NEXT: umulh z0.d, p0/m, z0.d, z2.d
; VBITS_GE_256-NEXT: fmov x10, d0		; VBITS_GE_256-NEXT: umulh z1.d, p0/m, z1.d, z3.d
; VBITS_GE_256-NEXT: ext z0.b, z0.b, z0.b, #16		; VBITS_GE_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: fmov x17, d2		; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0]
; VBITS_GE_256-NEXT: mov x13, v2.d[1]
; VBITS_GE_256-NEXT: ext z2.b, z2.b, z2.b, #16
; VBITS_GE_256-NEXT: mov x14, v0.d[1]
; VBITS_GE_256-NEXT: mov x18, v2.d[1]
; VBITS_GE_256-NEXT: umulh x10, x10, x17
; VBITS_GE_256-NEXT: mov x11, v1.d[1]
; VBITS_GE_256-NEXT: fmov x12, d1
; VBITS_GE_256-NEXT: ext z1.b, z1.b, z1.b, #16
; VBITS_GE_256-NEXT: mov x2, v3.d[1]
; VBITS_GE_256-NEXT: fmov x3, d3
; VBITS_GE_256-NEXT: ext z3.b, z3.b, z3.b, #16
; VBITS_GE_256-NEXT: umulh x9, x9, x13
; VBITS_GE_256-NEXT: mov x13, v1.d[1]
; VBITS_GE_256-NEXT: umulh x14, x14, x18
; VBITS_GE_256-NEXT: mov x18, v3.d[1]
; VBITS_GE_256-NEXT: umulh x12, x12, x3
; VBITS_GE_256-NEXT: fmov x15, d0
; VBITS_GE_256-NEXT: fmov x16, d1
; VBITS_GE_256-NEXT: fmov x1, d2
; VBITS_GE_256-NEXT: fmov x17, d3
; VBITS_GE_256-NEXT: fmov d0, x9
; VBITS_GE_256-NEXT: fmov d1, x10
; VBITS_GE_256-NEXT: umulh x9, x11, x2
; VBITS_GE_256-NEXT: umulh x15, x15, x1
; VBITS_GE_256-NEXT: fmov d4, x12
; VBITS_GE_256-NEXT: umulh x16, x16, x17
; VBITS_GE_256-NEXT: umulh x10, x13, x18
; VBITS_GE_256-NEXT: mov v1.d[1], v0.d[0]
; VBITS_GE_256-NEXT: fmov d0, x14
; VBITS_GE_256-NEXT: fmov d2, x15
; VBITS_GE_256-NEXT: fmov d3, x9
; VBITS_GE_256-NEXT: fmov d6, x16
; VBITS_GE_256-NEXT: fmov d5, x10
; VBITS_GE_256-NEXT: mov v2.d[1], v0.d[0]
; VBITS_GE_256-NEXT: mov v4.d[1], v3.d[0]
; VBITS_GE_256-NEXT: mov v6.d[1], v5.d[0]
; VBITS_GE_256-NEXT: splice z1.d, p1, z1.d, z2.d
; VBITS_GE_256-NEXT: st1d { z1.d }, p0, [x0, x8, lsl #3]
; VBITS_GE_256-NEXT: splice z4.d, p1, z4.d, z6.d
; VBITS_GE_256-NEXT: st1d { z4.d }, p0, [x0]
; VBITS_GE_256-NEXT: ret		; VBITS_GE_256-NEXT: ret
;		;
; VBITS_GE_512-LABEL: umulh_v8i64:		; VBITS_GE_512-LABEL: umulh_v8i64:
; VBITS_GE_512: // %bb.0:		; VBITS_GE_512: // %bb.0:
; VBITS_GE_512-NEXT: ptrue p0.d, vl8		; VBITS_GE_512-NEXT: ptrue p0.d, vl8
; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]		; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x0]
; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]		; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1]
; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d		; VBITS_GE_512-NEXT: umulh z0.d, p0/m, z0.d, z1.d
▲ Show 20 Lines • Show All 53 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll

Show First 20 Lines • Show All 65 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}

define void @smulh_v32i8(ptr %a, ptr %b) #0 {		define void @smulh_v32i8(ptr %a, ptr %b) #0 {
; CHECK-LABEL: smulh_v32i8:		; CHECK-LABEL: smulh_v32i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q1, q0, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl8		; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: sunpklo z4.h, z1.b		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8		; CHECK-NEXT: smulh z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: sunpklo z1.h, z1.b		; CHECK-NEXT: smulh z1.b, p0/m, z1.b, z3.b
; CHECK-NEXT: ldp q3, q2, [x1]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: sunpklo z5.h, z0.b
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: sunpklo z0.h, z0.b
; CHECK-NEXT: sunpklo z6.h, z3.b
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: sunpklo z3.h, z3.b
; CHECK-NEXT: sunpklo z7.h, z2.b
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: sunpklo z2.h, z2.b
; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: movprfx z2, z5
; CHECK-NEXT: mul z2.h, p0/m, z2.h, z7.h
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: mul z3.h, p0/m, z3.h, z6.h
; CHECK-NEXT: lsr z1.h, z1.h, #8
; CHECK-NEXT: lsr z3.h, z3.h, #8
; CHECK-NEXT: lsr z2.h, z2.h, #8
; CHECK-NEXT: lsr z0.h, z0.h, #8
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b
; CHECK-NEXT: stp q3, q2, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a		%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b		%op2 = load <32 x i8>, ptr %b
%1 = sext <32 x i8> %op1 to <32 x i16>		%1 = sext <32 x i8> %op1 to <32 x i16>
%2 = sext <32 x i8> %op2 to <32 x i16>		%2 = sext <32 x i8> %op2 to <32 x i16>
%mul = mul <32 x i16> %1, %2		%mul = mul <32 x i16> %1, %2
%shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
store <32 x i8> %res, ptr %a		store <32 x i8> %res, ptr %a
ret void		ret void
}		}

define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 {		define <2 x i16> @smulh_v2i16(<2 x i16> %op1, <2 x i16> %op2) #0 {
; CHECK-LABEL: smulh_v2i16:		; CHECK-LABEL: smulh_v2i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1		; CHECK-NEXT: // kill: def $d1 killed $d1 def $z1
; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0		; CHECK-NEXT: // kill: def $d0 killed $d0 def $z0
; CHECK-NEXT: ptrue p0.s, vl2		; CHECK-NEXT: ptrue p0.s, vl2
; CHECK-NEXT: sxth z0.s, p0/m, z0.s		; CHECK-NEXT: sxth z0.s, p0/m, z0.s
; CHECK-NEXT: sxth z1.s, p0/m, z1.s		; CHECK-NEXT: sxth z1.s, p0/m, z1.s
; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s		; CHECK-NEXT: mul z0.s, p0/m, z0.s, z1.s
; CHECK-NEXT: lsr z0.s, z0.s, #16		; CHECK-NEXT: lsr z0.s, z0.s, #16
; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0		; CHECK-NEXT: // kill: def $d0 killed $d0 killed $z0
		sdesmalenUnsubmitted Not Done Reply Inline Actions This doesn't seem right because this instruction is sign-extending i32 elements, not the i16 elements that are passed in as the arguments. I would have expected `smulh z0.h, p0/m, z0.h, z1.h` instead. sdesmalen: This doesn't seem right because this instruction is sign-extending i32 elements, not the i16…
		sdesmalenUnsubmitted Done Reply Inline Actions I still don't think this is correct, because it's not keeping the correct part of the result. I think the promotion could work by shifting the operands to the left by 16bits, doing the smulh on 32-bit elements, and then shifting the result right again by 16bits. But this is probably less efficient than the original code (sign-extend + mul + shift-right), so it seems better to avoid combining this case entirely and in the DAGCombine just add a check that the transformed type's element type is the same as the element type of NarrowVT (otherwise that would imply the need for type promotion) sdesmalen: I still don't think this is correct, because it's not keeping the correct part of the result. I…
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%1 = sext <2 x i16> %op1 to <2 x i32>		%1 = sext <2 x i16> %op1 to <2 x i32>
%2 = sext <2 x i16> %op2 to <2 x i32>		%2 = sext <2 x i16> %op2 to <2 x i32>
%mul = mul <2 x i32> %1, %2		%mul = mul <2 x i32> %1, %2
%shr = lshr <2 x i32> %mul, <i32 16, i32 16>		%shr = lshr <2 x i32> %mul, <i32 16, i32 16>
%res = trunc <2 x i32> %shr to <2 x i16>		%res = trunc <2 x i32> %shr to <2 x i16>
ret <2 x i16> %res		ret <2 x i16> %res
}		}
Show All 31 Lines	; CHECK-NEXT: ret
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}

define void @smulh_v16i16(ptr %a, ptr %b) #0 {		define void @smulh_v16i16(ptr %a, ptr %b) #0 {
; CHECK-LABEL: smulh_v16i16:		; CHECK-LABEL: smulh_v16i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl4		; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h		; CHECK-NEXT: smulh z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: mov z2.d, z3.d
; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h		; CHECK-NEXT: smulh z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: smulh z3.h, p0/m, z3.h, z6.h
; CHECK-NEXT: smulh z2.h, p0/m, z2.h, z4.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z3.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h
; CHECK-NEXT: stp q0, q1, [x0]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <16 x i16>, ptr %a		%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b		%op2 = load <16 x i16>, ptr %b
%1 = sext <16 x i16> %op1 to <16 x i32>		%1 = sext <16 x i16> %op1 to <16 x i32>
%2 = sext <16 x i16> %op2 to <16 x i32>		%2 = sext <16 x i16> %op2 to <16 x i32>
%mul = mul <16 x i32> %1, %2		%mul = mul <16 x i32> %1, %2
%shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>		%shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
Show All 35 Lines	; CHECK-NEXT: ret
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define void @smulh_v8i32(ptr %a, ptr %b) #0 {		define void @smulh_v8i32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: smulh_v8i32:		; CHECK-LABEL: smulh_v8i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.s, vl2		; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s		; CHECK-NEXT: smulh z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: mov z2.d, z3.d
; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s		; CHECK-NEXT: smulh z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: smulh z3.s, p0/m, z3.s, z6.s
; CHECK-NEXT: smulh z2.s, p0/m, z2.s, z4.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z3.s
; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s
; CHECK-NEXT: stp q0, q1, [x0]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a		%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b		%op2 = load <8 x i32>, ptr %b
%1 = sext <8 x i32> %op1 to <8 x i64>		%1 = sext <8 x i32> %op1 to <8 x i64>
%2 = sext <8 x i32> %op2 to <8 x i64>		%2 = sext <8 x i32> %op2 to <8 x i64>
%mul = mul <8 x i64> %1, %2		%mul = mul <8 x i64> %1, %2
%shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>		%shr = lshr <8 x i64> %mul, <i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32, i64 32>
Show All 37 Lines	; CHECK-NEXT: ret
%res = trunc <2 x i128> %shr to <2 x i64>		%res = trunc <2 x i128> %shr to <2 x i64>
ret <2 x i64> %res		ret <2 x i64> %res
}		}

define void @smulh_v4i64(ptr %a, ptr %b) #0 {		define void @smulh_v4i64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: smulh_v4i64:		; CHECK-LABEL: smulh_v4i64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl1		; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d[1]		; CHECK-NEXT: smulh z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: fmov x8, d1		; CHECK-NEXT: smulh z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT: mov z1.d, z0.d[1]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: fmov x13, d4
; CHECK-NEXT: fmov x10, d1
; CHECK-NEXT: mov z0.d, z2.d[1]
; CHECK-NEXT: fmov x12, d2
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov z0.d, z3.d[1]
; CHECK-NEXT: fmov x14, d0
; CHECK-NEXT: smulh x9, x9, x12
; CHECK-NEXT: smulh x10, x10, x11
; CHECK-NEXT: fmov x11, d3
; CHECK-NEXT: smulh x12, x13, x14
; CHECK-NEXT: smulh x8, x8, x11
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fmov d1, x10
; CHECK-NEXT: fmov d3, x12
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a		%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b		%op2 = load <4 x i64>, ptr %b
%1 = sext <4 x i64> %op1 to <4 x i128>		%1 = sext <4 x i64> %op1 to <4 x i128>
%2 = sext <4 x i64> %op2 to <4 x i128>		%2 = sext <4 x i64> %op2 to <4 x i128>
%mul = mul <4 x i128> %1, %2		%mul = mul <4 x i128> %1, %2
%shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>		%shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
%res = trunc <4 x i128> %shr to <4 x i64>		%res = trunc <4 x i128> %shr to <4 x i64>
▲ Show 20 Lines • Show All 57 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <16 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}

define void @umulh_v32i8(ptr %a, ptr %b) #0 {		define void @umulh_v32i8(ptr %a, ptr %b) #0 {
; CHECK-LABEL: umulh_v32i8:		; CHECK-LABEL: umulh_v32i8:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q1, q0, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl8		; CHECK-NEXT: ptrue p0.b, vl16
; CHECK-NEXT: uunpklo z4.h, z1.b		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: ext z1.b, z1.b, z1.b, #8		; CHECK-NEXT: umulh z0.b, p0/m, z0.b, z2.b
; CHECK-NEXT: uunpklo z1.h, z1.b		; CHECK-NEXT: umulh z1.b, p0/m, z1.b, z3.b
; CHECK-NEXT: ldp q3, q2, [x1]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: uunpklo z5.h, z0.b
; CHECK-NEXT: ext z0.b, z0.b, z0.b, #8
; CHECK-NEXT: uunpklo z0.h, z0.b
; CHECK-NEXT: uunpklo z6.h, z3.b
; CHECK-NEXT: ext z3.b, z3.b, z3.b, #8
; CHECK-NEXT: uunpklo z3.h, z3.b
; CHECK-NEXT: uunpklo z7.h, z2.b
; CHECK-NEXT: ext z2.b, z2.b, z2.b, #8
; CHECK-NEXT: uunpklo z2.h, z2.b
; CHECK-NEXT: mul z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: mul z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: movprfx z2, z5
; CHECK-NEXT: mul z2.h, p0/m, z2.h, z7.h
; CHECK-NEXT: movprfx z3, z4
; CHECK-NEXT: mul z3.h, p0/m, z3.h, z6.h
; CHECK-NEXT: lsr z1.h, z1.h, #8
; CHECK-NEXT: lsr z3.h, z3.h, #8
; CHECK-NEXT: lsr z2.h, z2.h, #8
; CHECK-NEXT: lsr z0.h, z0.h, #8
; CHECK-NEXT: ptrue p0.b, vl8
; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
; CHECK-NEXT: uzp1 z1.b, z1.b, z1.b
; CHECK-NEXT: uzp1 z3.b, z3.b, z3.b
; CHECK-NEXT: uzp1 z2.b, z2.b, z2.b
; CHECK-NEXT: splice z3.b, p0, z3.b, z1.b
; CHECK-NEXT: splice z2.b, p0, z2.b, z0.b
; CHECK-NEXT: stp q3, q2, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <32 x i8>, ptr %a		%op1 = load <32 x i8>, ptr %a
%op2 = load <32 x i8>, ptr %b		%op2 = load <32 x i8>, ptr %b
%1 = zext <32 x i8> %op1 to <32 x i16>		%1 = zext <32 x i8> %op1 to <32 x i16>
%2 = zext <32 x i8> %op2 to <32 x i16>		%2 = zext <32 x i8> %op2 to <32 x i16>
%mul = mul <32 x i16> %1, %2		%mul = mul <32 x i16> %1, %2
%shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>		%shr = lshr <32 x i16> %mul, <i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8, i16 8>
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines	; CHECK-NEXT: ret
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}

define void @umulh_v16i16(ptr %a, ptr %b) #0 {		define void @umulh_v16i16(ptr %a, ptr %b) #0 {
; CHECK-LABEL: umulh_v16i16:		; CHECK-LABEL: umulh_v16i16:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.h, vl4		; CHECK-NEXT: ptrue p0.h, vl8
; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h		; CHECK-NEXT: umulh z0.h, p0/m, z0.h, z2.h
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: mov z2.d, z3.d
; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h		; CHECK-NEXT: umulh z1.h, p0/m, z1.h, z3.h
; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: umulh z3.h, p0/m, z3.h, z6.h
; CHECK-NEXT: umulh z2.h, p0/m, z2.h, z4.h
; CHECK-NEXT: splice z0.h, p0, z0.h, z3.h
; CHECK-NEXT: splice z1.h, p0, z1.h, z2.h
; CHECK-NEXT: stp q0, q1, [x0]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <16 x i16>, ptr %a		%op1 = load <16 x i16>, ptr %a
%op2 = load <16 x i16>, ptr %b		%op2 = load <16 x i16>, ptr %b
%1 = zext <16 x i16> %op1 to <16 x i32>		%1 = zext <16 x i16> %op1 to <16 x i32>
%2 = zext <16 x i16> %op2 to <16 x i32>		%2 = zext <16 x i16> %op2 to <16 x i32>
%mul = mul <16 x i32> %1, %2		%mul = mul <16 x i32> %1, %2
%shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>		%shr = lshr <16 x i32> %mul, <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
Show All 35 Lines	; CHECK-NEXT: ret
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define void @umulh_v8i32(ptr %a, ptr %b) #0 {		define void @umulh_v8i32(ptr %a, ptr %b) #0 {
; CHECK-LABEL: umulh_v8i32:		; CHECK-LABEL: umulh_v8i32:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.s, vl2		; CHECK-NEXT: ptrue p0.s, vl4
; CHECK-NEXT: mov z5.d, z0.d
; CHECK-NEXT: ext z5.b, z5.b, z5.b, #8
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d
; CHECK-NEXT: ext z4.b, z4.b, z4.b, #8
; CHECK-NEXT: mov z6.d, z2.d
; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s		; CHECK-NEXT: umulh z0.s, p0/m, z0.s, z2.s
; CHECK-NEXT: ext z6.b, z6.b, z6.b, #8
; CHECK-NEXT: mov z2.d, z3.d
; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s		; CHECK-NEXT: umulh z1.s, p0/m, z1.s, z3.s
; CHECK-NEXT: ext z2.b, z2.b, z3.b, #8
; CHECK-NEXT: movprfx z3, z5
; CHECK-NEXT: umulh z3.s, p0/m, z3.s, z6.s
; CHECK-NEXT: umulh z2.s, p0/m, z2.s, z4.s
; CHECK-NEXT: splice z0.s, p0, z0.s, z3.s
; CHECK-NEXT: splice z1.s, p0, z1.s, z2.s
; CHECK-NEXT: stp q0, q1, [x0]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <8 x i32>, ptr %a		%op1 = load <8 x i32>, ptr %a
%op2 = load <8 x i32>, ptr %b		%op2 = load <8 x i32>, ptr %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0		%insert = insertelement <8 x i64> undef, i64 32, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer		%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i32> %op1 to <8 x i64>		%1 = zext <8 x i32> %op1 to <8 x i64>
%2 = zext <8 x i32> %op2 to <8 x i64>		%2 = zext <8 x i32> %op2 to <8 x i64>
Show All 37 Lines	; CHECK-NEXT: ret
%res = trunc <2 x i128> %shr to <2 x i64>		%res = trunc <2 x i128> %shr to <2 x i64>
ret <2 x i64> %res		ret <2 x i64> %res
}		}

define void @umulh_v4i64(ptr %a, ptr %b) #0 {		define void @umulh_v4i64(ptr %a, ptr %b) #0 {
; CHECK-LABEL: umulh_v4i64:		; CHECK-LABEL: umulh_v4i64:
; CHECK: // %bb.0:		; CHECK: // %bb.0:
; CHECK-NEXT: ldp q0, q1, [x0]		; CHECK-NEXT: ldp q0, q1, [x0]
; CHECK-NEXT: ptrue p0.d, vl1		; CHECK-NEXT: ptrue p0.d, vl2
; CHECK-NEXT: fmov x9, d0
; CHECK-NEXT: ldp q2, q3, [x1]		; CHECK-NEXT: ldp q2, q3, [x1]
; CHECK-NEXT: mov z4.d, z1.d[1]		; CHECK-NEXT: umulh z0.d, p0/m, z0.d, z2.d
; CHECK-NEXT: fmov x8, d1		; CHECK-NEXT: umulh z1.d, p0/m, z1.d, z3.d
; CHECK-NEXT: mov z1.d, z0.d[1]		; CHECK-NEXT: stp q0, q1, [x0]
; CHECK-NEXT: fmov x13, d4
; CHECK-NEXT: fmov x10, d1
; CHECK-NEXT: mov z0.d, z2.d[1]
; CHECK-NEXT: fmov x12, d2
; CHECK-NEXT: fmov x11, d0
; CHECK-NEXT: mov z0.d, z3.d[1]
; CHECK-NEXT: fmov x14, d0
; CHECK-NEXT: umulh x9, x9, x12
; CHECK-NEXT: umulh x10, x10, x11
; CHECK-NEXT: fmov x11, d3
; CHECK-NEXT: umulh x12, x13, x14
; CHECK-NEXT: umulh x8, x8, x11
; CHECK-NEXT: fmov d0, x9
; CHECK-NEXT: fmov d1, x10
; CHECK-NEXT: fmov d3, x12
; CHECK-NEXT: fmov d2, x8
; CHECK-NEXT: splice z0.d, p0, z0.d, z1.d
; CHECK-NEXT: splice z2.d, p0, z2.d, z3.d
; CHECK-NEXT: stp q0, q2, [x0]
; CHECK-NEXT: ret		; CHECK-NEXT: ret
%op1 = load <4 x i64>, ptr %a		%op1 = load <4 x i64>, ptr %a
%op2 = load <4 x i64>, ptr %b		%op2 = load <4 x i64>, ptr %b
%1 = zext <4 x i64> %op1 to <4 x i128>		%1 = zext <4 x i64> %op1 to <4 x i128>
%2 = zext <4 x i64> %op2 to <4 x i128>		%2 = zext <4 x i64> %op2 to <4 x i128>
%mul = mul <4 x i128> %1, %2		%mul = mul <4 x i128> %1, %2
%shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>		%shr = lshr <4 x i128> %mul, <i128 64, i128 64, i128 64, i128 64>
%res = trunc <4 x i128> %shr to <4 x i64>		%res = trunc <4 x i128> %shr to <4 x i64>
store <4 x i64> %res, ptr %a		store <4 x i64> %res, ptr %a
ret void		ret void
}		}

attributes #0 = { "target-features"="+sve" }		attributes #0 = { "target-features"="+sve" }

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine][AArch64][CodeGen] Allow vectors that are transformable to a legal type for MULH lowering, and use SVE's MULH for fixed vector types.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 514935

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll

This is an archive of the discontinued LLVM Phabricator instance.

[DAGCombine][AArch64][CodeGen] Allow vectors that are transformable to a legal type for MULH lowering, and use SVE's MULH for fixed vector types.
ClosedPublic

Details

Diff Detail

Event Timeline

Revision Contents

Diff 514935

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-mulh.ll

[DAGCombine][AArch64][CodeGen] Allow vectors that are transformable to a legal type for MULH lowering, and use SVE's MULH for fixed vector types.
ClosedPublic