This is an archive of the discontinued LLVM Phabricator instance.

Paths

Table of Contents

-
llvm/
-
lib/Target/AArch64/
-
Target/
-
AArch64/
-
AArch64ISelLowering.cpp
-
test/CodeGen/AArch64/
-
CodeGen/
-
AArch64/
2
sve-fixed-length-ext-loads.ll
-
sve-fixed-length-int-mulh.ll

Differential D107057

[llvm][sve] Lowering for VLS extending loads
ClosedPublic

Authored by DavidTruby on Jul 29 2021, 4:57 AM.

Download Raw Diff

Details

Reviewers

efriedma
peterwaller-arm
bsmith

Commits

rG9c47d6b48d6b: [llvm][sve] Lowering for VLS extending loads

Summary

This patch enables extending loads for fixed length SVE code generation.

There is a slight regression here in the mulh tests: because these tests
load the parameter and then extend it, each load/extend pair is merged
into an extending load, which prevents the mulh instruction from being
generated. As this affects scalable SVE codegen as well, it should be
addressed in a separate patch.

Diff Detail

Repository: rG LLVM Github Monorepo

Event Timeline

DavidTruby created this revision.Jul 29 2021, 4:57 AM

Herald added a reviewer: efriedma. · View Herald TranscriptJul 29 2021, 4:57 AM

Herald added subscribers: ctetreau, psnobl, hiraditya, tschuett. · View Herald Transcript

DavidTruby requested review of this revision.Jul 29 2021, 4:57 AM

Herald added a project: Restricted Project. · View Herald TranscriptJul 29 2021, 4:57 AM

Herald added a subscriber: llvm-commits. · View Herald Transcript

DavidTruby added reviewers: peterwaller-arm, bsmith.Jul 29 2021, 4:57 AM

Matt added a subscriber: Matt.Jul 29 2021, 5:14 AM

Harbormaster completed remote builds in B116935: Diff 362723.Jul 29 2021, 6:17 AM

junparser added a subscriber: junparser.Aug 3 2021, 4:50 AM

bsmith added inline comments.Aug 3 2021, 8:41 AM

llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
51–64	The codegen in the type legalisation cases seems a bit odd, why is this not using SVE to do the extending load?

efriedma added inline comments.Aug 3 2021, 11:58 AM

llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll
51–64	The fact that legalization goes through the stack is obviously just a missed optimization. The way type legalization works, it will see that `<16 x i16>` is legal, so we do a `<16 x i16>` load. Then we have an extend of that load to an illegal type. This gets split into two parts: extract/extend the low half, then extract/extend the high half. If we optimized that correctly, it would come out to three instructions: ld1h, followed by uunpklo/uunpkhi. Whether that's the best approach probably depends on the target and the types involved. If extending vector loads are reasonably fast, maybe we just want to generate more of them.

Other than the clang-format nit, LGTM

This revision is now accepted and ready to land.Aug 6 2021, 7:39 AM

This revision was landed with ongoing or failed builds.Aug 12 2021, 2:43 AM

Closed by commit rG9c47d6b48d6b: [llvm][sve] Lowering for VLS extending loads (authored by DavidTruby). · Explain Why

This revision was automatically updated to reflect the committed changes.

DavidTruby added a commit: rG9c47d6b48d6b: [llvm][sve] Lowering for VLS extending loads.

Revision Contents

Path

Size

llvm/

lib/

Target/

AArch64/

AArch64ISelLowering.cpp

6 lines

test/

CodeGen/

AArch64/

sve-fixed-length-ext-loads.ll

225 lines

sve-fixed-length-int-mulh.ll

358 lines

Diff 365948

llvm/lib/Target/AArch64/AArch64ISelLowering.cpp

This file is larger than 256 KB, so syntax highlighting is disabled by default.

Show First 20 Lines • Show All 1,510 Lines • ▼ Show 20 Lines	if (VT.isFloatingPoint()) {
setCondCodeAction(ISD::SETUNE, VT, Expand);		setCondCodeAction(ISD::SETUNE, VT, Expand);
}		}

// Mark integer truncating stores as having custom lowering		// Mark integer truncating stores as having custom lowering
if (VT.isInteger()) {		if (VT.isInteger()) {
MVT InnerVT = VT.changeVectorElementType(MVT::i8);		MVT InnerVT = VT.changeVectorElementType(MVT::i8);
while (InnerVT != VT) {		while (InnerVT != VT) {
setTruncStoreAction(VT, InnerVT, Custom);		setTruncStoreAction(VT, InnerVT, Custom);
		setLoadExtAction(ISD::ZEXTLOAD, VT, InnerVT, Custom);
		setLoadExtAction(ISD::SEXTLOAD, VT, InnerVT, Custom);
InnerVT = InnerVT.changeVectorElementType(		InnerVT = InnerVT.changeVectorElementType(
MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));		MVT::getIntegerVT(2 * InnerVT.getScalarSizeInBits()));
}		}
}		}

// Lower fixed length vector operations to scalable equivalents.		// Lower fixed length vector operations to scalable equivalents.
setOperationAction(ISD::ABS, VT, Custom);		setOperationAction(ISD::ABS, VT, Custom);
setOperationAction(ISD::ADD, VT, Custom);		setOperationAction(ISD::ADD, VT, Custom);
▲ Show 20 Lines • Show All 2,644 Lines • ▼ Show 20 Lines	bool AArch64TargetLowering::shouldRemoveExtendFromGSIndex(EVT VT) const {
if (VT.getVectorElementType() == MVT::i32 &&		if (VT.getVectorElementType() == MVT::i32 &&
VT.getVectorElementCount().getKnownMinValue() >= 4)		VT.getVectorElementCount().getKnownMinValue() >= 4)
return true;		return true;

return false;		return false;
}		}

bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {		bool AArch64TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return ExtVal.getValueType().isScalableVector();		return ExtVal.getValueType().isScalableVector() ||
		useSVEForFixedLengthVectorVT(ExtVal.getValueType(),
		/*OverrideNEON=*/true);
}		}

unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {		unsigned getGatherVecOpcode(bool IsScaled, bool IsSigned, bool NeedsExtend) {
std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {		std::map<std::tuple<bool, bool, bool>, unsigned> AddrModes = {
{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),		{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ false),
AArch64ISD::GLD1_MERGE_ZERO},		AArch64ISD::GLD1_MERGE_ZERO},
{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),		{std::make_tuple(/*Scaled*/ false, /*Signed*/ false, /*Extend*/ true),
AArch64ISD::GLD1_UXTW_MERGE_ZERO},		AArch64ISD::GLD1_UXTW_MERGE_ZERO},
▲ Show 20 Lines • Show All 14,685 Lines • Show Last 20 Lines

llvm/test/CodeGen/AArch64/sve-fixed-length-ext-loads.ll

This file was added.

				; RUN: llc -aarch64-sve-vector-bits-min=128 < %s \| FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
				; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
				; RUN: llc -aarch64-sve-vector-bits-min=384 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
				; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=640 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=768 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=896 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512
				; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512

				target triple = "aarch64-unknown-linux-gnu"

				; Don't use SVE when its registers are no bigger than NEON.
				; NO_SVE-NOT: ptrue

				define <4 x i32> @load_zext_v4i16i32(<4 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v4i16i32
				; CHECK: ldr d[[D0:[0-9]+]], [x0]
				; CHECK-NEXT: ushll v[[D0]].4s, v[[D0]].4h, #0
				; CHECK-NEXT: ret
				%a = load <4 x i16>, <4 x i16>* %ap
				%val = zext <4 x i16> %a to <4 x i32>
				ret <4 x i32> %val
				}

				define <8 x i32> @load_zext_v8i16i32(<8 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v8i16i32
				; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
				; CHECK-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; CHECK-NEXT: ret
				%a = load <8 x i16>, <8 x i16>* %ap
				%val = zext <8 x i16> %a to <8 x i32>
				ret <8 x i32> %val
				}

				define <16 x i32> @load_zext_v16i16i32(<16 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v16i16i32
				; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_512-NEXT: ret

				; Ensure sensible type legalisation
				; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
				; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
				; VBITS_EQ_256-DAG: mov x9, sp
				; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
				; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
				; VBITS_EQ_256-DAG: add x9, x8, #32
				; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
				; VBITS_EQ_256-DAG: uunpklo z[[R0]].s, z[[R0]].h
				; VBITS_EQ_256-DAG: uunpklo z[[R1]].s, z[[R1]].h
				; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
				; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
				; VBITS_EQ_256-DAG: ret
				%a = load <16 x i16>, <16 x i16>* %ap
				bsmithUnsubmitted Not Done Reply Inline Actions The codegen in the type legalisation cases seems a bit odd, why is this not using SVE to do the extending load? bsmith: The codegen in the type legalisation cases seems a bit odd, why is this not using SVE to do the…
				efriedmaUnsubmitted Not Done Reply Inline Actions The fact that legalization goes through the stack is obviously just a missed optimization. The way type legalization works, it will see that `<16 x i16>` is legal, so we do a `<16 x i16>` load. Then we have an extend of that load to an illegal type. This gets split into two parts: extract/extend the low half, then extract/extend the high half. If we optimized that correctly, it would come out to three instructions: ld1h, followed by uunpklo/uunpkhi. Whether that's the best approach probably depends on the target and the types involved. If extending vector loads are reasonably fast, maybe we just want to generate more of them. efriedma: The fact that legalization goes through the stack is obviously just a missed optimization. The…
				%val = zext <16 x i16> %a to <16 x i32>
				ret <16 x i32> %val
				}

				define <32 x i32> @load_zext_v32i16i32(<32 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v32i16i32
				; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
				; VBITS_GE_1024-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_1024-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%val = zext <32 x i16> %a to <32 x i32>
				ret <32 x i32> %val
				}

				define <64 x i32> @load_zext_v64i16i32(<64 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v64i16i32
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
				; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <64 x i16>, <64 x i16>* %ap
				%val = zext <64 x i16> %a to <64 x i32>
				ret <64 x i32> %val
				}

				define <4 x i32> @load_sext_v4i16i32(<4 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v4i16i32
				; CHECK: ldr d[[D0:[0-9]+]], [x0]
				; CHECK-NEXT: sshll v[[D0]].4s, v[[D0]].4h, #0
				; CHECK-NEXT: ret
				%a = load <4 x i16>, <4 x i16>* %ap
				%val = sext <4 x i16> %a to <4 x i32>
				ret <4 x i32> %val
				}

				define <8 x i32> @load_sext_v8i16i32(<8 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v8i16i32
				; CHECK: ptrue [[P0:p[0-9]+]].s, vl8
				; CHECK-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; CHECK-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; CHECK-NEXT: ret
				%a = load <8 x i16>, <8 x i16>* %ap
				%val = sext <8 x i16> %a to <8 x i32>
				ret <8 x i32> %val
				}

				define <16 x i32> @load_sext_v16i16i32(<16 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v16i16i32
				; VBITS_GE_512: ptrue [[P0:p[0-9]+]].s, vl16
				; VBITS_GE_512-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_512-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_512-NEXT: ret

				; Ensure sensible type legalisation
				; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].h, vl16
				; VBITS_EQ_256-DAG: ld1h { [[Z0:z[0-9]+]].h }, [[PG]]/z, [x0]
				; VBITS_EQ_256-DAG: mov x9, sp
				; VBITS_EQ_256-DAG: st1h { [[Z0]].h }, [[PG]], [x9]
				; VBITS_EQ_256-DAG: ldp q[[R0:[0-9]+]], q[[R1:[0-9]+]], [sp]
				; VBITS_EQ_256-DAG: add x9, x8, #32
				; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].s, vl8
				; VBITS_EQ_256-DAG: sunpklo z[[R0]].s, z[[R0]].h
				; VBITS_EQ_256-DAG: sunpklo z[[R1]].s, z[[R1]].h
				; VBITS_EQ_256-DAG: st1w { z[[R1]].s }, [[PG1]], [x9]
				; VBITS_EQ_256-DAG: st1w { z[[R0]].s }, [[PG1]], [x8]
				; VBITS_EQ_256-DAG: ret
				%a = load <16 x i16>, <16 x i16>* %ap
				%val = sext <16 x i16> %a to <16 x i32>
				ret <16 x i32> %val
				}

				define <32 x i32> @load_sext_v32i16i32(<32 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v32i16i32
				; VBITS_GE_1024: ptrue [[P0:p[0-9]+]].s, vl32
				; VBITS_GE_1024-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_1024-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_1024-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%val = sext <32 x i16> %a to <32 x i32>
				ret <32 x i32> %val
				}

				define <64 x i32> @load_sext_v64i16i32(<64 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v64i16i32
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].s, vl64
				; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].s }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1w { [[Z0]].s }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <64 x i16>, <64 x i16>* %ap
				%val = sext <64 x i16> %a to <64 x i32>
				ret <64 x i32> %val
				}

				define <32 x i64> @load_zext_v32i8i64(<32 x i8>* %ap) #0 {
				; CHECK-LABEL: load_zext_v32i8i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1b { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i8>, <32 x i8>* %ap
				%val = zext <32 x i8> %a to <32 x i64>
				ret <32 x i64> %val
				}

				define <32 x i64> @load_sext_v32i8i64(<32 x i8>* %ap) #0 {
				; CHECK-LABEL: load_sext_v32i8i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1sb { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i8>, <32 x i8>* %ap
				%val = sext <32 x i8> %a to <32 x i64>
				ret <32 x i64> %val
				}

				define <32 x i64> @load_zext_v32i16i64(<32 x i16>* %ap) #0 {
				; CHECK-LABEL: load_zext_v32i16i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1h { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%val = zext <32 x i16> %a to <32 x i64>
				ret <32 x i64> %val
				}

				define <32 x i64> @load_sext_v32i16i64(<32 x i16>* %ap) #0 {
				; CHECK-LABEL: load_sext_v32i16i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1sh { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i16>, <32 x i16>* %ap
				%val = sext <32 x i16> %a to <32 x i64>
				ret <32 x i64> %val
				}

				define <32 x i64> @load_zext_v32i32i64(<32 x i32>* %ap) #0 {
				; CHECK-LABEL: load_zext_v32i32i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1w { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i32>, <32 x i32>* %ap
				%val = zext <32 x i32> %a to <32 x i64>
				ret <32 x i64> %val
				}

				define <32 x i64> @load_sext_v32i32i64(<32 x i32>* %ap) #0 {
				; CHECK-LABEL: load_sext_v32i32i64
				; VBITS_GE_2048: ptrue [[P0:p[0-9]+]].d, vl32
				; VBITS_GE_2048-NEXT: ld1sw { [[Z0:z[0-9]+]].d }, [[P0]]/z, [x0]
				; VBITS_GE_2048-NEXT: st1d { [[Z0]].d }, [[P0]], [x8]
				; VBITS_GE_2048-NEXT: ret
				%a = load <32 x i32>, <32 x i32>* %ap
				%val = sext <32 x i32> %a to <32 x i64>
				ret <32 x i64> %val
				}

				attributes #0 = { "target-features"="+sve" }

llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll

; RUN: llc -aarch64-sve-vector-bits-min=128 < %s \| FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE		; RUN: llc -aarch64-sve-vector-bits-min=128 < %s \| FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=256 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256,VBITS_EQ_256
; RUN: llc -aarch64-sve-vector-bits-min=384 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=384 < %s \| FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=512 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_512
; RUN: llc -aarch64-sve-vector-bits-min=640 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=640 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=768 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=768 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=896 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=896 < %s \| FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256,VBITS_EQ_1024
; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s \| FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256		; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s \| FileCheck %s -D#VBYTES=256 -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512,VBITS_GE_256
▲ Show 20 Lines • Show All 53 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <16 x i16> %1, %2		%mul = mul <16 x i16> %1, %2
%shr = lshr <16 x i16> %mul, %splat		%shr = lshr <16 x i16> %mul, %splat
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}

define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {		define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v32i8:		; CHECK-LABEL: smulh_v32i8:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_256: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_256: ret
; VBITS_GE_256: ret
		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
		; VBITS_GE_512-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <32 x i8>, <32 x i8>* %a		%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b		%op2 = load <32 x i8>, <32 x i8>* %b
%insert = insertelement <32 x i16> undef, i16 8, i64 0		%insert = insertelement <32 x i16> undef, i16 8, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
%1 = sext <32 x i8> %op1 to <32 x i16>		%1 = sext <32 x i8> %op1 to <32 x i16>
%2 = sext <32 x i8> %op2 to <32 x i16>		%2 = sext <32 x i8> %op2 to <32 x i16>
%mul = mul <32 x i16> %1, %2		%mul = mul <32 x i16> %1, %2
%shr = lshr <32 x i16> %mul, %splat		%shr = lshr <32 x i16> %mul, %splat
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
store <32 x i8> %res, <32 x i8>* %a		store <32 x i8> %res, <32 x i8>* %a
ret void		ret void
}		}

define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {		define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v64i8:		; CHECK-LABEL: smulh_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]		; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_512: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_512: ret
; VBITS_GE_512: ret
		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
		; VBITS_GE_1024-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <64 x i8>, <64 x i8>* %a		%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b		%op2 = load <64 x i8>, <64 x i8>* %b
%insert = insertelement <64 x i16> undef, i16 8, i64 0		%insert = insertelement <64 x i16> undef, i16 8, i64 0
%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer		%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
%1 = sext <64 x i8> %op1 to <64 x i16>		%1 = sext <64 x i8> %op1 to <64 x i16>
%2 = sext <64 x i8> %op2 to <64 x i16>		%2 = sext <64 x i8> %op2 to <64 x i16>
%mul = mul <64 x i16> %1, %2		%mul = mul <64 x i16> %1, %2
%shr = lshr <64 x i16> %mul, %splat		%shr = lshr <64 x i16> %mul, %splat
%res = trunc <64 x i16> %shr to <64 x i8>		%res = trunc <64 x i16> %shr to <64 x i8>
store <64 x i8> %res, <64 x i8>* %a		store <64 x i8> %res, <64 x i8>* %a
ret void		ret void
}		}

define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {		define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: smulh_v128i8:		; CHECK-LABEL: smulh_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]		; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_1024: ret
; VBITS_GE_1024: ret
		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
		; VBITS_GE_2048-DAG: ld1sb { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1sb { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <128 x i8>, <128 x i8>* %a		%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b		%op2 = load <128 x i8>, <128 x i8>* %b
%insert = insertelement <128 x i16> undef, i16 8, i64 0		%insert = insertelement <128 x i16> undef, i16 8, i64 0
%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer		%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
%1 = sext <128 x i8> %op1 to <128 x i16>		%1 = sext <128 x i8> %op1 to <128 x i16>
%2 = sext <128 x i8> %op2 to <128 x i16>		%2 = sext <128 x i8> %op2 to <128 x i16>
%mul = mul <128 x i16> %1, %2		%mul = mul <128 x i16> %1, %2
%shr = lshr <128 x i16> %mul, %splat		%shr = lshr <128 x i16> %mul, %splat
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <8 x i32> %1, %2		%mul = mul <8 x i32> %1, %2
%shr = lshr <8 x i32> %mul, %splat		%shr = lshr <8 x i32> %mul, %splat
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}

define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {		define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v16i16:		; CHECK-LABEL: smulh_v16i16:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_256: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_256: ret		; VBITS_EQ_256: ret

		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
		; VBITS_GE_512-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <16 x i16>, <16 x i16>* %a		%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b		%op2 = load <16 x i16>, <16 x i16>* %b
%insert = insertelement <16 x i32> undef, i32 16, i64 0		%insert = insertelement <16 x i32> undef, i32 16, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer		%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = sext <16 x i16> %op1 to <16 x i32>		%1 = sext <16 x i16> %op1 to <16 x i32>
%2 = sext <16 x i16> %op2 to <16 x i32>		%2 = sext <16 x i16> %op2 to <16 x i32>
%mul = mul <16 x i32> %1, %2		%mul = mul <16 x i32> %1, %2
%shr = lshr <16 x i32> %mul, %splat		%shr = lshr <16 x i32> %mul, %splat
%res = trunc <16 x i32> %shr to <16 x i16>		%res = trunc <16 x i32> %shr to <16 x i16>
store <16 x i16> %res, <16 x i16>* %a		store <16 x i16> %res, <16 x i16>* %a
ret void		ret void
}		}

define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {		define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v32i16:		; CHECK-LABEL: smulh_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]		; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_512: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret		; VBITS_EQ_512: ret

		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
		; VBITS_GE_1024-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <32 x i16>, <32 x i16>* %a		%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b		%op2 = load <32 x i16>, <32 x i16>* %b
%insert = insertelement <32 x i32> undef, i32 16, i64 0		%insert = insertelement <32 x i32> undef, i32 16, i64 0
%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
%1 = sext <32 x i16> %op1 to <32 x i32>		%1 = sext <32 x i16> %op1 to <32 x i32>
%2 = sext <32 x i16> %op2 to <32 x i32>		%2 = sext <32 x i16> %op2 to <32 x i32>
%mul = mul <32 x i32> %1, %2		%mul = mul <32 x i32> %1, %2
%shr = lshr <32 x i32> %mul, %splat		%shr = lshr <32 x i32> %mul, %splat
%res = trunc <32 x i32> %shr to <32 x i16>		%res = trunc <32 x i32> %shr to <32 x i16>
store <32 x i16> %res, <32 x i16>* %a		store <32 x i16> %res, <32 x i16>* %a
ret void		ret void
}		}

define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {		define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: smulh_v64i16:		; CHECK-LABEL: smulh_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]		; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret		; VBITS_EQ_1024: ret

		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
		; VBITS_GE_2048-DAG: ld1sh { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1sh { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <64 x i16>, <64 x i16>* %a		%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b		%op2 = load <64 x i16>, <64 x i16>* %b
%insert = insertelement <64 x i32> undef, i32 16, i64 0		%insert = insertelement <64 x i32> undef, i32 16, i64 0
%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer		%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
%1 = sext <64 x i16> %op1 to <64 x i32>		%1 = sext <64 x i16> %op1 to <64 x i32>
%2 = sext <64 x i16> %op2 to <64 x i32>		%2 = sext <64 x i16> %op2 to <64 x i32>
%mul = mul <64 x i32> %1, %2		%mul = mul <64 x i32> %1, %2
%shr = lshr <64 x i32> %mul, %splat		%shr = lshr <64 x i32> %mul, %splat
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <4 x i64> %1, %2		%mul = mul <4 x i64> %1, %2
%shr = lshr <4 x i64> %mul, %splat		%shr = lshr <4 x i64> %mul, %splat
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {		define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v8i32:		; CHECK-LABEL: smulh_v8i32:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_256: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_256: ret		; VBITS_EQ_256: ret

		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
		; VBITS_GE_512-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <8 x i32>, <8 x i32>* %a		%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b		%op2 = load <8 x i32>, <8 x i32>* %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0		%insert = insertelement <8 x i64> undef, i64 32, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer		%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
%1 = sext <8 x i32> %op1 to <8 x i64>		%1 = sext <8 x i32> %op1 to <8 x i64>
%2 = sext <8 x i32> %op2 to <8 x i64>		%2 = sext <8 x i32> %op2 to <8 x i64>
%mul = mul <8 x i64> %1, %2		%mul = mul <8 x i64> %1, %2
%shr = lshr <8 x i64> %mul, %splat		%shr = lshr <8 x i64> %mul, %splat
%res = trunc <8 x i64> %shr to <8 x i32>		%res = trunc <8 x i64> %shr to <8 x i32>
store <8 x i32> %res, <8 x i32>* %a		store <8 x i32> %res, <8 x i32>* %a
ret void		ret void
}		}

define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {		define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v16i32:		; CHECK-LABEL: smulh_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]		; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_512: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret		; VBITS_EQ_512: ret

		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
		; VBITS_GE_1024-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <16 x i32>, <16 x i32>* %a		%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b		%op2 = load <16 x i32>, <16 x i32>* %b
%insert = insertelement <16 x i64> undef, i64 32, i64 0		%insert = insertelement <16 x i64> undef, i64 32, i64 0
%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer		%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
%1 = sext <16 x i32> %op1 to <16 x i64>		%1 = sext <16 x i32> %op1 to <16 x i64>
%2 = sext <16 x i32> %op2 to <16 x i64>		%2 = sext <16 x i32> %op2 to <16 x i64>
%mul = mul <16 x i64> %1, %2		%mul = mul <16 x i64> %1, %2
%shr = lshr <16 x i64> %mul, %splat		%shr = lshr <16 x i64> %mul, %splat
%res = trunc <16 x i64> %shr to <16 x i32>		%res = trunc <16 x i64> %shr to <16 x i32>
store <16 x i32> %res, <16 x i32>* %a		store <16 x i32> %res, <16 x i32>* %a
ret void		ret void
}		}

define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {		define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: smulh_v32i32:		; CHECK-LABEL: smulh_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]		; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_1024: smulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret		; VBITS_EQ_1024: ret

		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
		; VBITS_GE_2048-DAG: ld1sw { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1sw { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <32 x i32>, <32 x i32>* %a		%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b		%op2 = load <32 x i32>, <32 x i32>* %b
%insert = insertelement <32 x i64> undef, i64 32, i64 0		%insert = insertelement <32 x i64> undef, i64 32, i64 0
%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
%1 = sext <32 x i32> %op1 to <32 x i64>		%1 = sext <32 x i32> %op1 to <32 x i64>
%2 = sext <32 x i32> %op2 to <32 x i64>		%2 = sext <32 x i32> %op2 to <32 x i64>
%mul = mul <32 x i64> %1, %2		%mul = mul <32 x i64> %1, %2
%shr = lshr <32 x i64> %mul, %splat		%shr = lshr <32 x i64> %mul, %splat
▲ Show 20 Lines • Show All 181 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <16 x i16> %1, %2		%mul = mul <16 x i16> %1, %2
%shr = lshr <16 x i16> %mul, %splat		%shr = lshr <16 x i16> %mul, %splat
%res = trunc <16 x i16> %shr to <16 x i8>		%res = trunc <16 x i16> %shr to <16 x i8>
ret <16 x i8> %res		ret <16 x i8> %res
}		}

define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {		define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v32i8:		; CHECK-LABEL: umulh_v32i8:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,32)]]
; VBITS_GE_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_256: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_256: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_256: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_256: ret		; VBITS_EQ_256: ret

		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
		; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_512: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_512: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <32 x i8>, <32 x i8>* %a		%op1 = load <32 x i8>, <32 x i8>* %a
%op2 = load <32 x i8>, <32 x i8>* %b		%op2 = load <32 x i8>, <32 x i8>* %b
%insert = insertelement <32 x i16> undef, i16 8, i64 0		%insert = insertelement <32 x i16> undef, i16 8, i64 0
%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i16> %insert, <32 x i16> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i8> %op1 to <32 x i16>		%1 = zext <32 x i8> %op1 to <32 x i16>
%2 = zext <32 x i8> %op2 to <32 x i16>		%2 = zext <32 x i8> %op2 to <32 x i16>
%mul = mul <32 x i16> %1, %2		%mul = mul <32 x i16> %1, %2
%shr = lshr <32 x i16> %mul, %splat		%shr = lshr <32 x i16> %mul, %splat
%res = trunc <32 x i16> %shr to <32 x i8>		%res = trunc <32 x i16> %shr to <32 x i8>
store <32 x i8> %res, <32 x i8>* %a		store <32 x i8> %res, <32 x i8>* %a
ret void		ret void
}		}

define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {		define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v64i8:		; CHECK-LABEL: umulh_v64i8:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]		; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,64)]]
; VBITS_GE_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_512-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_512: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
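; VBITS_GE_512: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_512: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_512: ret		; VBITS_EQ_512: ret
		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]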
		; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_1024: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_1024: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <64 x i8>, <64 x i8>* %a		%op1 = load <64 x i8>, <64 x i8>* %a
%op2 = load <64 x i8>, <64 x i8>* %b		%op2 = load <64 x i8>, <64 x i8>* %b
%insert = insertelement <64 x i16> undef, i16 8, i64 0		%insert = insertelement <64 x i16> undef, i16 8, i64 0
%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer		%splat = shufflevector <64 x i16> %insert, <64 x i16> undef, <64 x i32> zeroinitializer
%1 = zext <64 x i8> %op1 to <64 x i16>		%1 = zext <64 x i8> %op1 to <64 x i16>
%2 = zext <64 x i8> %op2 to <64 x i16>		%2 = zext <64 x i8> %op2 to <64 x i16>
%mul = mul <64 x i16> %1, %2		%mul = mul <64 x i16> %1, %2
%shr = lshr <64 x i16> %mul, %splat		%shr = lshr <64 x i16> %mul, %splat
%res = trunc <64 x i16> %shr to <64 x i8>		%res = trunc <64 x i16> %shr to <64 x i8>
store <64 x i8> %res, <64 x i8>* %a		store <64 x i8> %res, <64 x i8>* %a
ret void		ret void
}		}

define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {		define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 {
; CHECK-LABEL: umulh_v128i8:		; CHECK-LABEL: umulh_v128i8:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]		; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].b, vl[[#min(VBYTES,128)]]
; VBITS_GE_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1b { [[OP1:z[0-9]+]].b }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]		; VBITS_EQ_1024-DAG: ld1b { [[OP2:z[0-9]+]].b }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b		; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].b, [[PG]]/m, [[OP1]].b, [[OP2]].b
; VBITS_GE_1024: st1b { [[RES]].b }, [[PG]], [x0]		; VBITS_EQ_1024: st1b { [[RES]].b }, [[PG]], [x0]
; VBITS_GE_1024: ret		; VBITS_EQ_1024: ret

		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,128)]]
		; VBITS_GE_2048-DAG: ld1b { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1b { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
		; VBITS_GE_2048: lsr [[RES]].h, [[PG]]/m, [[RES]].h, #8
		; VBITS_GE_2048: st1b { [[RES]].h }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <128 x i8>, <128 x i8>* %a		%op1 = load <128 x i8>, <128 x i8>* %a
%op2 = load <128 x i8>, <128 x i8>* %b		%op2 = load <128 x i8>, <128 x i8>* %b
%insert = insertelement <128 x i16> undef, i16 8, i64 0		%insert = insertelement <128 x i16> undef, i16 8, i64 0
%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer		%splat = shufflevector <128 x i16> %insert, <128 x i16> undef, <128 x i32> zeroinitializer
%1 = zext <128 x i8> %op1 to <128 x i16>		%1 = zext <128 x i8> %op1 to <128 x i16>
%2 = zext <128 x i8> %op2 to <128 x i16>		%2 = zext <128 x i8> %op2 to <128 x i16>
%mul = mul <128 x i16> %1, %2		%mul = mul <128 x i16> %1, %2
%shr = lshr <128 x i16> %mul, %splat		%shr = lshr <128 x i16> %mul, %splat
▲ Show 20 Lines • Show All 59 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <8 x i32> %1, %2		%mul = mul <8 x i32> %1, %2
%shr = lshr <8 x i32> %mul, %splat		%shr = lshr <8 x i32> %mul, %splat
%res = trunc <8 x i32> %shr to <8 x i16>		%res = trunc <8 x i32> %shr to <8 x i16>
ret <8 x i16> %res		ret <8 x i16> %res
}		}

define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {		define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v16i16:		; CHECK-LABEL: umulh_v16i16:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,16)]]
; VBITS_GE_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_256: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_256: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_256: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_256: ret		; VBITS_EQ_256: ret

		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
		; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_512: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_512: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <16 x i16>, <16 x i16>* %a		%op1 = load <16 x i16>, <16 x i16>* %a
%op2 = load <16 x i16>, <16 x i16>* %b		%op2 = load <16 x i16>, <16 x i16>* %b
%insert = insertelement <16 x i32> undef, i32 16, i64 0		%insert = insertelement <16 x i32> undef, i32 16, i64 0
%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer		%splat = shufflevector <16 x i32> %insert, <16 x i32> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i16> %op1 to <16 x i32>		%1 = zext <16 x i16> %op1 to <16 x i32>
%2 = zext <16 x i16> %op2 to <16 x i32>		%2 = zext <16 x i16> %op2 to <16 x i32>
%mul = mul <16 x i32> %1, %2		%mul = mul <16 x i32> %1, %2
%shr = lshr <16 x i32> %mul, %splat		%shr = lshr <16 x i32> %mul, %splat
%res = trunc <16 x i32> %shr to <16 x i16>		%res = trunc <16 x i32> %shr to <16 x i16>
store <16 x i16> %res, <16 x i16>* %a		store <16 x i16> %res, <16 x i16>* %a
ret void		ret void
}		}

define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {		define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v32i16:		; CHECK-LABEL: umulh_v32i16:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]		; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,32)]]
; VBITS_GE_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_512-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_512: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_512: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_512: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_512: ret		; VBITS_EQ_512: ret

		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
		; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_1024: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_1024: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <32 x i16>, <32 x i16>* %a		%op1 = load <32 x i16>, <32 x i16>* %a
%op2 = load <32 x i16>, <32 x i16>* %b		%op2 = load <32 x i16>, <32 x i16>* %b
%insert = insertelement <32 x i32> undef, i32 16, i64 0		%insert = insertelement <32 x i32> undef, i32 16, i64 0
%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i32> %insert, <32 x i32> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i16> %op1 to <32 x i32>		%1 = zext <32 x i16> %op1 to <32 x i32>
%2 = zext <32 x i16> %op2 to <32 x i32>		%2 = zext <32 x i16> %op2 to <32 x i32>
%mul = mul <32 x i32> %1, %2		%mul = mul <32 x i32> %1, %2
%shr = lshr <32 x i32> %mul, %splat		%shr = lshr <32 x i32> %mul, %splat
%res = trunc <32 x i32> %shr to <32 x i16>		%res = trunc <32 x i32> %shr to <32 x i16>
store <32 x i16> %res, <32 x i16>* %a		store <32 x i16> %res, <32 x i16>* %a
ret void		ret void
}		}

define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {		define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 {
; CHECK-LABEL: umulh_v64i16:		; CHECK-LABEL: umulh_v64i16:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]		; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].h, vl[[#min(VBYTES,64)]]
; VBITS_GE_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1h { [[OP1:z[0-9]+]].h }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]		; VBITS_EQ_1024-DAG: ld1h { [[OP2:z[0-9]+]].h }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h		; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].h, [[PG]]/m, [[OP1]].h, [[OP2]].h
; VBITS_GE_1024: st1h { [[RES]].h }, [[PG]], [x0]		; VBITS_EQ_1024: st1h { [[RES]].h }, [[PG]], [x0]
; VBITS_GE_1024: ret		; VBITS_EQ_1024: ret
		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,64)]]
		; VBITS_GE_2048-DAG: ld1h { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1h { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
		; VBITS_GE_2048: lsr [[RES]].s, [[PG]]/m, [[RES]].s, #16
		; VBITS_GE_2048: st1h { [[RES]].s }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <64 x i16>, <64 x i16>* %a		%op1 = load <64 x i16>, <64 x i16>* %a
%op2 = load <64 x i16>, <64 x i16>* %b		%op2 = load <64 x i16>, <64 x i16>* %b
%insert = insertelement <64 x i32> undef, i32 16, i64 0		%insert = insertelement <64 x i32> undef, i32 16, i64 0
%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer		%splat = shufflevector <64 x i32> %insert, <64 x i32> undef, <64 x i32> zeroinitializer
%1 = zext <64 x i16> %op1 to <64 x i32>		%1 = zext <64 x i16> %op1 to <64 x i32>
%2 = zext <64 x i16> %op2 to <64 x i32>		%2 = zext <64 x i16> %op2 to <64 x i32>
%mul = mul <64 x i32> %1, %2		%mul = mul <64 x i32> %1, %2
%shr = lshr <64 x i32> %mul, %splat		%shr = lshr <64 x i32> %mul, %splat
▲ Show 20 Lines • Show All 56 Lines • ▼ Show 20 Lines	; CHECK: ret
%mul = mul <4 x i64> %1, %2		%mul = mul <4 x i64> %1, %2
%shr = lshr <4 x i64> %mul, %splat		%shr = lshr <4 x i64> %mul, %splat
%res = trunc <4 x i64> %shr to <4 x i32>		%res = trunc <4 x i64> %shr to <4 x i32>
ret <4 x i32> %res		ret <4 x i32> %res
}		}

define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {		define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v8i32:		; CHECK-LABEL: umulh_v8i32:
; VBITS_GE_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]		; VBITS_EQ_256: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,8)]]
; VBITS_GE_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_256-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_256-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_256: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_256: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_256: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_256: ret		; VBITS_EQ_256: ret

		; VBITS_GE_512: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,8)]]
		; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_512: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_512: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_512: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_512: ret
%op1 = load <8 x i32>, <8 x i32>* %a		%op1 = load <8 x i32>, <8 x i32>* %a
%op2 = load <8 x i32>, <8 x i32>* %b		%op2 = load <8 x i32>, <8 x i32>* %b
%insert = insertelement <8 x i64> undef, i64 32, i64 0		%insert = insertelement <8 x i64> undef, i64 32, i64 0
%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer		%splat = shufflevector <8 x i64> %insert, <8 x i64> undef, <8 x i32> zeroinitializer
%1 = zext <8 x i32> %op1 to <8 x i64>		%1 = zext <8 x i32> %op1 to <8 x i64>
%2 = zext <8 x i32> %op2 to <8 x i64>		%2 = zext <8 x i32> %op2 to <8 x i64>
%mul = mul <8 x i64> %1, %2		%mul = mul <8 x i64> %1, %2
%shr = lshr <8 x i64> %mul, %splat		%shr = lshr <8 x i64> %mul, %splat
%res = trunc <8 x i64> %shr to <8 x i32>		%res = trunc <8 x i64> %shr to <8 x i32>
store <8 x i32> %res, <8 x i32>* %a		store <8 x i32> %res, <8 x i32>* %a
ret void		ret void
}		}

define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {		define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v16i32:		; CHECK-LABEL: umulh_v16i32:
; VBITS_GE_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]		; VBITS_EQ_512: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,16)]]
; VBITS_GE_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_512-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_512-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_512: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_512: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_512: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_512: ret		; VBITS_EQ_512: ret
		; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,16)]]
		; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_1024: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_1024: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_1024: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_1024: ret
%op1 = load <16 x i32>, <16 x i32>* %a		%op1 = load <16 x i32>, <16 x i32>* %a
%op2 = load <16 x i32>, <16 x i32>* %b		%op2 = load <16 x i32>, <16 x i32>* %b
%insert = insertelement <16 x i64> undef, i64 32, i64 0		%insert = insertelement <16 x i64> undef, i64 32, i64 0
%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer		%splat = shufflevector <16 x i64> %insert, <16 x i64> undef, <16 x i32> zeroinitializer
%1 = zext <16 x i32> %op1 to <16 x i64>		%1 = zext <16 x i32> %op1 to <16 x i64>
%2 = zext <16 x i32> %op2 to <16 x i64>		%2 = zext <16 x i32> %op2 to <16 x i64>
%mul = mul <16 x i64> %1, %2		%mul = mul <16 x i64> %1, %2
%shr = lshr <16 x i64> %mul, %splat		%shr = lshr <16 x i64> %mul, %splat
%res = trunc <16 x i64> %shr to <16 x i32>		%res = trunc <16 x i64> %shr to <16 x i32>
store <16 x i32> %res, <16 x i32>* %a		store <16 x i32> %res, <16 x i32>* %a
ret void		ret void
}		}

define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {		define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 {
; CHECK-LABEL: umulh_v32i32:		; CHECK-LABEL: umulh_v32i32:
; VBITS_GE_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]		; VBITS_EQ_1024: ptrue [[PG:p[0-9]+]].s, vl[[#min(VBYTES,32)]]
; VBITS_GE_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]		; VBITS_EQ_1024-DAG: ld1w { [[OP1:z[0-9]+]].s }, [[PG]]/z, [x0]
; VBITS_GE_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]		; VBITS_EQ_1024-DAG: ld1w { [[OP2:z[0-9]+]].s }, [[PG]]/z, [x1]
; VBITS_GE_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s		; VBITS_EQ_1024: umulh [[RES:z[0-9]+]].s, [[PG]]/m, [[OP1]].s, [[OP2]].s
; VBITS_GE_1024: st1w { [[RES]].s }, [[PG]], [x0]		; VBITS_EQ_1024: st1w { [[RES]].s }, [[PG]], [x0]
; VBITS_GE_1024: ret		; VBITS_EQ_1024: ret

		; VBITS_GE_2048: ptrue [[PG:p[0-9]+]].d, vl[[#min(VBYTES,32)]]
		; VBITS_GE_2048-DAG: ld1w { [[OP1:z[0-9]+]].d }, [[PG]]/z, [x0]
		; VBITS_GE_2048-DAG: ld1w { [[OP2:z[0-9]+]].d }, [[PG]]/z, [x1]
		; VBITS_GE_2048: mul [[RES:z[0-9]+]].d, [[PG]]/m, [[OP1]].d, [[OP2]].d
		; VBITS_GE_2048: lsr [[RES]].d, [[PG]]/m, [[RES]].d, #32
		; VBITS_GE_2048: st1w { [[RES]].d }, [[PG]], [x0]
		; VBITS_GE_2048: ret
%op1 = load <32 x i32>, <32 x i32>* %a		%op1 = load <32 x i32>, <32 x i32>* %a
%op2 = load <32 x i32>, <32 x i32>* %b		%op2 = load <32 x i32>, <32 x i32>* %b
%insert = insertelement <32 x i64> undef, i64 32, i64 0		%insert = insertelement <32 x i64> undef, i64 32, i64 0
%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer		%splat = shufflevector <32 x i64> %insert, <32 x i64> undef, <32 x i32> zeroinitializer
%1 = zext <32 x i32> %op1 to <32 x i64>		%1 = zext <32 x i32> %op1 to <32 x i64>
%2 = zext <32 x i32> %op2 to <32 x i64>		%2 = zext <32 x i32> %op2 to <32 x i64>
%mul = mul <32 x i64> %1, %2		%mul = mul <32 x i64> %1, %2
%shr = lshr <32 x i64> %mul, %splat		%shr = lshr <32 x i64> %mul, %splat
▲ Show 20 Lines • Show All 142 Lines • Show Last 20 Lines