diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -10005,6 +10005,33 @@
     }
   }
 
+  // Also recognise a splat expressed as a shuffle of an inserted element:
+  //   shuffle_vector (insert_vector_elt _, C, Idx), _, <Idx, Idx, ...>
+  // and return C when it is a constant.
+  if (auto *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+    if (!SV->isSplat())
+      return nullptr;
+
+    SDValue IVE = SV->getOperand(0);
+    if (IVE.getOpcode() != ISD::INSERT_VECTOR_ELT)
+      return nullptr;
+
+    // The insertion index may be a non-constant node; getConstantOperandVal()
+    // would assert on it, so use a checked dyn_cast instead.
+    auto *IdxC = dyn_cast<ConstantSDNode>(IVE.getOperand(2));
+    if (!IdxC)
+      return nullptr;
+
+    // The splatted lane must be exactly the lane that was inserted.
+    int SplatIdx = SV->getSplatIndex();
+    if (SplatIdx < 0 ||
+        IdxC->getZExtValue() != static_cast<uint64_t>(SplatIdx))
+      return nullptr;
+
+    // Yields nullptr when the inserted scalar is not a constant.
+    return dyn_cast<ConstantSDNode>(IVE.getOperand(1));
+  }
+
   return nullptr;
 }
 
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-int-mulh.ll
@@ -1,11 +1,11 @@
 ; RUN: llc -aarch64-sve-vector-bits-min=128 < %s | FileCheck %s -D#VBYTES=16 -check-prefix=NO_SVE
-; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_EQ_256
+; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK,VBITS_GE_256
 ; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -D#VBYTES=32 -check-prefixes=CHECK
-; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512,VBITS_EQ_512
+; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
 ; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -D#VBYTES=64 -check-prefixes=CHECK,VBITS_GE_512
-; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 
-check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024,VBITS_EQ_1024 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 ; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 ; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 ; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -D#VBYTES=128 -check-prefixes=CHECK,VBITS_GE_512,VBITS_GE_1024 @@ -85,25 +85,14 @@ } define void @smulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { -; VBITS_EQ_256-LABEL: smulh_v32i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.b, vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v32i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1sb { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1sb { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: smulh_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %insert = insertelement <32 x i16> undef, i16 8, i64 0 @@ -118,25 +107,14 @@ } define void @smulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { -; VBITS_EQ_512-LABEL: smulh_v64i8: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.b, vl64 -; 
VBITS_EQ_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: smulh_v64i8: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: ld1sb { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1sb { z1.h }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_1024-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_1024-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: smulh_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -151,25 +129,14 @@ } define void @smulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { -; VBITS_EQ_1024-LABEL: smulh_v128i8: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.b, vl128 -; VBITS_EQ_1024-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_1024-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: smulh_v128i8: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 -; VBITS_GE_2048-NEXT: ld1sb { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1sb { z1.h }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_2048-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_2048-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: smulh_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: ld1b { z0.b }, 
p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b %insert = insertelement <128 x i16> undef, i16 8, i64 0 @@ -249,25 +216,14 @@ } define void @smulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { -; VBITS_EQ_256-LABEL: smulh_v16i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v16i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1sh { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: smulh_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %insert = insertelement <16 x i32> undef, i32 16, i64 0 @@ -282,25 +238,14 @@ } define void @smulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { -; VBITS_EQ_512-LABEL: smulh_v32i16: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.h, vl32 -; VBITS_EQ_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: 
smulh_v32i16: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1sh { z1.s }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_1024-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: smulh_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b %insert = insertelement <32 x i32> undef, i32 16, i64 0 @@ -315,25 +260,14 @@ } define void @smulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { -; VBITS_EQ_1024-LABEL: smulh_v64i16: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.h, vl64 -; VBITS_EQ_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_1024-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: smulh_v64i16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: ld1sh { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1sh { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_2048-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: smulh_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 
x i16>, <64 x i16>* %b %insert = insertelement <64 x i32> undef, i32 16, i64 0 @@ -408,25 +342,14 @@ } define void @smulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { -; VBITS_EQ_256-LABEL: smulh_v8i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: smulh_v8i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1sw { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1sw { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_512-NEXT: lsr z0.d, p0/m, z0.d, #32 -; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: smulh_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -441,25 +364,14 @@ } define void @smulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { -; VBITS_EQ_512-LABEL: smulh_v16i32: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.s, vl16 -; VBITS_EQ_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: smulh_v16i32: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1sw { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1sw { z1.d }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_1024-NEXT: lsr 
z0.d, p0/m, z0.d, #32 -; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: smulh_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %insert = insertelement <16 x i64> undef, i64 32, i64 0 @@ -474,25 +386,14 @@ } define void @smulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { -; VBITS_EQ_1024-LABEL: smulh_v32i32: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.s, vl32 -; VBITS_EQ_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_1024-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: smulh_v32i32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_2048-NEXT: lsr z0.d, p0/m, z0.d, #32 -; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: smulh_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: smulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b %insert = insertelement <32 x i64> undef, i64 32, i64 0 @@ -713,25 +614,14 @@ } define void @umulh_v32i8(<32 x i8>* %a, <32 x i8>* %b) #0 { -; VBITS_EQ_256-LABEL: umulh_v32i8: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.b, 
vl32 -; VBITS_EQ_256-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_256-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v32i8: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.h, vl32 -; VBITS_GE_512-NEXT: ld1b { z0.h }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1b { z1.h }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_512-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_512-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: umulh_v32i8: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.b, vl32 +; VBITS_GE_256-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_256-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <32 x i8>, <32 x i8>* %a %op2 = load <32 x i8>, <32 x i8>* %b %insert = insertelement <32 x i16> undef, i16 8, i64 0 @@ -746,25 +636,14 @@ } define void @umulh_v64i8(<64 x i8>* %a, <64 x i8>* %b) #0 { -; VBITS_EQ_512-LABEL: umulh_v64i8: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.b, vl64 -; VBITS_EQ_512-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_512-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: umulh_v64i8: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 -; VBITS_GE_1024-NEXT: ld1b { z0.h }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1b { z1.h }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_1024-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_1024-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: umulh_v64i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.b, vl64 +; VBITS_GE_512-NEXT: ld1b { z0.b }, p0/z, [x0] +; 
VBITS_GE_512-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_512-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <64 x i8>, <64 x i8>* %a %op2 = load <64 x i8>, <64 x i8>* %b %insert = insertelement <64 x i16> undef, i16 8, i64 0 @@ -779,25 +658,14 @@ } define void @umulh_v128i8(<128 x i8>* %a, <128 x i8>* %b) #0 { -; VBITS_EQ_1024-LABEL: umulh_v128i8: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.b, vl128 -; VBITS_EQ_1024-NEXT: ld1b { z0.b }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1b { z1.b }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b -; VBITS_EQ_1024-NEXT: st1b { z0.b }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: umulh_v128i8: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.h, vl128 -; VBITS_GE_2048-NEXT: ld1b { z0.h }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1b { z1.h }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.h, p0/m, z0.h, z1.h -; VBITS_GE_2048-NEXT: lsr z0.h, p0/m, z0.h, #8 -; VBITS_GE_2048-NEXT: st1b { z0.h }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: umulh_v128i8: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.b, vl128 +; VBITS_GE_1024-NEXT: ld1b { z0.b }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1b { z1.b }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.b, p0/m, z0.b, z1.b +; VBITS_GE_1024-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <128 x i8>, <128 x i8>* %a %op2 = load <128 x i8>, <128 x i8>* %b %insert = insertelement <128 x i16> undef, i16 8, i64 0 @@ -877,25 +745,14 @@ } define void @umulh_v16i16(<16 x i16>* %a, <16 x i16>* %b) #0 { -; VBITS_EQ_256-LABEL: umulh_v16i16: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.h, vl16 -; VBITS_EQ_256-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_256-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: 
umulh_v16i16: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.s, vl16 -; VBITS_GE_512-NEXT: ld1h { z0.s }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1h { z1.s }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_512-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_512-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: umulh_v16i16: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.h, vl16 +; VBITS_GE_256-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_256-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <16 x i16>, <16 x i16>* %a %op2 = load <16 x i16>, <16 x i16>* %b %insert = insertelement <16 x i32> undef, i32 16, i64 0 @@ -910,25 +767,14 @@ } define void @umulh_v32i16(<32 x i16>* %a, <32 x i16>* %b) #0 { -; VBITS_EQ_512-LABEL: umulh_v32i16: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.h, vl32 -; VBITS_EQ_512-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_512-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: umulh_v32i16: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 -; VBITS_GE_1024-NEXT: ld1h { z0.s }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1h { z1.s }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_1024-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_1024-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: umulh_v32i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.h, vl32 +; VBITS_GE_512-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_512-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <32 x i16>, <32 x i16>* %a %op2 = load <32 x i16>, <32 x i16>* %b 
%insert = insertelement <32 x i32> undef, i32 16, i64 0 @@ -943,25 +789,14 @@ } define void @umulh_v64i16(<64 x i16>* %a, <64 x i16>* %b) #0 { -; VBITS_EQ_1024-LABEL: umulh_v64i16: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.h, vl64 -; VBITS_EQ_1024-NEXT: ld1h { z0.h }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1h { z1.h }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h -; VBITS_EQ_1024-NEXT: st1h { z0.h }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: umulh_v64i16: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.s, vl64 -; VBITS_GE_2048-NEXT: ld1h { z0.s }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1h { z1.s }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.s, p0/m, z0.s, z1.s -; VBITS_GE_2048-NEXT: lsr z0.s, p0/m, z0.s, #16 -; VBITS_GE_2048-NEXT: st1h { z0.s }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: umulh_v64i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.h, vl64 +; VBITS_GE_1024-NEXT: ld1h { z0.h }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1h { z1.h }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.h, p0/m, z0.h, z1.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <64 x i16>, <64 x i16>* %a %op2 = load <64 x i16>, <64 x i16>* %b %insert = insertelement <64 x i32> undef, i32 16, i64 0 @@ -1036,25 +871,14 @@ } define void @umulh_v8i32(<8 x i32>* %a, <8 x i32>* %b) #0 { -; VBITS_EQ_256-LABEL: umulh_v8i32: -; VBITS_EQ_256: // %bb.0: -; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 -; VBITS_EQ_256-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_256-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_256-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_256-NEXT: ret -; -; VBITS_GE_512-LABEL: umulh_v8i32: -; VBITS_GE_512: // %bb.0: -; VBITS_GE_512-NEXT: ptrue p0.d, vl8 -; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [x0] -; VBITS_GE_512-NEXT: ld1w { z1.d }, p0/z, [x1] -; VBITS_GE_512-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_512-NEXT: lsr 
z0.d, p0/m, z0.d, #32 -; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_512-NEXT: ret - +; VBITS_GE_256-LABEL: umulh_v8i32: +; VBITS_GE_256: // %bb.0: +; VBITS_GE_256-NEXT: ptrue p0.s, vl8 +; VBITS_GE_256-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_256-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_256-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_256-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_256-NEXT: ret %op1 = load <8 x i32>, <8 x i32>* %a %op2 = load <8 x i32>, <8 x i32>* %b %insert = insertelement <8 x i64> undef, i64 32, i64 0 @@ -1069,25 +893,14 @@ } define void @umulh_v16i32(<16 x i32>* %a, <16 x i32>* %b) #0 { -; VBITS_EQ_512-LABEL: umulh_v16i32: -; VBITS_EQ_512: // %bb.0: -; VBITS_EQ_512-NEXT: ptrue p0.s, vl16 -; VBITS_EQ_512-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_512-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_512-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_512-NEXT: ret -; -; VBITS_GE_1024-LABEL: umulh_v16i32: -; VBITS_GE_1024: // %bb.0: -; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 -; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [x0] -; VBITS_GE_1024-NEXT: ld1w { z1.d }, p0/z, [x1] -; VBITS_GE_1024-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_1024-NEXT: lsr z0.d, p0/m, z0.d, #32 -; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_1024-NEXT: ret - +; VBITS_GE_512-LABEL: umulh_v16i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.s, vl16 +; VBITS_GE_512-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_512-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_512-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret %op1 = load <16 x i32>, <16 x i32>* %a %op2 = load <16 x i32>, <16 x i32>* %b %insert = insertelement <16 x i64> undef, i64 32, i64 0 @@ -1102,25 +915,14 @@ } define void @umulh_v32i32(<32 x i32>* %a, <32 x i32>* %b) #0 { -; VBITS_EQ_1024-LABEL: umulh_v32i32: -; VBITS_EQ_1024: // %bb.0: -; VBITS_EQ_1024-NEXT: ptrue p0.s, vl32 -; 
VBITS_EQ_1024-NEXT: ld1w { z0.s }, p0/z, [x0] -; VBITS_EQ_1024-NEXT: ld1w { z1.s }, p0/z, [x1] -; VBITS_EQ_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s -; VBITS_EQ_1024-NEXT: st1w { z0.s }, p0, [x0] -; VBITS_EQ_1024-NEXT: ret -; -; VBITS_GE_2048-LABEL: umulh_v32i32: -; VBITS_GE_2048: // %bb.0: -; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 -; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [x0] -; VBITS_GE_2048-NEXT: ld1w { z1.d }, p0/z, [x1] -; VBITS_GE_2048-NEXT: mul z0.d, p0/m, z0.d, z1.d -; VBITS_GE_2048-NEXT: lsr z0.d, p0/m, z0.d, #32 -; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x0] -; VBITS_GE_2048-NEXT: ret - +; VBITS_GE_1024-LABEL: umulh_v32i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.s, vl32 +; VBITS_GE_1024-NEXT: ld1w { z0.s }, p0/z, [x0] +; VBITS_GE_1024-NEXT: ld1w { z1.s }, p0/z, [x1] +; VBITS_GE_1024-NEXT: umulh z0.s, p0/m, z0.s, z1.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret %op1 = load <32 x i32>, <32 x i32>* %a %op2 = load <32 x i32>, <32 x i32>* %b %insert = insertelement <32 x i64> undef, i64 32, i64 0