diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4506,7 +4506,7 @@ } InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } @@ -4618,7 +4618,7 @@ VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); Mask = DAG.getNode( - ISD::ZERO_EXTEND, DL, + ISD::SIGN_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } else if (VT.isFloatingPoint()) { // Handle FP data by casting the data so an integer scatter can be used. @@ -18627,12 +18627,15 @@ EVT InVT = Mask.getValueType(); EVT ContainerVT = getContainerForFixedLengthVector(DAG, InVT); + auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); + + if (ISD::isBuildVectorAllOnes(Mask.getNode())) + return Pg; + auto Op1 = convertToScalableVector(DAG, ContainerVT, Mask); auto Op2 = DAG.getConstant(0, DL, ContainerVT); - auto Pg = getPredicateForFixedLengthVector(DAG, DL, InVT); - EVT CmpVT = Pg.getValueType(); - return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, CmpVT, + return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, Pg.getValueType(), {Pg, Op1, Op2, DAG.getCondCode(ISD::SETNE)}); } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-mask-opt.ll @@ -0,0 +1,450 @@ +; RUN: llc -aarch64-sve-vector-bits-min=256 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_EQ_256 +; RUN: llc -aarch64-sve-vector-bits-min=384 < %s | FileCheck %s -check-prefixes=CHECK +; RUN: llc -aarch64-sve-vector-bits-min=512 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; 
RUN: llc -aarch64-sve-vector-bits-min=640 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=768 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=896 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1024 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1152 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1280 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1408 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1536 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1664 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1792 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=1920 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_1024,VBITS_GE_512 +; RUN: llc -aarch64-sve-vector-bits-min=2048 < %s | FileCheck %s -check-prefixes=CHECK,VBITS_GE_2048,VBITS_GE_1024,VBITS_GE_512 + +target triple = "aarch64-unknown-linux-gnu" + +; +; LD1B +; + +define void @masked_gather_v2i8(<2 x i8>* %a, <2 x i8*>* %b) #0 { +; CHECK-LABEL: masked_gather_v2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: st1b { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %ptrs = load <2 x i8*>, <2 x i8*>* %b + %vals = call <2 x i8> @llvm.masked.gather.v2i8(<2 x i8*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i8> undef) + store <2 x i8> %vals, <2 x i8>* %a + ret void +} + +define void @masked_gather_v4i8(<4 x i8>* %a, 
<4 x i8*>* %b) #0 { +; CHECK-LABEL: masked_gather_v4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ptrue p0.h, vl4 +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: st1b { z0.h }, p0, [x0] +; CHECK-NEXT: ret + %ptrs = load <4 x i8*>, <4 x i8*>* %b + %vals = call <4 x i8> @llvm.masked.gather.v4i8(<4 x i8*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) + store <4 x i8> %vals, <4 x i8>* %a + ret void +} + +define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { +; VBITS_EQ_256-LABEL: masked_gather_v8i8: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1sb { z0.d }, p0/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1sb { z1.d }, p0/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: uzp1 v0.8b, v1.8b, v0.8b +; VBITS_EQ_256-NEXT: str d0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_gather_v8i8: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [z0.d] +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_512-NEXT: str d0, [x0] +; VBITS_GE_512-NEXT: ret + %ptrs = load <8 x i8*>, <8 x i8*>* %b + %vals = call <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*> %ptrs, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef) + store <8 x i8> %vals, <8 x i8>* %a + ret void +} + +define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { +; VBITS_GE_1024-LABEL: masked_gather_v16i8: +; VBITS_GE_1024: // %bb.0: +; 
VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z0.d] +; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_1024-NEXT: str q0, [x0] +; VBITS_GE_1024-NEXT: ret + %ptrs = load <16 x i8*>, <16 x i8*>* %b + %vals = call <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i8> undef) + store <16 x i8> %vals, <16 x i8>* %a + ret void +} + +define void @masked_gather_v32i8(<32 x i8>* %a, <32 x i8*>* %b) #0 { +; VBITS_GE_2048-LABEL: masked_gather_v32i8: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: ld1b { z0.d }, p0/z, [z0.d] +; VBITS_GE_2048-NEXT: ptrue p0.b, vl32 +; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_2048-NEXT: uzp1 z0.b, z0.b, z0.b +; VBITS_GE_2048-NEXT: st1b { z0.b }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %ptrs = load <32 x i8*>, <32 x i8*>* %b + %vals = call <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i8> undef) + store <32 x i8> %vals, <32 x i8>* %a + ret void +} + +; +; LD1H +; + +define void @masked_gather_v2i16(<2 x i16>* %a, <2 x i16*>* %b) #0 { +; CHECK-LABEL: masked_gather_v2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: ptrue p0.s, vl2 +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: st1h { z0.s }, p0, [x0] +; CHECK-NEXT: ret + %ptrs = load <2 x i16*>, <2 x i16*>* %b + %vals = call <2 x i16> @llvm.masked.gather.v2i16(<2 x i16*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i16> undef) + store <2 x i16> %vals, <2 x i16>* %a + ret void +} + +define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { +; CHECK-LABEL: masked_gather_v4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; 
CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %ptrs = load <4 x i16*>, <4 x i16*>* %b + %vals = call <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i16> undef) + store <4 x i16> %vals, <4 x i16>* %a + ret void +} + +define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { +; VBITS_EQ_256-LABEL: masked_gather_v8i16: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z1.d] +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_EQ_256-NEXT: uzp1 z1.h, z1.h, z1.h +; VBITS_EQ_256-NEXT: mov v1.d[1], v0.d[0] +; VBITS_EQ_256-NEXT: str q1, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_gather_v8i16: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_512-NEXT: str q0, [x0] +; VBITS_GE_512-NEXT: ret + %ptrs = load <8 x i16*>, <8 x i16*>* %b + %vals = call <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*> %ptrs, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + store <8 x i16> %vals, <8 x i16>* %a + ret void +} + +define void @masked_gather_v16i16(<16 x i16>* %a, <16 x i16*>* %b) #0 { +; VBITS_GE_1024-LABEL: masked_gather_v16i16: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; VBITS_GE_1024-NEXT: ptrue p0.h, vl16 +; 
VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_1024-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_1024-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %ptrs = load <16 x i16*>, <16 x i16*>* %b + %vals = call <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i16> undef) + store <16 x i16> %vals, <16 x i16>* %a + ret void +} + +define void @masked_gather_v32i16(<32 x i16>* %a, <32 x i16*>* %b) #0 { +; VBITS_GE_2048-LABEL: masked_gather_v32i16: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: ld1h { z0.d }, p0/z, [z0.d] +; VBITS_GE_2048-NEXT: ptrue p0.h, vl32 +; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_2048-NEXT: uzp1 z0.h, z0.h, z0.h +; VBITS_GE_2048-NEXT: st1h { z0.h }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %ptrs = load <32 x i16*>, <32 x i16*>* %b + %vals = call <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i16> undef) + store <32 x i16> %vals, <32 x i16>* %a + ret void +} + +; +; LD1W +; + +define void @masked_gather_v2i32(<2 x i32>* %a, <2 x i32*>* %b) #0 { +; CHECK-LABEL: masked_gather_v2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: xtn v0.2s, v0.2d +; CHECK-NEXT: str d0, [x0] +; CHECK-NEXT: ret + %ptrs = load <2 x i32*>, <2 x i32*>* %b + %vals = call <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i32> undef) + store <2 x i32> %vals, <2 x i32>* %a + ret void +} + +define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { +; CHECK-LABEL: masked_gather_v4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %ptrs = load <4 x i32*>, <4 x i32*>* %b + %vals = call <4 x i32> 
@llvm.masked.gather.v4i32(<4 x i32*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef) + store <4 x i32> %vals, <4 x i32>* %a + ret void +} + +define void @masked_gather_v8i32(<8 x i32>* %a, <8 x i32*>* %b) #0 { +; VBITS_EQ_256-LABEL: masked_gather_v8i32: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1w { z1.d }, p0/z, [z1.d] +; VBITS_EQ_256-NEXT: ptrue p0.s, vl4 +; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_EQ_256-NEXT: uzp1 z1.s, z1.s, z1.s +; VBITS_EQ_256-NEXT: splice z1.s, p0, z1.s, z0.s +; VBITS_EQ_256-NEXT: ptrue p0.s, vl8 +; VBITS_EQ_256-NEXT: st1w { z1.s }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_gather_v8i32: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; VBITS_GE_512-NEXT: ptrue p0.s, vl8 +; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_512-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %ptrs = load <8 x i32*>, <8 x i32*>* %b + %vals = call <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*> %ptrs, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i32> undef) + store <8 x i32> %vals, <8 x i32>* %a + ret void +} + +define void @masked_gather_v16i32(<16 x i32>* %a, <16 x i32*>* %b) #0 { +; VBITS_GE_1024-LABEL: masked_gather_v16i32: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; VBITS_GE_1024-NEXT: ptrue p0.s, vl16 +; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_1024-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %ptrs = load <16 x i32*>, <16 x i32*>* %b + %vals = call <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i32> 
undef) + store <16 x i32> %vals, <16 x i32>* %a + ret void +} + +define void @masked_gather_v32i32(<32 x i32>* %a, <32 x i32*>* %b) #0 { +; VBITS_GE_2048-LABEL: masked_gather_v32i32: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: ld1w { z0.d }, p0/z, [z0.d] +; VBITS_GE_2048-NEXT: ptrue p0.s, vl32 +; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s +; VBITS_GE_2048-NEXT: st1w { z0.s }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %ptrs = load <32 x i32*>, <32 x i32*>* %b + %vals = call <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i32> undef) + store <32 x i32> %vals, <32 x i32>* %a + ret void +} + +; +; LD1D +; + +define void @masked_gather_v2i64(<2 x i64>* %a, <2 x i64*>* %b) #0 { +; CHECK-LABEL: masked_gather_v2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr q0, [x1] +; CHECK-NEXT: ptrue p0.d, vl2 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: str q0, [x0] +; CHECK-NEXT: ret + %ptrs = load <2 x i64*>, <2 x i64*>* %b + %vals = call <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*> %ptrs, i32 8, <2 x i1> <i1 true, i1 true>, <2 x i64> undef) + store <2 x i64> %vals, <2 x i64>* %a + ret void +} + +define void @masked_gather_v4i64(<4 x i64>* %a, <4 x i64*>* %b) #0 { +; CHECK-LABEL: masked_gather_v4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d, vl4 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x1] +; CHECK-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; CHECK-NEXT: st1d { z0.d }, p0, [x0] +; CHECK-NEXT: ret + %ptrs = load <4 x i64*>, <4 x i64*>* %b + %vals = call <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*> %ptrs, i32 8, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i64> undef) + store <4 x i64> %vals, <4 x i64>* %a + ret void +} + +define void @masked_gather_v8i64(<8 x i64>* %a, <8 x i64*>* %b) #0 { +; VBITS_EQ_256-LABEL: masked_gather_v8i64: +; VBITS_EQ_256: // %bb.0: +; VBITS_EQ_256-NEXT: mov x8, #4 +; VBITS_EQ_256-NEXT: ptrue p0.d, vl4 +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [x1, x8, lsl #3] +; 
VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [x1] +; VBITS_EQ_256-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; VBITS_EQ_256-NEXT: ld1d { z1.d }, p0/z, [z1.d] +; VBITS_EQ_256-NEXT: st1d { z0.d }, p0, [x0, x8, lsl #3] +; VBITS_EQ_256-NEXT: st1d { z1.d }, p0, [x0] +; VBITS_EQ_256-NEXT: ret +; +; VBITS_GE_512-LABEL: masked_gather_v8i64: +; VBITS_GE_512: // %bb.0: +; VBITS_GE_512-NEXT: ptrue p0.d, vl8 +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_512-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; VBITS_GE_512-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_512-NEXT: ret + %ptrs = load <8 x i64*>, <8 x i64*>* %b + %vals = call <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*> %ptrs, i32 8, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i64> undef) + store <8 x i64> %vals, <8 x i64>* %a + ret void +} + +define void @masked_gather_v16i64(<16 x i64>* %a, <16 x i64*>* %b) #0 { +; VBITS_GE_1024-LABEL: masked_gather_v16i64: +; VBITS_GE_1024: // %bb.0: +; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_1024-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; VBITS_GE_1024-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_1024-NEXT: ret + %ptrs = load <16 x i64*>, <16 x i64*>* %b + %vals = call <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*> %ptrs, i32 8, <16 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <16 x i64> undef) + store <16 x i64> %vals, <16 x i64>* %a + ret void +} + +define void @masked_gather_v32i64(<32 x i64>* %a, <32 x i64*>* %b) #0 { +; VBITS_GE_2048-LABEL: masked_gather_v32i64: +; VBITS_GE_2048: // %bb.0: +; VBITS_GE_2048-NEXT: ptrue p0.d, vl32 +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [x1] +; VBITS_GE_2048-NEXT: ld1d { z0.d }, p0/z, [z0.d] +; VBITS_GE_2048-NEXT: st1d { z0.d }, p0, [x0] +; VBITS_GE_2048-NEXT: ret + %ptrs = load <32 x i64*>, <32 x i64*>* %b + %vals = call <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*> %ptrs, i32 8, <32 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <32 x i64> undef) + store <32 x i64> %vals, <32 x i64>* %a + ret void +} + +declare <2 x i8> @llvm.masked.gather.v2i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) +declare <4 x i8> 
@llvm.masked.gather.v4i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>) + +declare <2 x i16> @llvm.masked.gather.v2i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>) +declare <32 x i16> @llvm.masked.gather.v32i16(<32 x i16*>, i32, <32 x i1>, <32 x i16>) + +declare <2 x i32> @llvm.masked.gather.v2i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>) +declare <16 x i32> @llvm.masked.gather.v16i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) +declare <32 x i32> @llvm.masked.gather.v32i32(<32 x i32*>, i32, <32 x i1>, <32 x i32>) + +declare <2 x i64> @llvm.masked.gather.v2i64(<2 x i64*>, i32, <2 x i1>, <2 x i64>) +declare <4 x i64> @llvm.masked.gather.v4i64(<4 x i64*>, i32, <4 x i1>, <4 x i64>) +declare <8 x i64> @llvm.masked.gather.v8i64(<8 x i64*>, i32, <8 x i1>, <8 x i64>) +declare <16 x i64> @llvm.masked.gather.v16i64(<16 x i64*>, i32, <16 x i1>, <16 x i64>) +declare <32 x i64> @llvm.masked.gather.v32i64(<32 x i64*>, i32, <32 x i1>, <32 x i64>) + +attributes #0 = { "target-features"="+sve" } diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -34,7 +34,7 @@ ; CHECK-NEXT: ldrb w8, [x0, #1] ; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: 
cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: ptrue p0.s, vl2 @@ -57,8 +57,8 @@ ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: ushll v0.8h, v0.8b, #0 ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: ptrue p0.h, vl4 @@ -90,10 +90,10 @@ ; VBITS_EQ_256-NEXT: shl v0.4h, v0.4h, #8 ; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 ; VBITS_EQ_256-NEXT: sshr v0.4h, v0.4h, #8 -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z1.d, #0 ; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; VBITS_EQ_256-NEXT: ld1sb { z0.d }, p1/z, [z2.d] @@ -112,9 +112,9 @@ ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmeq v0.8b, v0.8b, #0 -; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1b { z0.d }, p0/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s @@ -137,9 +137,9 @@ ; VBITS_GE_1024-NEXT: ptrue p0.d, vl16 ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_1024-NEXT: cmeq v0.16b, v0.16b, #0 -; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h -; 
VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_1024-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; VBITS_GE_1024-NEXT: ld1b { z0.d }, p0/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s @@ -164,9 +164,9 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p2/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p1.b, p0/z, z0.b, #0 ; VBITS_GE_2048-NEXT: mov z0.b, p1/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b -; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.h, z0.b +; VBITS_GE_2048-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p2/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1b { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -196,7 +196,7 @@ ; CHECK-NEXT: ldrh w8, [x0, #2] ; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1sh { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: ptrue p0.s, vl2 @@ -218,8 +218,8 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmeq v0.4h, v0.4h, #0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -245,12 +245,12 @@ ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1, x8, lsl #3] ; VBITS_EQ_256-NEXT: ld1d { z3.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: ext v1.16b, v0.16b, v0.16b, #8 -; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z0.s, z0.h +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z0.d, #0 ; VBITS_EQ_256-NEXT: ld1h { z0.d 
}, p1/z, [z3.d] -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s ; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; VBITS_EQ_256-NEXT: ld1h { z1.d }, p0/z, [z2.d] ; VBITS_EQ_256-NEXT: uzp1 z0.s, z0.s, z0.s @@ -267,8 +267,8 @@ ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: cmeq v0.8h, v0.8h, #0 -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s @@ -292,8 +292,8 @@ ; VBITS_GE_1024-NEXT: cmpeq p2.h, p0/z, z0.h, #0 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s @@ -317,8 +317,8 @@ ; VBITS_GE_2048-NEXT: cmpeq p2.h, p0/z, z0.h, #0 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -344,7 +344,7 @@ ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: cmeq v0.2s, v0.2s, #0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; 
CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: xtn v0.2s, v0.2d @@ -365,7 +365,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmeq v0.4s, v0.4s, #0 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -391,9 +391,9 @@ ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p1/z, [x1] ; VBITS_EQ_256-NEXT: cmpeq p2.s, p0/z, z0.s, #0 ; VBITS_EQ_256-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: uunpklo z3.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z3.d, z0.s ; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 -; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s +; VBITS_EQ_256-NEXT: sunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: cmpne p2.d, p1/z, z3.d, #0 ; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_EQ_256-NEXT: ld1w { z2.d }, p2/z, [z2.d] @@ -413,7 +413,7 @@ ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_512-NEXT: cmpeq p2.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s @@ -436,7 +436,7 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: cmpeq p2.s, p0/z, z0.s, #0 ; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s @@ -459,7 +459,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: cmpeq p2.s, p0/z, z0.s, #0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; 
VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -633,8 +633,8 @@ ; CHECK-NEXT: mov v0.h[1], w9 ; CHECK-NEXT: shl v0.4h, v0.4h, #15 ; CHECK-NEXT: sshr v0.4h, v0.4h, #15 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -656,8 +656,8 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq v0.4h, v0.4h, #0.0 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -679,8 +679,8 @@ ; VBITS_GE_512-NEXT: ptrue p0.d, vl8 ; VBITS_GE_512-NEXT: ld1d { z1.d }, p0/z, [x1] ; VBITS_GE_512-NEXT: fcmeq v0.8h, v0.8h, #0.0 -; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: sunpklo z0.s, z0.h +; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1h { z0.d }, p0/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s @@ -704,8 +704,8 @@ ; VBITS_GE_1024-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_1024-NEXT: ld1d { z0.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_1024-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_1024-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_1024-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_1024-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s @@ -729,8 +729,8 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: ld1d { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov 
z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -756,7 +756,7 @@ ; CHECK-NEXT: ptrue p0.d, vl2 ; CHECK-NEXT: ldr q1, [x1] ; CHECK-NEXT: fcmeq v0.2s, v0.2s, #0.0 -; CHECK-NEXT: ushll v0.2d, v0.2s, #0 +; CHECK-NEXT: sshll v0.2d, v0.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: xtn v0.2s, v0.2d @@ -777,7 +777,7 @@ ; CHECK-NEXT: ptrue p0.d, vl4 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq v0.4s, v0.4s, #0.0 -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: ld1w { z0.d }, p0/z, [z1.d] ; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s @@ -800,7 +800,7 @@ ; VBITS_GE_512-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_512-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_512-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_512-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_512-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_512-NEXT: uzp1 z0.s, z0.s, z0.s @@ -823,7 +823,7 @@ ; VBITS_GE_1024-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_1024-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_1024-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_1024-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_1024-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_1024-NEXT: uzp1 z0.s, z0.s, z0.s @@ -846,7 +846,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: 
uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -991,8 +991,8 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1019,7 +1019,7 @@ ; VBITS_GE_2048-NEXT: ld1sw { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1066,8 +1066,8 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d, lsl #1] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1094,8 +1094,8 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: ld1sw { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, 
z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1123,8 +1123,8 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.h, p0/z, z0.h, #0.0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: mov z1.h, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: ld1h { z0.d }, p1/z, [x2, z0.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1151,7 +1151,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1175,7 +1175,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [x2, z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1203,7 +1203,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1231,7 +1231,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; 
VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s @@ -1256,7 +1256,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.s }, p0/z, [x2] ; VBITS_GE_2048-NEXT: ld1w { z1.d }, p1/z, [z1.d] @@ -1282,7 +1282,7 @@ ; VBITS_GE_2048-NEXT: ld1d { z1.d }, p1/z, [x1] ; VBITS_GE_2048-NEXT: fcmeq p2.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: mov z0.s, p2/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s +; VBITS_GE_2048-NEXT: sunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: cmpne p1.d, p1/z, z0.d, #0 ; VBITS_GE_2048-NEXT: ld1w { z0.d }, p1/z, [z1.d] ; VBITS_GE_2048-NEXT: uzp1 z0.s, z0.s, z0.s diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -35,7 +35,7 @@ ; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: st1b { z0.d }, p0, [z2.d] ; CHECK-NEXT: ret @@ -56,8 +56,8 @@ ; CHECK-NEXT: cmeq v2.4h, v0.4h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z2.d, z2.s ; 
CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; CHECK-NEXT: st1b { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -88,10 +88,10 @@ ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s ; VBITS_EQ_256-NEXT: sshr v2.4h, v2.4h, #8 ; VBITS_EQ_256-NEXT: sshr v1.4h, v1.4h, #8 -; VBITS_EQ_256-NEXT: uunpklo z2.s, z2.h -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z2.s, z2.h +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s ; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 ; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; VBITS_EQ_256-NEXT: uunpklo z1.s, z5.h @@ -109,9 +109,9 @@ ; VBITS_GE_512-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: uunpklo z2.h, z2.b -; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_512-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; VBITS_GE_512-NEXT: st1b { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret @@ -132,9 +132,9 @@ ; VBITS_GE_1024-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: uunpklo z2.h, z2.b -; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_1024-NEXT: sunpklo z2.h, z2.b +; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; VBITS_GE_1024-NEXT: st1b { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret @@ -156,10 +156,10 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.h, z0.b ; VBITS_GE_2048-NEXT: mov z1.b, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h -; VBITS_GE_2048-NEXT: uunpklo z1.h, z1.b +; VBITS_GE_2048-NEXT: sunpklo z1.h, z1.b ; 
VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z1.s, z1.h -; VBITS_GE_2048-NEXT: uunpklo z1.d, z1.s +; VBITS_GE_2048-NEXT: sunpklo z1.s, z1.h +; VBITS_GE_2048-NEXT: sunpklo z1.d, z1.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z1.d, #0 ; VBITS_GE_2048-NEXT: st1b { z0.d }, p0, [z2.d] ; VBITS_GE_2048-NEXT: ret @@ -185,7 +185,7 @@ ; CHECK-NEXT: mov v0.s[1], w8 ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] ; CHECK-NEXT: ret @@ -205,8 +205,8 @@ ; CHECK-NEXT: cmeq v2.4h, v0.4h, #0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -229,14 +229,14 @@ ; VBITS_EQ_256-NEXT: ext v3.16b, v0.16b, v0.16b, #8 ; VBITS_EQ_256-NEXT: uunpklo z0.s, z0.h ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s -; VBITS_EQ_256-NEXT: uunpklo z2.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z2.s, z1.h ; VBITS_EQ_256-NEXT: ext v1.16b, v1.16b, v1.16b, #8 -; VBITS_EQ_256-NEXT: uunpklo z2.d, z2.s +; VBITS_EQ_256-NEXT: sunpklo z2.d, z2.s ; VBITS_EQ_256-NEXT: cmpne p1.d, p0/z, z2.d, #0 ; VBITS_EQ_256-NEXT: ld1d { z2.d }, p0/z, [x1] ; VBITS_EQ_256-NEXT: uunpklo z3.s, z3.h -; VBITS_EQ_256-NEXT: uunpklo z1.s, z1.h -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z1.s, z1.h +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s ; VBITS_EQ_256-NEXT: st1h { z0.d }, p1, [z2.d] ; VBITS_EQ_256-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; VBITS_EQ_256-NEXT: uunpklo z1.d, z3.s @@ -251,8 +251,8 @@ ; VBITS_GE_512-NEXT: cmeq v2.8h, v0.8h, #0 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_512-NEXT: 
uunpklo z2.d, z2.s +; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret @@ -274,8 +274,8 @@ ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret @@ -297,8 +297,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret @@ -321,7 +321,7 @@ ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: cmeq v1.2s, v0.2s, #0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] ; CHECK-NEXT: ret @@ -340,7 +340,7 @@ ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: cmeq v2.4s, v0.4s, #0 ; CHECK-NEXT: uunpklo z0.d, z0.s -; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -363,11 +363,11 @@ ; VBITS_EQ_256-NEXT: ld1d { z4.d }, p1/z, [x1] ; VBITS_EQ_256-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_EQ_256-NEXT: mov z1.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_EQ_256-NEXT: uunpklo z3.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z3.d, z1.s ; 
VBITS_EQ_256-NEXT: ext z1.b, z1.b, z1.b, #16 ; VBITS_EQ_256-NEXT: cmpne p0.d, p1/z, z3.d, #0 ; VBITS_EQ_256-NEXT: uunpklo z3.d, z0.s -; VBITS_EQ_256-NEXT: uunpklo z1.d, z1.s +; VBITS_EQ_256-NEXT: sunpklo z1.d, z1.s ; VBITS_EQ_256-NEXT: ext z0.b, z0.b, z0.b, #16 ; VBITS_EQ_256-NEXT: cmpne p1.d, p1/z, z1.d, #0 ; VBITS_EQ_256-NEXT: uunpklo z0.d, z0.s @@ -384,7 +384,7 @@ ; VBITS_GE_512-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret @@ -405,7 +405,7 @@ ; VBITS_GE_1024-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret @@ -426,7 +426,7 @@ ; VBITS_GE_2048-NEXT: cmpeq p0.s, p0/z, z0.s, #0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret @@ -582,8 +582,8 @@ ; CHECK-NEXT: mov v0.h[1], w9 ; CHECK-NEXT: shl v0.4h, v0.4h, #15 ; CHECK-NEXT: sshr v0.4h, v0.4h, #15 -; CHECK-NEXT: uunpklo z0.s, z0.h -; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: sunpklo z0.s, z0.h +; CHECK-NEXT: sunpklo z0.d, z0.s ; CHECK-NEXT: cmpne p0.d, p0/z, z0.d, #0 ; CHECK-NEXT: uunpklo z0.d, z1.s ; CHECK-NEXT: st1h { z0.d }, p0, [z2.d] @@ -604,8 +604,8 @@ ; CHECK-NEXT: fcmeq v2.4h, v0.4h, #0.0 ; CHECK-NEXT: uunpklo z0.s, z0.h ; CHECK-NEXT: uunpklo z0.d, z0.s -; 
CHECK-NEXT: uunpklo z2.s, z2.h -; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z2.s, z2.h +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; CHECK-NEXT: st1h { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -625,8 +625,8 @@ ; VBITS_GE_512-NEXT: fcmeq v2.8h, v0.8h, #0.0 ; VBITS_GE_512-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_512-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_512-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; VBITS_GE_512-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret @@ -648,8 +648,8 @@ ; VBITS_GE_1024-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_1024-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_1024-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_1024-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_1024-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret @@ -671,8 +671,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret @@ -695,7 +695,7 @@ ; CHECK-NEXT: ldr q2, [x1] ; CHECK-NEXT: fcmeq v1.2s, v0.2s, #0.0 ; CHECK-NEXT: ushll v0.2d, v0.2s, #0 -; CHECK-NEXT: ushll v1.2d, v1.2s, #0 +; CHECK-NEXT: sshll v1.2d, v1.2s, #0 ; CHECK-NEXT: cmpne p0.d, p0/z, z1.d, #0 ; CHECK-NEXT: st1w { z0.d }, p0, [z2.d] ; CHECK-NEXT: ret @@ -714,7 +714,7 @@ ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x1] ; CHECK-NEXT: fcmeq v2.4s, v0.4s, #0.0 ; CHECK-NEXT: uunpklo 
z0.d, z0.s -; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: sunpklo z2.d, z2.s ; CHECK-NEXT: cmpne p0.d, p0/z, z2.d, #0 ; CHECK-NEXT: st1w { z0.d }, p0, [z1.d] ; CHECK-NEXT: ret @@ -735,7 +735,7 @@ ; VBITS_GE_512-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_512-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_512-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_512-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_512-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_512-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_512-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_512-NEXT: ret @@ -756,7 +756,7 @@ ; VBITS_GE_1024-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_1024-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_1024-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_1024-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_1024-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_1024-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_1024-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_1024-NEXT: ret @@ -777,7 +777,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret @@ -908,8 +908,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: ret @@ -933,7 +933,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo 
z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: ret @@ -977,8 +977,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d, lsl #1] ; VBITS_GE_2048-NEXT: ret @@ -1003,8 +1003,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret @@ -1030,8 +1030,8 @@ ; VBITS_GE_2048-NEXT: uunpklo z0.s, z0.h ; VBITS_GE_2048-NEXT: mov z2.h, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.s, z2.h -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.s, z2.h +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1h { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret @@ -1055,7 +1055,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d, lsl #2] ; VBITS_GE_2048-NEXT: ret @@ -1077,7 
+1077,7 @@ ; VBITS_GE_2048-NEXT: fcmeq p0.s, p0/z, z0.s, #0.0 ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s ; VBITS_GE_2048-NEXT: mov z2.s, p0/z, #-1 // =0xffffffffffffffff -; VBITS_GE_2048-NEXT: uunpklo z2.d, z2.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z2.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [x2, z1.d] ; VBITS_GE_2048-NEXT: ret @@ -1103,7 +1103,7 @@ ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d ; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.d, z3.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z3.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret @@ -1129,7 +1129,7 @@ ; VBITS_GE_2048-NEXT: add z1.d, p1/m, z1.d, z2.d ; VBITS_GE_2048-NEXT: mov z3.s, p0/z, #-1 // =0xffffffffffffffff ; VBITS_GE_2048-NEXT: uunpklo z0.d, z0.s -; VBITS_GE_2048-NEXT: uunpklo z2.d, z3.s +; VBITS_GE_2048-NEXT: sunpklo z2.d, z3.s ; VBITS_GE_2048-NEXT: cmpne p0.d, p1/z, z2.d, #0 ; VBITS_GE_2048-NEXT: st1w { z0.d }, p0, [z1.d] ; VBITS_GE_2048-NEXT: ret