diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -4348,6 +4348,9 @@ IndexVT = getContainerForFixedLengthVector(DAG, IndexVT); MemVT = IndexVT.changeVectorElementType(MemVT.getVectorElementType()); InputVT = DAG.getValueType(MemVT.changeTypeToInteger()); + Mask = DAG.getNode( + ISD::ZERO_EXTEND, DL, + VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } if (PassThru->isUndef() || isZerosVector(PassThru.getNode())) @@ -4452,6 +4455,9 @@ ISD::ANY_EXTEND, DL, VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal); StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal); + Mask = DAG.getNode( + ISD::ZERO_EXTEND, DL, + VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask); } else if (VT.isFloatingPoint()) { // Handle FP data by casting the data so an integer scatter can be used. EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount()); diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-gather.ll @@ -29,14 +29,16 @@ ; CHECK: ldrb [[VALS_LO:w[0-9]+]], [x0] ; CHECK-NEXT: ldrb [[VALS_HI:w[0-9]+]], [x0, #1] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]] ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 ; CHECK-NEXT: ld1sb { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d] +; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl2 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d -; CHECK-NEXT: st1b { z[[XTN]].s }, [[PG0]], [x0] +; CHECK-NEXT: st1b { z[[XTN]].s }, [[PG1]], [x0] ; CHECK-NEXT: ret %cval = load <2 x i8>, <2 x i8>* %a %ptrs = load <2 x i8*>, <2 x i8*>* %b @@ -51,11 +53,13 @@ ; CHECK: ldr s[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 ; CHECK-NEXT: ushll [[SHL:v[0-9]+]].8h, v[[VALS]].8b, #0 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, [[SHL]].4h, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, [[UPK2]].d, #0 ; CHECK-NEXT: ld1sb { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] +; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h ; CHECK-NEXT: st1b { z[[UZP2]].h }, [[PG0]], [x0] @@ -71,11 +75,13 @@ define void @masked_gather_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i8: ; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8b, v[[VALS]].8b, #0 -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b +; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h +; VBITS_GE_512-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0 ; VBITS_GE_512-NEXT: ld1b { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -85,20 +91,23 @@ ; Ensure sensible type legalisation. ; VBITS_EQ_256-DAG: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3] ; VBITS_EQ_256-DAG: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 ; VBITS_EQ_256-DAG: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b ; VBITS_EQ_256-DAG: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b ; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 ; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8 -; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 ; VBITS_EQ_256-DAG: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 ; VBITS_EQ_256-DAG: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[SSHR_LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[SSHR_HI]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, [[UPK2_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, [[UPK2_HI]].d, #0 ; VBITS_EQ_256-DAG: ld1sb { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] ; VBITS_EQ_256-DAG: ld1sb { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] ; VBITS_EQ_256-DAG: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s @@ -119,11 +128,13 @@ define void @masked_gather_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { ; CHECK-LABEL: masked_gather_v16i8: ; VBITS_GE_1024: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_1024-NEXT: ptrue [[PG0:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16 +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: cmeq v[[CMP:[0-9]+]].16b, v[[VALS]].16b, #0 -; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b +; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h +; VBITS_GE_1024-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0 ; VBITS_GE_1024-NEXT: ld1b { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -144,7 +155,12 @@ ; VBITS_GE_2048-NEXT: ld1b { [[VALS:z[0-9]+]].b }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0 +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].b, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[MONE]].b +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK3]].d, #0 ; VBITS_GE_2048-NEXT: ld1b { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -168,12 +184,14 @@ ; CHECK: ldrh [[VALS_LO:w[0-9]+]], [x0] ; CHECK-NEXT: ldrh [[VALS_HI:w[0-9]+]], [x0, #2] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]] ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 ; CHECK-NEXT: ld1sh { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d] +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d ; CHECK-NEXT: st1h { z[[RES]].s }, [[PG0]], [x0] ; CHECK-NEXT: ret @@ -188,11 +206,12 @@ define void @masked_gather_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { ; CHECK-LABEL: masked_gather_v4i16: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 ; CHECK-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -209,11 +228,12 @@ define void @masked_gather_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8i16: ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 ; VBITS_GE_512-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -222,15 +242,18 @@ ; Ensure sensible type legalisation. ; VBITS_EQ_256-DAG: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3] ; VBITS_EQ_256-DAG: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 ; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[ZMSK]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[ZEXT]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, [[UPK2_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, [[UPK2_HI]].d, #0 ; VBITS_EQ_256-DAG: ld1h { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] ; VBITS_EQ_256-DAG: ld1h { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] ; VBITS_EQ_256-DAG: uzp1 [[UZP1_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s @@ -254,7 +277,11 @@ ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 ; VBITS_GE_1024-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -274,7 +301,11 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -296,9 +327,10 @@ ; CHECK-LABEL: masked_gather_v2i32: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 ; CHECK-NEXT: ld1w { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d] ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d ; CHECK-NEXT: str d[[XTN]], [x0] @@ -314,11 +346,11 @@ define void @masked_gather_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { ; CHECK-LABEL: masked_gather_v4i32: ; CHECK: ldr q[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0 ; CHECK-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: uzp1 z[[UZP:[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: str q[[UZP]], [x0] @@ -337,7 +369,10 @@ ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_512-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -352,16 +387,18 @@ ; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG1]]/z, [x1, x[[NUMELTS]], lsl #3] ; VBITS_EQ_256-DAG: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 ; VBITS_EQ_256-DAG: mov x8, sp -; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, p1/z, #-1 +; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, [[MASK]]/z, #-1 ; VBITS_EQ_256-DAG: st1w { [[MONE]].s }, [[PG0]], [x8] ; VBITS_EQ_256-DAG: ldr q[[CMP_HI:[0-9]+]], [sp, #16] -; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK_HI:z[0-9]+]].d, z[[CMP_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG1]]/z, [[UPK_HI]].d, #0 ; VBITS_EQ_256-DAG: ld1w { [[RES_HI:z[0-9]+]].d }, [[MASK_HI]]/z, {{\[}}[[PTRS_HI]].d] ; VBITS_EQ_256-DAG: ldr q[[CMP_LO:[0-9]+]], [sp] ; VBITS_EQ_256-DAG: uzp1 [[UZP_HI:z[0-9]+]].s, [[RES_HI]].s, [[RES_HI]].s -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK_LO:z[0-9]+]].d, z[[CMP_LO]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG1]]/z, [[UPK_LO]].d, #0 ; VBITS_EQ_256-DAG: ld1w { [[RES_LO:z[0-9]+]].d }, [[MASK_LO]]/z, {{\[}}[[PTRS_LO]].d] +; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 ; VBITS_EQ_256-DAG: uzp1 [[UZP_LO:z[0-9]+]].s, [[RES_LO]].s, [[RES_LO]].s ; VBITS_EQ_256-DAG: splice [[RES:z[0-9]+]].s, [[PG1]], [[RES_LO]].s, [[RES_HI]].s ; VBITS_EQ_256-DAG: st1w { [[RES]].s }, [[PG0]], [x0] @@ -379,7 +416,10 @@ ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_1024-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -398,7 +438,10 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -537,7 +580,7 @@ ; CHECK: ldr s[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: movi d[[ZERO:[0-9]+]], #0000000000000000 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0.0 ; CHECK-NEXT: umov w8, v[[CMP]].h[0] ; CHECK-NEXT: umov w9, v[[CMP]].h[1] @@ -551,7 +594,9 @@ ; CHECK-NEXT: mov v[[NCMP]].h[1], w8 ; CHECK-NEXT: shl v[[SHL:[0-9]+]].4h, v[[NCMP]].4h, #15 ; CHECK-NEXT: sshr v[[SHL]].4h, v[[SHL]].4h, #15 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG0]]/z, z[[SHL]].h, #0 +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[SHL]].h +; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, [[UPK2]].d, #0 ; CHECK-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d] ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -568,11 +613,12 @@ define void @masked_gather_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 { ; CHECK-LABEL: masked_gather_v4f16: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 ; CHECK-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -589,11 +635,12 @@ define void @masked_gather_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { ; CHECK-LABEL: masked_gather_v8f16: ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: fcmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 ; VBITS_GE_512-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: uzp1 z[[UZP2:[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -613,7 +660,11 @@ ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 ; VBITS_GE_1024-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -633,7 +684,11 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP1:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: uzp1 [[UZP2:z[0-9]+]].h, [[UZP1]].h, [[UZP1]].h @@ -655,9 +710,10 @@ ; CHECK-LABEL: masked_gather_v2f32: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 ; CHECK-NEXT: ld1w { z[[RES:[0-9]+]].d }, [[MASK]]/z, [z[[PTRS]].d] ; CHECK-NEXT: xtn v[[XTN:[0-9]+]].2s, v[[RES]].2d ; CHECK-NEXT: str d[[XTN]], [x0] @@ -673,11 +729,11 @@ define void @masked_gather_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 { ; CHECK-LABEL: masked_gather_v4f32: ; CHECK: ldr q[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0 ; CHECK-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; CHECK-NEXT: uzp1 z[[UZP:[0-9]+]].s, [[RES]].s, [[RES]].s ; CHECK-NEXT: str q[[UZP]], [x0] @@ -696,7 +752,10 @@ ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_512-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_512-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -715,7 +774,10 @@ ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_1024-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_1024-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -734,7 +796,10 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -858,7 +923,10 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -879,7 +947,10 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -900,7 +971,10 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -922,7 +996,10 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 ; VBITS_GE_2048-NEXT: ld1h { [[RES:z[0-9]+]].s }, [[MASK]]/z, [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].h, [[RES]].h, [[RES]].h ; VBITS_GE_2048-NEXT: st1h { [[UZP]].h }, [[PG0]], [x0] @@ -944,7 +1021,10 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -964,7 +1044,10 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -983,12 +1066,15 @@ define void @masked_gather_vec_plus_reg(<32 x float>* %a, <32 x i8*>* %b, i64 %off) #0 { ; CHECK-LABEL: masked_gather_vec_plus_reg: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32 -; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1007,12 +1093,15 @@ define void @masked_gather_vec_plus_imm(<32 x float>* %a, <32 x i8*>* %b) #0 { ; CHECK-LABEL: masked_gather_vec_plus_imm: ; VBITS_GE_2048: ptrue [[PG0:p[0-9]+]].s, vl32 -; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] +; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] @@ -1034,10 +1123,13 @@ ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: ld1w { [[PT:z[0-9]+]].s }, [[PG0]]/z, [x2] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s -; VBITS_GE_2048-NEXT: sel [[SEL:z[0-9]+]].s, [[PG1]], [[UZP]].s, [[PT]].s +; VBITS_GE_2048-NEXT: sel [[SEL:z[0-9]+]].s, [[CMP]], [[UZP]].s, [[PT]].s ; VBITS_GE_2048-NEXT: st1w { [[SEL]].s }, [[PG0]], [x0] ; VBITS_GE_2048-NEXT: ret %cvals = load <32 x float>, <32 x float>* %a @@ -1055,7 +1147,10 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 ; VBITS_GE_2048-NEXT: ld1w { [[RES:z[0-9]+]].d }, [[MASK]]/z, {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: uzp1 [[UZP:z[0-9]+]].s, [[RES]].s, [[RES]].s ; VBITS_GE_2048-NEXT: st1w { [[UZP]].s }, [[PG0]], [x0] diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll --- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll +++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll @@ -29,13 +29,14 @@ ; CHECK: ldrb [[VALS_LO:w[0-9]+]], [x0] ; CHECK-NEXT: ldrb [[VALS_HI:w[0-9]+]], [x0, #1] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]] ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 -; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0 -; CHECK-NEXT: st1b { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d] +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 +; CHECK-NEXT: st1b { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <2 x i8>, <2 x i8>* %a %ptrs = load <2 x i8*>, <2 x i8*>* %b @@ -47,15 +48,16 @@ define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v4i8: ; CHECK: ldr s[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: ushll [[SHL:v[0-9]+]].8h, v[[VALS]].8b, #0 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, [[SHL]].4h, #0 -; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 -; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; CHECK-NEXT: st1b { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h +; CHECK-NEXT: uunpklo z[[UPK2:[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, z[[UPK2]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; CHECK-NEXT: st1b { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x i8>, <4 x i8>* %a %ptrs = load <4 x i8*>, <4 x i8*>* %b @@ -67,33 +69,38 @@ define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i8: ; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8b, v[[VALS]].8b, #0 -; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b +; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, z[[VALS]].b ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h ; VBITS_GE_512-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s -; VBITS_GE_512-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s +; VBITS_GE_512-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. ; VBITS_EQ_256-DAG: ldr d[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 -; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 ; VBITS_EQ_256-DAG: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0 -; VBITS_EQ_256-DAG: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; VBITS_EQ_256-DAG: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b -; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8 -; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8 -; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3] +; VBITS_EQ_256-DAG: zip1 v[[VAL_LO:[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: zip2 v[[VAL_HI:[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b +; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, v[[VAL_LO]].4h, #8 +; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, v[[VAL_HI]].4h, #8 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3] ; VBITS_EQ_256-DAG: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8 ; VBITS_EQ_256-DAG: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VAL_LO]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[VAL_HI]].h +; VBITS_EQ_256-DAG: uunpklo z[[UPK2_LO:[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo z[[UPK2_HI:[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, z[[UPK2_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, z[[UPK2_HI]].d, #0 ; VBITS_EQ_256-DAG: zip1 v[[VALS2_LO:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b ; VBITS_EQ_256-DAG: zip2 v[[VALS2_HI:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS2_LO]].h @@ -113,15 +120,17 @@ define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v16i8: ; VBITS_GE_1024: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_1024-NEXT: ptrue [[PG0:p[0-9]+]].d, vl16 -; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16 +; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16 +; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_1024-NEXT: cmeq v[[CMP:[0-9]+]].16b, v[[VALS]].16b, #0 -; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b +; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, z[[VALS]].b ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h -; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h ; VBITS_GE_1024-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s -; VBITS_GE_1024-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s +; VBITS_GE_1024-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x i8>, <16 x i8>* %a %ptrs = load <16 x i8*>, <16 x i8*>* %b @@ -136,11 +145,16 @@ ; VBITS_GE_2048-NEXT: ld1b { [[VALS:z[0-9]+]].b }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[VALS]].b +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].b, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[MONE]].b +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, [[VALS]].b ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h ; VBITS_GE_2048-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s -; VBITS_GE_2048-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK3]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s +; VBITS_GE_2048-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i8>, <32 x i8>* %a %ptrs = load <32 x i8*>, <32 x i8*>* %b @@ -158,13 +172,14 @@ ; CHECK: ldrh [[VALS_LO:w[0-9]+]], [x0] ; CHECK-NEXT: ldrh [[VALS_HI:w[0-9]+]], [x0, #2] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]] ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 -; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0 -; CHECK-NEXT: st1h { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d] +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 +; CHECK-NEXT: st1h { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <2 x i16>, <2 x i16>* %a %ptrs = load <2 x i16*>, <2 x i16*>* %b @@ -176,14 +191,15 @@ define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v4i16: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0 -; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 -; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h +; CHECK-NEXT: uunpklo z[[UPK2:[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, z[[UPK2]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x i16>, <4 x i16>* %a %ptrs = load <4 x i16*>, <4 x i16*>* %b @@ -195,28 +211,32 @@ define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8i16: ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_512-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. ; VBITS_EQ_256-DAG: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4 +; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4 -; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3] -; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4 +; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1] +; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3] ; VBITS_EQ_256-DAG: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0 ; VBITS_EQ_256-DAG: ext v[[EXT:[0-9]+]].16b, v[[VALS]].16b, v[[VALS]].16b, #8 ; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0 -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[ZMSK]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[ZEXT]].h +; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s +; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, [[UPK2_LO]].d, #0 +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, [[UPK2_HI]].d, #0 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS]].h ; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[EXT]].h ; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s @@ -237,10 +257,14 @@ ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 -; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_1024-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x i16>, <16 x i16>* %a %ptrs = load <16 x i16*>, <16 x i16*>* %b @@ -255,10 +279,14 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i16>, <32 x i16>* %a %ptrs = load <32 x i16*>, <32 x i16*>* %b @@ -275,11 +303,12 @@ ; CHECK-LABEL: masked_scatter_v2i32: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 -; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0 -; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d] +; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0 +; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0 +; CHECK-NEXT: st1w { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <2 x i32>, <2 x i32>* %a %ptrs = load <2 x i32*>, <2 x i32*>* %b @@ -291,13 +320,13 @@ define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v4i32: ; CHECK: ldr q[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0 -; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s -; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV:z[0-9]+]].d, z[[VALS]].s +; CHECK-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x i32>, <4 x i32>* %a %ptrs = load <4 x i32*>, <4 x i32*>* %b @@ -312,9 +341,12 @@ ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 -; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_512-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret ; Ensure sensible type legalisation. @@ -327,18 +359,19 @@ ; VBITS_EQ_256-DAG: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 ; VBITS_EQ_256-DAG: add x8, sp, #32 ; VBITS_EQ_256-DAG: mov x9, sp -; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, p1/z, #-1 +; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, [[MASK]]/z, #-1 ; VBITS_EQ_256-DAG: st1w { [[MONE]].s }, [[PG0]], [x8] ; VBITS_EQ_256-DAG: st1w { [[VALS]].s }, [[PG0]], [x9] ; VBITS_EQ_256-DAG: ldr q[[CMP_LO:[0-9]+]], [sp, #32] ; VBITS_EQ_256-DAG: ldr q[[VAL_LO:[0-9]+]], [sp] -; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4 -; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPKC_LO:z[0-9]+]].d, z[[CMP_LO]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG1]]/z, [[UPKC_LO]].d, #0 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].d, z[[VAL_LO]].s ; VBITS_EQ_256-DAG: st1w { [[UPK1_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d] ; VBITS_EQ_256-DAG: ldr q[[CMP_HI:[0-9]+]], [sp, #48] ; VBITS_EQ_256-DAG: ldr q[[VAL_HI:[0-9]+]], [sp, #16] -; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0 +; VBITS_EQ_256-DAG: uunpklo [[UPKC_HI:z[0-9]+]].d, z[[CMP_HI]].s +; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG1]]/z, [[UPKC_HI]].d, #0 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].d, z[[VAL_HI]].s ; VBITS_EQ_256-DAG: st1w { [[UPK1_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d] %vals = load <8 x i32>, <8 x i32>* %a @@ -354,9 +387,12 @@ ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 -; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_1024-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x i32>, <16 x i32>* %a %ptrs = load <16 x i32*>, <16 x i32*>* %b @@ -371,9 +407,12 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x i32>, <32 x i32>* %a %ptrs = load <32 x i32*>, <32 x i32*>* %b @@ -495,7 +534,7 @@ ; CHECK: ldr s[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: movi d2, #0000000000000000 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0.0 ; CHECK-NEXT: umov w8, v[[CMP]].h[0] ; CHECK-NEXT: umov w9, v[[CMP]].h[1] @@ -508,11 +547,13 @@ ; CHECK-NEXT: mov v[[NCMP:[0-9]+]].h[0], w9 ; CHECK-NEXT: mov v[[NCMP]].h[1], w8 ; CHECK-NEXT: shl v[[NCMP]].4h, v[[NCMP]].4h, #15 -; CHECK-NEXT: uunpklo [[UPK1]].s, z[[VALS]].h ; CHECK-NEXT: sshr v[[NCMP]].4h, v[[NCMP]].4h, #15 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG0]]/z, z[[NCMP]].h, #0 -; CHECK-NEXT: uunpklo [[UPK2]].d, [[UPK1]].s -; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], [z[[PTRS]].d] +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[NCMP]].h +; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h +; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, [[UPK2]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [z[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <2 x half>, <2 x half>* %a %ptrs = load <2 x half*>, <2 x half*>* %b @@ -524,14 +565,15 @@ define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v4f16: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0 -; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h ; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x half>, <4 x half>* %a %ptrs = load <4 x half*>, <4 x half*>* %b @@ -543,14 +585,15 @@ define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v8f16: ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0] -; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8 -; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8 +; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8 +; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; VBITS_GE_512-NEXT: fcmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0 -; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h -; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h +; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_512-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x half>, <8 x half>* %a %ptrs = load <8 x half*>, <8 x half*>* %b @@ -565,10 +608,14 @@ ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_1024-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x half>, <16 x half>* %a %ptrs = load <16 x half*>, <16 x half*>* %b @@ -583,10 +630,14 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s -; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s +; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %ptrs = load <32 x half*>, <32 x half*>* %b @@ -603,10 +654,11 @@ ; CHECK-LABEL: masked_scatter_v2f32: ; CHECK: ldr d[[VALS:[0-9]+]], [x0] ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2 +; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0 +; CHECK-NEXT: ushll v[[SHLC:[0-9]+]].2d, v[[CMP]].2s, #0 ; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0 +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHLC]].d, #0 ; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <2 x float>, <2 x float>* %a @@ -619,13 +671,13 @@ define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 { ; CHECK-LABEL: masked_scatter_v4f32: ; CHECK: ldr q[[VALS:[0-9]+]], [x0] -; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4 -; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1] -; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4 +; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4 +; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1] ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0 -; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0 -; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s -; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s +; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0 +; CHECK-NEXT: uunpklo [[UPKV:z[0-9]+]].d, z[[VALS]].s +; CHECK-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; CHECK-NEXT: ret %vals = load <4 x float>, <4 x float>* %a %ptrs = load <4 x float*>, <4 x float*>* %b @@ -640,9 +692,12 @@ ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 -; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1 +; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_512-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_512-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_512-NEXT: ret %vals = load <8 x float>, <8 x float>* %a %ptrs = load <8 x float*>, <8 x float*>* %b @@ -657,9 +712,12 @@ ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 -; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1 +; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_1024-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_1024-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_1024-NEXT: ret %vals = load <16 x float>, <16 x float>* %a %ptrs = load <16 x float*>, <16 x float*>* %b @@ -674,9 +732,12 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %ptrs = load <32 x float*>, <32 x float*>* %b @@ -785,9 +846,12 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -804,9 +868,12 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -823,9 +890,12 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -843,9 +913,12 @@ ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h -; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h +; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x half>, <32 x half>* %a %idxs = load <32 x i32>, <32 x i32>* %b @@ -863,9 +936,12 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i64>, <32 x i64>* %b @@ -881,9 +957,12 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d] +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[PTRS]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %idxs = load <32 x i64>, <32 x i64>* %b @@ -902,10 +981,13 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %bases = load <32 x i8*>, <32 x i8*>* %b @@ -924,10 +1006,13 @@ ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0] ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1] ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4 -; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 +; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d -; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s -; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] +; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1 +; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s +; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0 +; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s +; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d] ; VBITS_GE_2048-NEXT: ret %vals = load <32 x float>, <32 x float>* %a %bases = load <32 x i8*>, <32 x i8*>* %b