diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -4452,6 +4452,9 @@
         ISD::ANY_EXTEND, DL,
         VT.changeVectorElementType(IndexVT.getVectorElementType()), StoreVal);
     StoreVal = convertToScalableVector(DAG, IndexVT, StoreVal);
+    Mask = DAG.getNode(
+        ISD::ZERO_EXTEND, DL,
+        VT.changeVectorElementType(IndexVT.getVectorElementType()), Mask);
   } else if (VT.isFloatingPoint()) {
     // Handle FP data by casting the data so an integer scatter can be used.
     EVT StoreValVT = getPackedSVEVectorVT(VT.getVectorElementCount());
diff --git a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
--- a/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-fixed-length-masked-scatter.ll
@@ -29,13 +29,14 @@
 ; CHECK: ldrb [[VALS_LO:w[0-9]+]], [x0]
 ; CHECK-NEXT: ldrb [[VALS_HI:w[0-9]+]], [x0, #1]
 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
+; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]]
 ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]]
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
-; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
-; CHECK-NEXT: st1b { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
+; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0
+; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0
+; CHECK-NEXT: st1b { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <2 x i8>, <2 x i8>* %a
   %ptrs = load <2 x i8*>, <2 x i8*>* %b
@@ -47,15 +48,16 @@
 define void @masked_scatter_v4i8(<4 x i8>* %a, <4 x i8*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v4i8:
 ; CHECK: ldr s[[VALS:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; CHECK-NEXT: ushll [[SHL:v[0-9]+]].8h, v[[VALS]].8b, #0
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, [[SHL]].4h, #0
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: st1b { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h
+; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
+; CHECK-NEXT: uunpklo z[[UPK2:[0-9]+]].d, [[UPK1]].s
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, z[[UPK2]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; CHECK-NEXT: st1b { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <4 x i8>, <4 x i8>* %a
   %ptrs = load <4 x i8*>, <4 x i8*>* %b
@@ -67,33 +69,38 @@
 define void @masked_scatter_v8i8(<8 x i8>* %a, <8 x i8*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v8i8:
 ; VBITS_GE_512: ldr d[[VALS:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].b, vl8
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8b, v[[VALS]].8b, #0
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b
+; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b
+; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, z[[VALS]].b
 ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
-; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h
 ; VBITS_GE_512-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
-; VBITS_GE_512-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s
+; VBITS_GE_512-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_512-NEXT: ret
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-DAG: ldr d[[VALS:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
 ; VBITS_EQ_256-DAG: cmeq [[ZMSK:v[0-9]+]].8b, v[[VALS]].8b, #0
-; VBITS_EQ_256-DAG: zip1 [[VAL_LO:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
-; VBITS_EQ_256-DAG: zip2 [[VAL_HI:v[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
-; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, [[VAL_LO]].4h, #8
-; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, [[VAL_HI]].4h, #8
-; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3]
+; VBITS_EQ_256-DAG: zip1 v[[VAL_LO:[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
+; VBITS_EQ_256-DAG: zip2 v[[VAL_HI:[0-9]+]].8b, [[ZMSK]].8b, v[[VALS]].8b
+; VBITS_EQ_256-DAG: shl [[SHL_LO:v[0-9]+]].4h, v[[VAL_LO]].4h, #8
+; VBITS_EQ_256-DAG: shl [[SHL_HI:v[0-9]+]].4h, v[[VAL_HI]].4h, #8
+; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
+; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
 ; VBITS_EQ_256-DAG: sshr v[[SSHR_LO:[0-9]+]].4h, [[SHL_LO]].4h, #8
 ; VBITS_EQ_256-DAG: sshr v[[SSHR_HI:[0-9]+]].4h, [[SHL_HI]].4h, #8
-; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_LO]].h, #0
-; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[SSHR_HI]].h, #0
+; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VAL_LO]].h
+; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[VAL_HI]].h
+; VBITS_EQ_256-DAG: uunpklo z[[UPK2_LO:[0-9]+]].d, [[UPK1_LO]].s
+; VBITS_EQ_256-DAG: uunpklo z[[UPK2_HI:[0-9]+]].d, [[UPK1_HI]].s
+; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, z[[UPK2_LO]].d, #0
+; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, z[[UPK2_HI]].d, #0
 ; VBITS_EQ_256-DAG: zip1 v[[VALS2_LO:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b
 ; VBITS_EQ_256-DAG: zip2 v[[VALS2_HI:[0-9]+]].8b, v[[VALS]].8b, v[[VALS]].8b
 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS2_LO]].h
@@ -113,15 +120,17 @@
 define void @masked_scatter_v16i8(<16 x i8>* %a, <16 x i8*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v16i8:
 ; VBITS_GE_1024: ldr q[[VALS:[0-9]+]], [x0]
-; VBITS_GE_1024-NEXT: ptrue [[PG0:p[0-9]+]].d, vl16
-; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].b, vl16
+; VBITS_GE_1024-NEXT: ptrue [[PG:p[0-9]+]].d, vl16
+; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; VBITS_GE_1024-NEXT: cmeq v[[CMP:[0-9]+]].16b, v[[VALS]].16b, #0
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[VALS]].b
+; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].h, z[[CMP]].b
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, z[[VALS]].b
 ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
-; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].b, [[PG1]]/z, z[[CMP]].b, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h
 ; VBITS_GE_1024-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
-; VBITS_GE_1024-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK3]].d, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s
+; VBITS_GE_1024-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_1024-NEXT: ret
   %vals = load <16 x i8>, <16 x i8>* %a
   %ptrs = load <16 x i8*>, <16 x i8*>* %b
@@ -136,11 +145,16 @@
 ; VBITS_GE_2048-NEXT: ld1b { [[VALS:z[0-9]+]].b }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[VALS]].b
+; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].b, [[PG0]]/z, [[VALS]].b, #0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].b, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].h, [[MONE]].b
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].h, [[VALS]].b
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].s, [[UPK1]].h
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].s, [[UPKV1]].h
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK3:z[0-9]+]].d, [[UPK2]].s
-; VBITS_GE_2048-NEXT: st1b { [[UPK3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK3]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV3:z[0-9]+]].d, [[UPKV2]].s
+; VBITS_GE_2048-NEXT: st1b { [[UPKV3]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x i8>, <32 x i8>* %a
   %ptrs = load <32 x i8*>, <32 x i8*>* %b
@@ -158,13 +172,14 @@
 ; CHECK: ldrh [[VALS_LO:w[0-9]+]], [x0]
 ; CHECK-NEXT: ldrh [[VALS_HI:w[0-9]+]], [x0, #2]
 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
+; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
 ; CHECK-NEXT: fmov s[[VALS:[0-9]+]], [[VALS_LO]]
 ; CHECK-NEXT: mov v[[VALS]].s[1], [[VALS_HI]]
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
-; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
-; CHECK-NEXT: st1h { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
+; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0
+; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0
+; CHECK-NEXT: st1h { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <2 x i16>, <2 x i16>* %a
   %ptrs = load <2 x i16*>, <2 x i16*>* %b
@@ -176,14 +191,15 @@
 define void @masked_scatter_v4i16(<4 x i16>* %a, <4 x i16*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v4i16:
 ; CHECK: ldr d[[VALS:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
-; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h
+; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
+; CHECK-NEXT: uunpklo z[[UPK2:[0-9]+]].d, [[UPK1]].s
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, z[[UPK2]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <4 x i16>, <4 x i16>* %a
   %ptrs = load <4 x i16*>, <4 x i16*>* %b
@@ -195,28 +211,32 @@
 define void @masked_scatter_v8i16(<8 x i16>* %a, <8 x i16*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v8i16:
 ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; VBITS_GE_512-NEXT: cmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
-; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h
+; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
 ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_512-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_512-NEXT: ret
 ; Ensure sensible type legalisation.
 ; VBITS_EQ_256-DAG: ldr q[[VALS:[0-9]+]], [x0]
-; VBITS_EQ_256-DAG: ptrue [[PG0:p[0-9]+]].d, vl4
+; VBITS_EQ_256-DAG: ptrue [[PG:p[0-9]+]].d, vl4
 ; VBITS_EQ_256-DAG: mov x[[NUMELTS:[0-9]+]], #4
-; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG0]]/z, [x1, x[[NUMELTS]], lsl #3]
-; VBITS_EQ_256-DAG: ptrue [[PG1:p[0-9]+]].h, vl4
+; VBITS_EQ_256-DAG: ld1d { [[PTRS_LO:z[0-9]+]].d }, [[PG]]/z, [x1]
+; VBITS_EQ_256-DAG: ld1d { [[PTRS_HI:z[0-9]+]].d }, [[PG]]/z, [x1, x[[NUMELTS]], lsl #3]
 ; VBITS_EQ_256-DAG: cmeq v[[ZMSK:[0-9]+]].8h, v[[VALS]].8h, #0
 ; VBITS_EQ_256-DAG: ext v[[EXT:[0-9]+]].16b, v[[VALS]].16b, v[[VALS]].16b, #8
 ; VBITS_EQ_256-DAG: ext v[[ZEXT:[0-9]+]].16b, v[[ZMSK]].16b, v[[ZMSK]].16b, #8
-; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].h, [[PG1]]/z, z[[ZMSK]].h, #0
-; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].h, [[PG1]]/z, z[[ZEXT]].h, #0
+; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[ZMSK]].h
+; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[ZEXT]].h
+; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
+; VBITS_EQ_256-DAG: uunpklo [[UPK2_HI:z[0-9]+]].d, [[UPK1_HI]].s
+; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG]]/z, [[UPK2_LO]].d, #0
+; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG]]/z, [[UPK2_HI]].d, #0
 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].s, z[[VALS]].h
 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].s, z[[EXT]].h
 ; VBITS_EQ_256-DAG: uunpklo [[UPK2_LO:z[0-9]+]].d, [[UPK1_LO]].s
@@ -237,10 +257,14 @@
 ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
+; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
+; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
 ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_1024-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_1024-NEXT: ret
   %vals = load <16 x i16>, <16 x i16>* %a
   %ptrs = load <16 x i16*>, <16 x i16*>* %b
@@ -255,10 +279,14 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x i16>, <32 x i16>* %a
   %ptrs = load <32 x i16*>, <32 x i16*>* %b
@@ -275,11 +303,12 @@
 ; CHECK-LABEL: masked_scatter_v2i32:
 ; CHECK: ldr d[[VALS:[0-9]+]], [x0]
 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
+; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
-; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
-; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
+; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[CMP]].2s, #0
+; CHECK-NEXT: ushll v[[SHL2:[0-9]+]].2d, v[[VALS]].2s, #0
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHL]].d, #0
+; CHECK-NEXT: st1w { z[[SHL2]].d }, [[MASK]], [z[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <2 x i32>, <2 x i32>* %a
   %ptrs = load <2 x i32*>, <2 x i32*>* %b
@@ -291,13 +320,13 @@
 define void @masked_scatter_v4i32(<4 x i32>* %a, <4 x i32*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v4i32:
 ; CHECK: ldr q[[VALS:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; CHECK-NEXT: cmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s
-; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV:z[0-9]+]].d, z[[VALS]].s
+; CHECK-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <4 x i32>, <4 x i32>* %a
   %ptrs = load <4 x i32*>, <4 x i32*>* %b
@@ -312,9 +341,12 @@
 ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_512-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
+; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
+; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_512-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_512-NEXT: ret
 ; Ensure sensible type legalisation.
@@ -327,18 +359,19 @@
 ; VBITS_EQ_256-DAG: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
 ; VBITS_EQ_256-DAG: add x8, sp, #32
 ; VBITS_EQ_256-DAG: mov x9, sp
-; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, p1/z, #-1
+; VBITS_EQ_256-DAG: mov [[MONE:z[0-9]+]].s, [[MASK]]/z, #-1
 ; VBITS_EQ_256-DAG: st1w { [[MONE]].s }, [[PG0]], [x8]
 ; VBITS_EQ_256-DAG: st1w { [[VALS]].s }, [[PG0]], [x9]
 ; VBITS_EQ_256-DAG: ldr q[[CMP_LO:[0-9]+]], [sp, #32]
 ; VBITS_EQ_256-DAG: ldr q[[VAL_LO:[0-9]+]], [sp]
-; VBITS_EQ_256-DAG: ptrue [[PG2:p[0-9]+]].s, vl4
-; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].s, [[PG2]]/z, z[[CMP_LO]].s, #0
+; VBITS_EQ_256-DAG: uunpklo [[UPKC_LO:z[0-9]+]].d, z[[CMP_LO]].s
+; VBITS_EQ_256-DAG: cmpne [[MASK_LO:p[0-9]+]].d, [[PG1]]/z, [[UPKC_LO]].d, #0
 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_LO:z[0-9]+]].d, z[[VAL_LO]].s
 ; VBITS_EQ_256-DAG: st1w { [[UPK1_LO]].d }, [[MASK_LO]], {{\[}}[[PTRS_LO]].d]
 ; VBITS_EQ_256-DAG: ldr q[[CMP_HI:[0-9]+]], [sp, #48]
 ; VBITS_EQ_256-DAG: ldr q[[VAL_HI:[0-9]+]], [sp, #16]
-; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].s, [[PG2]]/z, z[[CMP_HI]].s, #0
+; VBITS_EQ_256-DAG: uunpklo [[UPKC_HI:z[0-9]+]].d, z[[CMP_HI]].s
+; VBITS_EQ_256-DAG: cmpne [[MASK_HI:p[0-9]+]].d, [[PG1]]/z, [[UPKC_HI]].d, #0
 ; VBITS_EQ_256-DAG: uunpklo [[UPK1_HI:z[0-9]+]].d, z[[VAL_HI]].s
 ; VBITS_EQ_256-DAG: st1w { [[UPK1_HI]].d }, [[MASK_HI]], {{\[}}[[PTRS_HI]].d]
   %vals = load <8 x i32>, <8 x i32>* %a
@@ -354,9 +387,12 @@
 ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_1024-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
+; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
+; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_1024-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_1024-NEXT: ret
   %vals = load <16 x i32>, <16 x i32>* %a
   %ptrs = load <16 x i32*>, <16 x i32*>* %b
@@ -371,9 +407,12 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: cmpeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_2048-NEXT: cmpeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[CMP]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x i32>, <32 x i32>* %a
   %ptrs = load <32 x i32*>, <32 x i32*>* %b
@@ -495,7 +534,7 @@
 ; CHECK: ldr s[[VALS:[0-9]+]], [x0]
 ; CHECK-NEXT: movi d2, #0000000000000000
 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].h, vl4
+; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0.0
 ; CHECK-NEXT: umov w8, v[[CMP]].h[0]
 ; CHECK-NEXT: umov w9, v[[CMP]].h[1]
@@ -508,11 +547,13 @@
 ; CHECK-NEXT: mov v[[NCMP:[0-9]+]].h[0], w9
 ; CHECK-NEXT: mov v[[NCMP]].h[1], w8
 ; CHECK-NEXT: shl v[[NCMP]].4h, v[[NCMP]].4h, #15
-; CHECK-NEXT: uunpklo [[UPK1]].s, z[[VALS]].h
 ; CHECK-NEXT: sshr v[[NCMP]].4h, v[[NCMP]].4h, #15
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG0]]/z, z[[NCMP]].h, #0
-; CHECK-NEXT: uunpklo [[UPK2]].d, [[UPK1]].s
-; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], [z[[PTRS]].d]
+; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[NCMP]].h
+; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
+; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, [[UPK2]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], [z[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <2 x half>, <2 x half>* %a
   %ptrs = load <2 x half*>, <2 x half*>* %b
@@ -524,14 +565,15 @@
 define void @masked_scatter_v4f16(<4 x half>* %a, <4 x half*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v4f16:
 ; CHECK: ldr d[[VALS:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].h, vl4
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4h, v[[VALS]].4h, #0
-; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
+; CHECK-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h
+; CHECK-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
 ; CHECK-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; CHECK-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; CHECK-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <4 x half>, <4 x half>* %a
   %ptrs = load <4 x half*>, <4 x half*>* %b
@@ -543,14 +585,15 @@
 define void @masked_scatter_v8f16(<8 x half>* %a, <8 x half*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v8f16:
 ; VBITS_GE_512: ldr q[[VALS:[0-9]+]], [x0]
-; VBITS_GE_512-NEXT: ptrue [[PG0:p[0-9]+]].d, vl8
-; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].h, vl8
+; VBITS_GE_512-NEXT: ptrue [[PG:p[0-9]+]].d, vl8
+; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; VBITS_GE_512-NEXT: fcmeq v[[CMP:[0-9]+]].8h, v[[VALS]].8h, #0
-; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[VALS]].h
-; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].h, [[PG1]]/z, z[[CMP]].h, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPK1:z[0-9]+]].s, z[[CMP]].h
+; VBITS_GE_512-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, z[[VALS]].h
 ; VBITS_GE_512-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_512-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK2]].d, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_512-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_512-NEXT: ret
   %vals = load <8 x half>, <8 x half>* %a
   %ptrs = load <8 x half*>, <8 x half*>* %b
@@ -565,10 +608,14 @@
 ; VBITS_GE_1024-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
+; VBITS_GE_1024-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
 ; VBITS_GE_1024-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_1024-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_1024-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_1024-NEXT: ret
   %vals = load <16 x half>, <16 x half>* %a
   %ptrs = load <16 x half*>, <16 x half*>* %b
@@ -583,10 +630,14 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[CMP]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK1:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV1:z[0-9]+]].s, [[VALS]].h
 ; VBITS_GE_2048-NEXT: uunpklo [[UPK2:z[0-9]+]].d, [[UPK1]].s
-; VBITS_GE_2048-NEXT: st1h { [[UPK2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK2]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV2:z[0-9]+]].d, [[UPKV1]].s
+; VBITS_GE_2048-NEXT: st1h { [[UPKV2]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x half>, <32 x half>* %a
   %ptrs = load <32 x half*>, <32 x half*>* %b
@@ -603,10 +654,11 @@
 ; CHECK-LABEL: masked_scatter_v2f32:
 ; CHECK: ldr d[[VALS:[0-9]+]], [x0]
 ; CHECK-NEXT: ldr q[[PTRS:[0-9]+]], [x1]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].s, vl2
+; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl2
 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].2s, v[[VALS]].2s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG0]]/z, z[[CMP]].s, #0
+; CHECK-NEXT: ushll v[[SHLC:[0-9]+]].2d, v[[CMP]].2s, #0
 ; CHECK-NEXT: ushll v[[SHL:[0-9]+]].2d, v[[VALS]].2s, #0
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG0]]/z, z[[SHLC]].d, #0
 ; CHECK-NEXT: st1w { z[[SHL]].d }, [[MASK]], [z[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <2 x float>, <2 x float>* %a
@@ -619,13 +671,13 @@
 define void @masked_scatter_v4f32(<4 x float>* %a, <4 x float*>* %b) #0 {
 ; CHECK-LABEL: masked_scatter_v4f32:
 ; CHECK: ldr q[[VALS:[0-9]+]], [x0]
-; CHECK-NEXT: ptrue [[PG0:p[0-9]+]].d, vl4
-; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG0]]/z, [x1]
-; CHECK-NEXT: ptrue [[PG1:p[0-9]+]].s, vl4
+; CHECK-NEXT: ptrue [[PG:p[0-9]+]].d, vl4
+; CHECK-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG]]/z, [x1]
 ; CHECK-NEXT: fcmeq v[[CMP:[0-9]+]].4s, v[[VALS]].4s, #0
-; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, z[[CMP]].s, #0
-; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[VALS]].s
-; CHECK-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; CHECK-NEXT: uunpklo [[UPK:z[0-9]+]].d, z[[CMP]].s
+; CHECK-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG]]/z, [[UPK]].d, #0
+; CHECK-NEXT: uunpklo [[UPKV:z[0-9]+]].d, z[[VALS]].s
+; CHECK-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; CHECK-NEXT: ret
   %vals = load <4 x float>, <4 x float>* %a
   %ptrs = load <4 x float*>, <4 x float*>* %b
@@ -640,9 +692,12 @@
 ; VBITS_GE_512-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_512-NEXT: ptrue [[PG1:p[0-9]+]].d, vl8
 ; VBITS_GE_512-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_512-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
-; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_512-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_512-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_512-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1
+; VBITS_GE_512-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_512-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_512-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_512-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_512-NEXT: ret
   %vals = load <8 x float>, <8 x float>* %a
   %ptrs = load <8 x float*>, <8 x float*>* %b
@@ -657,9 +712,12 @@
 ; VBITS_GE_1024-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_1024-NEXT: ptrue [[PG1:p[0-9]+]].d, vl16
 ; VBITS_GE_1024-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_1024-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
-; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_1024-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_1024-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_1024-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1
+; VBITS_GE_1024-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_1024-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_1024-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_1024-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_1024-NEXT: ret
   %vals = load <16 x float>, <16 x float>* %a
   %ptrs = load <16 x float*>, <16 x float*>* %b
@@ -674,9 +732,12 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[UPK]].d }, [[MASK]], {{\[}}[[PTRS]].d]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]]].s, [[CMP]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x float>, <32 x float>* %a
   %ptrs = load <32 x float*>, <32 x float*>* %b
@@ -785,9 +846,12 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
-; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw #1]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -804,9 +868,12 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
-; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw #1]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -823,9 +890,12 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
-; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, sxtw]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -843,9 +913,12 @@
 ; VBITS_GE_2048-NEXT: ld1h { [[VALS:z[0-9]+]].h }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].s, vl32
 ; VBITS_GE_2048-NEXT: ld1w { [[PTRS:z[0-9]+]].s }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[VALS]].h
-; VBITS_GE_2048-NEXT: st1h { [[VALS]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].h, [[PG0]]/z, [[VALS]].h, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].h, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].s, [[MONE]].h
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].s, [[PG1]]/z, [[UPK]].s, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].s, [[VALS]].h
+; VBITS_GE_2048-NEXT: st1h { [[UPKV]].s }, [[MASK]], [x2, [[PTRS]].s, uxtw]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x half>, <32 x half>* %a
   %idxs = load <32 x i32>, <32 x i32>* %b
@@ -863,9 +936,12 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[PTRS]].d, lsl #2]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
@@ -881,9 +957,12 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ptrue [[PG1:p[0-9]+]].d, vl32
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], [x2, [[PTRS]].d]
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], [x2, [[PTRS]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x float>, <32 x float>* %a
   %idxs = load <32 x i64>, <32 x i64>* %b
@@ -902,10 +981,13 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
 ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, x2
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b
@@ -924,10 +1006,13 @@
 ; VBITS_GE_2048-NEXT: ld1w { [[VALS:z[0-9]+]].s }, [[PG0]]/z, [x0]
 ; VBITS_GE_2048-NEXT: ld1d { [[PTRS:z[0-9]+]].d }, [[PG1]]/z, [x1]
 ; VBITS_GE_2048-NEXT: mov [[OFF:z[0-9]+]].d, #4
-; VBITS_GE_2048-NEXT: fcmeq [[MASK:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
+; VBITS_GE_2048-NEXT: fcmeq [[CMP:p[0-9]+]].s, [[PG0]]/z, [[VALS]].s, #0.0
 ; VBITS_GE_2048-NEXT: add [[PTRS_ADD:z[0-9]+]].d, [[PG1]]/m, [[PTRS]].d, [[OFF]].d
-; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[VALS]].s
-; VBITS_GE_2048-NEXT: st1w { [[VALS]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
+; VBITS_GE_2048-NEXT: mov [[MONE:z[0-9]+]].s, [[PG0]]/z, #-1
+; VBITS_GE_2048-NEXT: uunpklo [[UPK:z[0-9]+]].d, [[MONE]].s
+; VBITS_GE_2048-NEXT: cmpne [[MASK:p[0-9]+]].d, [[PG1]]/z, [[UPK]].d, #0
+; VBITS_GE_2048-NEXT: uunpklo [[UPKV:z[0-9]+]].d, [[VALS]].s
+; VBITS_GE_2048-NEXT: st1w { [[UPKV]].d }, [[MASK]], {{\[}}[[PTRS_ADD]].d]
 ; VBITS_GE_2048-NEXT: ret
   %vals = load <32 x float>, <32 x float>* %a
   %bases = load <32 x i8*>, <32 x i8*>* %b