diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -1222,17 +1222,17 @@
   // Extract lo/hi halves of legal predicate types.
   def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))),
-            (ZIP1_PPP_S PPR:$Ps, (PFALSE))>;
+            (PUNPKLO_PP PPR:$Ps)>;
   def : Pat<(nxv2i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))),
-            (ZIP2_PPP_S PPR:$Ps, (PFALSE))>;
+            (PUNPKHI_PP PPR:$Ps)>;
   def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))),
-            (ZIP1_PPP_H PPR:$Ps, (PFALSE))>;
+            (PUNPKLO_PP PPR:$Ps)>;
   def : Pat<(nxv4i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))),
-            (ZIP2_PPP_H PPR:$Ps, (PFALSE))>;
+            (PUNPKHI_PP PPR:$Ps)>;
   def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))),
-            (ZIP1_PPP_B PPR:$Ps, (PFALSE))>;
+            (PUNPKLO_PP PPR:$Ps)>;
   def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))),
-            (ZIP2_PPP_B PPR:$Ps, (PFALSE))>;
+            (PUNPKHI_PP PPR:$Ps)>;

   // Extract subvectors from FP SVE vectors
   def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))),
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-gather-legalize.ll
@@ -80,10 +80,9 @@
 define @masked_gather_nxv4f16( %ptrs, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv4f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.d }, p1/z, [z1.d]
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
 ; CHECK-NEXT: ret
@@ -106,16 +105,15 @@
 define @masked_gather_nxv8f16( %ptrs, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv8f16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p2.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1h { z3.d }, p3/z, [z3.d]
-; CHECK-NEXT: ld1h { z2.d }, p2/z, [z2.d]
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1h { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z3.d }, p2/z, [z3.d]
+; CHECK-NEXT: ld1h { z2.d }, p1/z, [z2.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.d }, p1/z, [z1.d]
 ; CHECK-NEXT: ld1h { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT: uzp1 z2.s, z2.s, z3.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
@@ -128,12 +126,11 @@
 define @masked_gather_nxv8bf16(bfloat* %base, %indices, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv8bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
 ; CHECK-NEXT: sunpkhi z1.s, z0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
+; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: ld1h { z1.s }, p2/z, [x0, z1.s, sxtw #1]
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0, z0.s, sxtw #1]
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
 ; CHECK-NEXT: ret
@@ -148,12 +145,11 @@
 ; CHECK-NEXT: ptrue p1.s
 ; CHECK-NEXT: movprfx z1, z0
 ; CHECK-NEXT: sxth z1.s, p1/m, z0.s
-; CHECK-NEXT: pfalse p1.b
 ; CHECK-NEXT: sunpklo z0.d, z1.s
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: sunpkhi z1.d, z1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0, z0.d, lsl #3]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, z0.d, lsl #3]
 ; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0, z1.d, lsl #3]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr double, double* %base, %indices
@@ -164,10 +160,9 @@
 define @masked_gather_nxv8f32(float* %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0, z0.s, uxtw #2]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, z0.s, uxtw #2]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, z1.s, uxtw #2]
 ; CHECK-NEXT: ret
   %offsets.zext = zext %offsets to
@@ -180,23 +175,22 @@
 define @masked_gather_nxv16i8(i8* %base, %indices, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.b, p0.b, p1.b
 ; CHECK-NEXT: sunpkhi z1.h, z0.b
-; CHECK-NEXT: zip2 p3.h, p2.h, p1.h
+; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: sunpkhi z2.s, z1.h
-; CHECK-NEXT: zip1 p2.h, p2.h, p1.h
+; CHECK-NEXT: punpkhi p2.h, p1.b
 ; CHECK-NEXT: sunpklo z1.s, z1.h
-; CHECK-NEXT: ld1sb { z2.s }, p3/z, [x0, z2.s, sxtw]
-; CHECK-NEXT: ld1sb { z1.s }, p2/z, [x0, z1.s, sxtw]
-; CHECK-NEXT: zip1 p0.b, p0.b, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: ld1sb { z2.s }, p2/z, [x0, z2.s, sxtw]
+; CHECK-NEXT: ld1sb { z1.s }, p1/z, [x0, z1.s, sxtw]
 ; CHECK-NEXT: sunpklo z0.h, z0.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: punpkhi p1.h, p0.b
 ; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
 ; CHECK-NEXT: sunpkhi z2.s, z0.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
 ; CHECK-NEXT: sunpklo z0.s, z0.h
-; CHECK-NEXT: ld1sb { z2.s }, p2/z, [x0, z2.s, sxtw]
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z2.s }, p1/z, [x0, z2.s, sxtw]
 ; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0, z0.s, sxtw]
 ; CHECK-NEXT: uzp1 z0.h, z0.h, z2.h
 ; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
@@ -210,33 +204,26 @@
 define @masked_gather_nxv32i32(i32* %base, %indices, %mask) #0 {
 ; CHECK-LABEL: masked_gather_nxv32i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p4.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p3.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: ld1w { z0.s }, p4/z, [x0, z0.s, sxtw #2]
-; CHECK-NEXT: ld1w { z1.s }, p3/z, [x0, z1.s, sxtw #2]
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: ld1w { z2.s }, p3/z, [x0, z2.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p3/z, [x0, z0.s, sxtw #2]
+; CHECK-NEXT: ld1w { z1.s }, p2/z, [x0, z1.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z2.s }, p2/z, [x0, z2.s, sxtw #2]
 ; CHECK-NEXT: ld1w { z3.s }, p0/z, [x0, z3.s, sxtw #2]
-; CHECK-NEXT: zip1 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: ld1w { z4.s }, p3/z, [x0, z4.s, sxtw #2]
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z4.s }, p2/z, [x0, z4.s, sxtw #2]
 ; CHECK-NEXT: ld1w { z5.s }, p0/z, [x0, z5.s, sxtw #2]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
+; CHECK-NEXT: punpkhi p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: ld1w { z6.s }, p1/z, [x0, z6.s, sxtw #2]
 ; CHECK-NEXT: ld1w { z7.s }, p0/z, [x0, z7.s, sxtw #2]
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i32, i32* %base, %indices
   %data = call @llvm.masked.gather.nxv32i32( %ptrs, i32 4, %mask, undef)
@@ -250,10 +237,9 @@
 define @masked_sgather_nxv4i8( %ptrs, %mask) #0 {
 ; CHECK-LABEL: masked_sgather_nxv4i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1sb { z1.d }, p2/z, [z1.d]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z1.d }, p1/z, [z1.d]
 ; CHECK-NEXT: ld1sb { z0.d }, p0/z, [z0.d]
 ; CHECK-NEXT: ptrue p0.s
 ; CHECK-NEXT: uzp1 z0.s, z0.s, z1.s
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter-legalize.ll
@@ -9,28 +9,27 @@
 define void @masked_scatter_nxv16i8( %data, i8* %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv16i8:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.b, p0.b, p1.b
 ; CHECK-NEXT: sunpklo z2.h, z1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: uunpklo z4.h, z0.b
-; CHECK-NEXT: zip1 p3.h, p2.h, p1.h
 ; CHECK-NEXT: sunpklo z3.s, z2.h
+; CHECK-NEXT: punpklo p2.h, p1.b
 ; CHECK-NEXT: uunpklo z5.s, z4.h
-; CHECK-NEXT: st1b { z5.s }, p3, [x0, z3.s, sxtw]
-; CHECK-NEXT: zip2 p2.h, p2.h, p1.h
+; CHECK-NEXT: st1b { z5.s }, p2, [x0, z3.s, sxtw]
 ; CHECK-NEXT: sunpkhi z2.s, z2.h
+; CHECK-NEXT: punpkhi p1.h, p1.b
 ; CHECK-NEXT: uunpkhi z3.s, z4.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p1.b
 ; CHECK-NEXT: sunpkhi z1.h, z1.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpkhi z0.h, z0.b
-; CHECK-NEXT: st1b { z3.s }, p2, [x0, z2.s, sxtw]
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
+; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
 ; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1b { z3.s }, p2, [x0, z2.s, sxtw]
+; CHECK-NEXT: st1b { z3.s }, p1, [x0, z2.s, sxtw]
 ; CHECK-NEXT: st1b { z0.s }, p0, [x0, z1.s, sxtw]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i8, i8* %base, %offsets
@@ -41,14 +40,13 @@
 define void @masked_scatter_nxv8i16( %data, i16* %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
 ; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1h { z3.s }, p2, [x0, z2.s, sxtw #1]
+; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i16, i16* %base, %offsets
@@ -59,14 +57,13 @@
 define void @masked_scatter_nxv8bf16( %data, bfloat* %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8bf16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
 ; CHECK-NEXT: sunpklo z2.s, z1.h
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: uunpklo z3.s, z0.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
 ; CHECK-NEXT: sunpkhi z1.s, z1.h
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: uunpkhi z0.s, z0.h
-; CHECK-NEXT: st1h { z3.s }, p2, [x0, z2.s, sxtw #1]
+; CHECK-NEXT: st1h { z3.s }, p1, [x0, z2.s, sxtw #1]
 ; CHECK-NEXT: st1h { z0.s }, p0, [x0, z1.s, sxtw #1]
 ; CHECK-NEXT: ret
   %ptrs = getelementptr bfloat, bfloat* %base, %offsets
@@ -77,10 +74,9 @@
 define void @masked_scatter_nxv8f32( %data, float* %base, %indexes, %masks) #0 {
 ; CHECK-LABEL: masked_scatter_nxv8f32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: st1w { z0.s }, p2, [x0, z2.s, uxtw #2]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.s }, p1, [x0, z2.s, uxtw #2]
 ; CHECK-NEXT: st1w { z1.s }, p0, [x0, z3.s, uxtw #2]
 ; CHECK-NEXT: ret
   %ext = zext %indexes to
@@ -93,9 +89,6 @@
 define void @masked_scatter_nxv32i32( %data, i32* %base, %offsets, %mask) #0 {
 ; CHECK-LABEL: masked_scatter_nxv32i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-1
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
 ; CHECK-NEXT: ptrue p2.s
 ; CHECK-NEXT: ld1w { z24.s }, p2/z, [x1, #7, mul vl]
 ; CHECK-NEXT: ld1w { z25.s }, p2/z, [x1, #6, mul vl]
@@ -105,30 +98,26 @@
 ; CHECK-NEXT: ld1w { z29.s }, p2/z, [x1, #2, mul vl]
 ; CHECK-NEXT: ld1w { z30.s }, p2/z, [x1, #1, mul vl]
 ; CHECK-NEXT: ld1w { z31.s }, p2/z, [x1]
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p4.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p3.h, p3.h, p2.h
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: st1w { z0.s }, p4, [x0, z31.s, sxtw #2]
-; CHECK-NEXT: st1w { z1.s }, p3, [x0, z30.s, sxtw #2]
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: st1w { z2.s }, p3, [x0, z29.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p2.b
+; CHECK-NEXT: punpkhi p2.h, p2.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.s }, p3, [x0, z31.s, sxtw #2]
+; CHECK-NEXT: st1w { z1.s }, p2, [x0, z30.s, sxtw #2]
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z2.s }, p2, [x0, z29.s, sxtw #2]
 ; CHECK-NEXT: st1w { z3.s }, p0, [x0, z28.s, sxtw #2]
-; CHECK-NEXT: zip1 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p3.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
-; CHECK-NEXT: st1w { z4.s }, p3, [x0, z27.s, sxtw #2]
+; CHECK-NEXT: punpklo p0.h, p1.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z4.s }, p2, [x0, z27.s, sxtw #2]
 ; CHECK-NEXT: st1w { z5.s }, p0, [x0, z26.s, sxtw #2]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.h, p0.h, p2.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p2.h
+; CHECK-NEXT: punpkhi p0.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: st1w { z6.s }, p1, [x0, z25.s, sxtw #2]
 ; CHECK-NEXT: st1w { z7.s }, p0, [x0, z24.s, sxtw #2]
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
 ; CHECK-NEXT: ret
   %ptrs = getelementptr i32, i32* %base, %offsets
   call void @llvm.masked.scatter.nxv32i32( %data, %ptrs, i32 4, %mask)
diff --git a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
--- a/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
+++ b/llvm/test/CodeGen/AArch64/sve-masked-scatter.ll
@@ -76,11 +76,10 @@
 define void @masked_scatter_splat_constant_pointer ( %pg) {
 ; CHECK-LABEL: masked_scatter_splat_constant_pointer:
 ; CHECK: // %bb.0: // %vector.body
-; CHECK-NEXT: pfalse p1.b
+; CHECK-NEXT: punpklo p1.h, p0.b
 ; CHECK-NEXT: mov z0.d, #0 // =0x0
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: st1w { z0.d }, p2, [x8, z0.d, lsl #2]
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: st1w { z0.d }, p1, [x8, z0.d, lsl #2]
 ; CHECK-NEXT: st1w { z0.d }, p0, [x8, z0.d, lsl #2]
 ; CHECK-NEXT: ret
 vector.body:
diff --git a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-fcvt.ll
@@ -305,11 +305,10 @@
 define @scvtf_d_nxv4i1( %a) {
 ; CHECK-LABEL: scvtf_d_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p3.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: ptrue p2.d
-; CHECK-NEXT: mov z0.d, p3/z, #-1 // =0xffffffffffffffff
+; CHECK-NEXT: mov z0.d, p1/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: mov z1.d, p0/z, #-1 // =0xffffffffffffffff
 ; CHECK-NEXT: scvtf z0.d, p2/m, z0.d
 ; CHECK-NEXT: scvtf z1.d, p2/m, z1.d
@@ -366,11 +365,10 @@
 define @ucvtf_d_nxv4i1( %a) {
 ; CHECK-LABEL: ucvtf_d_nxv4i1:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p3.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
 ; CHECK-NEXT: ptrue p2.d
-; CHECK-NEXT: mov z0.d, p3/z, #1 // =0x1
+; CHECK-NEXT: mov z0.d, p1/z, #1 // =0x1
 ; CHECK-NEXT: mov z1.d, p0/z, #1 // =0x1
 ; CHECK-NEXT: ucvtf z0.d, p2/m, z0.d
 ; CHECK-NEXT: ucvtf z1.d, p2/m, z1.d
diff --git a/llvm/test/CodeGen/AArch64/sve-split-load.ll b/llvm/test/CodeGen/AArch64/sve-split-load.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-load.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-load.ll
@@ -90,15 +90,14 @@
 define @masked_load_split_32i16( *%a, %pg) {
 ; CHECK-LABEL: masked_load_split_32i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip1 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip2 p0.b, p0.b, p2.b
-; CHECK-NEXT: ld1h { z0.h }, p3/z, [x0]
-; CHECK-NEXT: zip1 p3.b, p1.b, p2.b
+; CHECK-NEXT: punpklo p2.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p3.h, p1.b
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: ld1h { z0.h }, p2/z, [x0]
 ; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0, #1, mul vl]
-; CHECK-NEXT: zip2 p0.b, p1.b, p2.b
 ; CHECK-NEXT: ld1h { z2.h }, p3/z, [x0, #2, mul vl]
-; CHECK-NEXT: ld1h { z3.h }, p0/z, [x0, #3, mul vl]
+; CHECK-NEXT: ld1h { z3.h }, p1/z, [x0, #3, mul vl]
 ; CHECK-NEXT: ret
   %load = call @llvm.masked.load.nxv32i16( *%a, i32 1, %pg, undef)
   ret %load
@@ -107,10 +106,9 @@
 define @masked_load_split_8i32( *%a, %pg) {
 ; CHECK-LABEL: masked_load_split_8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: ld1w { z0.s }, p2/z, [x0]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0]
 ; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0, #1, mul vl]
 ; CHECK-NEXT: ret
   %load = call @llvm.masked.load.nxv8i32( *%a, i32 1, %pg, undef)
@@ -120,16 +118,15 @@
 define @masked_load_split_8i64( *%a, %pg) {
 ; CHECK-LABEL: masked_load_split_8i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip1 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p0.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip2 p2.s, p2.s, p1.s
-; CHECK-NEXT: ld1d { z0.d }, p3/z, [x0]
-; CHECK-NEXT: ld1d { z1.d }, p2/z, [x0, #1, mul vl]
-; CHECK-NEXT: zip1 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip2 p0.s, p0.s, p1.s
-; CHECK-NEXT: ld1d { z2.d }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: punpklo p2.h, p1.b
+; CHECK-NEXT: punpkhi p1.h, p1.b
+; CHECK-NEXT: ld1d { z0.d }, p2/z, [x0]
+; CHECK-NEXT: ld1d { z1.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: punpklo p1.h, p0.b
+; CHECK-NEXT: punpkhi p0.h, p0.b
+; CHECK-NEXT: ld1d { z2.d }, p1/z, [x0, #2, mul vl]
 ; CHECK-NEXT: ld1d { z3.d }, p0/z, [x0, #3, mul vl]
 ; CHECK-NEXT: ret
   %load = call @llvm.masked.load.nxv8i64( *%a, i32 1, %pg, undef)
diff --git a/llvm/test/CodeGen/AArch64/sve-split-store.ll b/llvm/test/CodeGen/AArch64/sve-split-store.ll
--- a/llvm/test/CodeGen/AArch64/sve-split-store.ll
+++ b/llvm/test/CodeGen/AArch64/sve-split-store.ll
@@ -78,12 +78,11 @@
 define void @masked_store_split_32i16( %data, *%a, %pg) {
 ; CHECK-LABEL: masked_store_split_32i16:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p2.b
-; CHECK-NEXT: zip2 p3.b, p1.b, p2.b
-; CHECK-NEXT: zip1 p1.b, p1.b, p2.b
-; CHECK-NEXT: st1h { z3.h }, p3, [x0, #3, mul vl]
-; CHECK-NEXT: zip2 p3.b, p0.b, p2.b
-; CHECK-NEXT: zip1 p0.b, p0.b, p2.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: punpkhi p3.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1h { z3.h }, p2, [x0, #3, mul vl]
 ; CHECK-NEXT: st1h { z2.h }, p1, [x0, #2, mul vl]
 ; CHECK-NEXT: st1h { z1.h }, p3, [x0, #1, mul vl]
 ; CHECK-NEXT: st1h { z0.h }, p0, [x0]
@@ -95,10 +94,9 @@
 define void @masked_store_split_8i32( %data, *%a, %pg) {
 ; CHECK-LABEL: masked_store_split_8i32:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: st1w { z1.s }, p2, [x0, #1, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1w { z1.s }, p1, [x0, #1, mul vl]
 ; CHECK-NEXT: st1w { z0.s }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.masked.store.nxv8i32( %data, *%a, i32 1, %pg)
@@ -108,16 +106,15 @@
 define void @masked_store_split_8i64( %data, *%a, %pg) {
 ; CHECK-LABEL: masked_store_split_8i64:
 ; CHECK: // %bb.0:
-; CHECK-NEXT: pfalse p1.b
-; CHECK-NEXT: zip2 p2.h, p0.h, p1.h
-; CHECK-NEXT: zip1 p0.h, p0.h, p1.h
-; CHECK-NEXT: zip2 p3.s, p2.s, p1.s
-; CHECK-NEXT: zip1 p2.s, p2.s, p1.s
-; CHECK-NEXT: st1d { z3.d }, p3, [x0, #3, mul vl]
-; CHECK-NEXT: st1d { z2.d }, p2, [x0, #2, mul vl]
-; CHECK-NEXT: zip2 p2.s, p0.s, p1.s
-; CHECK-NEXT: zip1 p0.s, p0.s, p1.s
-; CHECK-NEXT: st1d { z1.d }, p2, [x0, #1, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: punpkhi p2.h, p1.b
+; CHECK-NEXT: punpklo p1.h, p1.b
+; CHECK-NEXT: st1d { z3.d }, p2, [x0, #3, mul vl]
+; CHECK-NEXT: st1d { z2.d }, p1, [x0, #2, mul vl]
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: st1d { z1.d }, p1, [x0, #1, mul vl]
 ; CHECK-NEXT: st1d { z0.d }, p0, [x0]
 ; CHECK-NEXT: ret
   call void @llvm.masked.store.nxv8i64( %data, *%a, i32 1, %pg)