diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td --- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -1531,6 +1531,14 @@ def : Pat<(nxv8i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), (PUNPKHI_PP PPR:$Ps)>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; + def : Pat<(nxv1i1 (extract_subvector (nxv4i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), @@ -1539,7 +1547,6 @@ (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))>; def : Pat<(nxv2i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; - def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))>; def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), @@ -1549,6 +1556,23 @@ def : Pat<(nxv4i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))>; + + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv8i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps)))>; def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), @@ -1566,6 +1590,39 @@ def : Pat<(nxv2i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps)))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 0))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 1))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 2))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 3))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 4))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 5))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 6))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 7))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 8))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 9))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 10))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 11))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 12))), + (PUNPKLO_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 13))), + (PUNPKHI_PP (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 14))), + (PUNPKLO_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + def : Pat<(nxv1i1 (extract_subvector (nxv16i1 PPR:$Ps), (i64 15))), + (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP (PUNPKHI_PP PPR:$Ps))))>; + // Extract subvectors from FP SVE vectors def : Pat<(nxv2f16 (extract_subvector (nxv4f16 ZPR:$Zs), (i64 0))), (UUNPKLO_ZZ_D ZPR:$Zs)>; diff --git a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-extract-scalable-vector.ll @@ -1101,4 +1101,340 @@ ret %res } +; +; Extract nxv1i1 type from: nxv4i1 +; + +define @extract_nxv1i1_nxv4i1_0( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv4i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv4i1( %in, i64 0) + ret %res +} + +define @extract_nxv1i1_nxv4i1_1( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv4i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv4i1( %in, i64 1) + ret %res +} + +define @extract_nxv1i1_nxv4i1_2( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv4i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv4i1( %in, i64 2) + ret %res +} + +define @extract_nxv1i1_nxv4i1_3( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv4i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv4i1( %in, i64 3) + ret %res +} + +; +; Extract nxv1i1 type from: nxv8i1 +; + +define @extract_nxv1i1_nxv8i1_0( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 0) + ret %res +} + +define @extract_nxv1i1_nxv8i1_1( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 1) + ret %res +} + +define @extract_nxv1i1_nxv8i1_2( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 2) + ret %res +} + +define @extract_nxv1i1_nxv8i1_3( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 3) + ret %res +} + +define @extract_nxv1i1_nxv8i1_4( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_4: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 4) + ret %res +} + +define @extract_nxv1i1_nxv8i1_5( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_5: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 5) + ret %res +} + +define @extract_nxv1i1_nxv8i1_6( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_6: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 6) + ret %res +} + +define @extract_nxv1i1_nxv8i1_7( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv8i1_7: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv8i1( %in, i64 7) + ret %res +} + + +; +; Extract nxv1i1 type from: nxv16i1 +; + +define @extract_nxv1i1_nxv16i1_0( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 0) + ret %res +} + +define @extract_nxv1i1_nxv16i1_1( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 1) + ret %res +} + +define @extract_nxv1i1_nxv16i1_2( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 2) + ret %res +} + +define @extract_nxv1i1_nxv16i1_3( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 3) + ret %res +} + +define @extract_nxv1i1_nxv16i1_4( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_4: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 4) + ret %res +} + +define @extract_nxv1i1_nxv16i1_5( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_5: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 5) + ret %res +} + +define @extract_nxv1i1_nxv16i1_6( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_6: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 6) + ret %res +} + +define @extract_nxv1i1_nxv16i1_7( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_7: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 7) + ret %res +} + +define @extract_nxv1i1_nxv16i1_8( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_8: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 8) + ret %res +} + +define @extract_nxv1i1_nxv16i1_9( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_9: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 9) + ret %res +} + +define @extract_nxv1i1_nxv16i1_10( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_10: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 10) + ret %res +} + +define @extract_nxv1i1_nxv16i1_11( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_11: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 11) + ret %res +} + +define @extract_nxv1i1_nxv16i1_12( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_12: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 12) + ret %res +} + +define @extract_nxv1i1_nxv16i1_13( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_13: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 13) + ret %res +} + +define @extract_nxv1i1_nxv16i1_14( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_14: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 14) + ret %res +} + +define @extract_nxv1i1_nxv16i1_15( %in) { +; CHECK-LABEL: extract_nxv1i1_nxv16i1_15: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: ret + %res = call @llvm.vector.extract.nxv1i1.nxv16i1( %in, i64 15) + ret %res +} + declare @llvm.vector.extract.nxv1i1.nxv2i1(, i64) +declare @llvm.vector.extract.nxv1i1.nxv4i1(, i64) +declare @llvm.vector.extract.nxv1i1.nxv8i1(, i64) +declare @llvm.vector.extract.nxv1i1.nxv16i1(, i64) diff --git a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll --- a/llvm/test/CodeGen/AArch64/sve-insert-vector.ll +++ b/llvm/test/CodeGen/AArch64/sve-insert-vector.ll @@ -708,6 +708,670 @@ ret %v0 } +; +; Insert nxv1i1 type into: nxv2i1 +; + +define @insert_nxv1i1_nxv2i1_0( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv2i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p0.d, p1.d, p0.d +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv2i1.nxv1i1( %vec, %sv, i64 0) + ret %res +} + +define @insert_nxv1i1_nxv2i1_1( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv2i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p0.d, p0.d, p1.d +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv2i1.nxv1i1( %vec, %sv, i64 1) + ret %res +} + +; +; Insert nxv1i1 type into: nxv4i1 +; + +define @insert_nxv1i1_nxv4i1_0( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv4i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p2.d +; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4i1.nxv1i1( %vec, %sv, i64 0) + ret %res +} + +define @insert_nxv1i1_nxv4i1_1( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv4i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: uzp1 p1.d, p2.d, p1.d +; CHECK-NEXT: uzp1 p0.s, p1.s, p0.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4i1.nxv1i1( %vec, %sv, i64 1) + ret %res +} + +define @insert_nxv1i1_nxv4i1_2( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv4i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p2.d +; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4i1.nxv1i1( %vec, %sv, i64 2) + ret %res +} + +define @insert_nxv1i1_nxv4i1_3( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv4i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: uzp1 p1.d, p2.d, p1.d +; CHECK-NEXT: uzp1 p0.s, p0.s, p1.s +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv4i1.nxv1i1( %vec, %sv, i64 3) + ret %res +} + +; +; Insert nxv1i1 type into: nxv8i1 +; + +define @insert_nxv1i1_nxv8i1_0( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 0) + ret %res +} + +define @insert_nxv1i1_nxv8i1_1( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 1) + ret %res +} + +define @insert_nxv1i1_nxv8i1_2( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 2) + ret %res +} + +define @insert_nxv1i1_nxv8i1_3( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p0.h, p1.h, p0.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 3) + ret %res +} + +define @insert_nxv1i1_nxv8i1_4( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_4: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 4) + ret %res +} + +define @insert_nxv1i1_nxv8i1_5( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_5: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d +; CHECK-NEXT: uzp1 p1.s, p1.s, p2.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 5) + ret %res +} + +define @insert_nxv1i1_nxv8i1_6( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_6: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p3.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 6) + ret %res +} + +define @insert_nxv1i1_nxv8i1_7( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv8i1_7: +; CHECK: // %bb.0: +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: uzp1 p1.d, p3.d, p1.d +; CHECK-NEXT: uzp1 p1.s, p2.s, p1.s +; CHECK-NEXT: uzp1 p0.h, p0.h, p1.h +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv8i1.nxv1i1( %vec, %sv, i64 7) + ret %res +} + +; +; Insert nxv1i1 type into: nxv16i1 +; + +define @insert_nxv1i1_nxv16i1_0( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_0: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 0) + ret %res +} + +define @insert_nxv1i1_nxv16i1_1( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_1: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 1) + ret %res +} + +define @insert_nxv1i1_nxv16i1_2( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_2: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 2) + ret %res +} + +define @insert_nxv1i1_nxv16i1_3( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_3: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 3) + ret %res +} + +define @insert_nxv1i1_nxv16i1_4( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_4: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 4) + ret %res +} + +define @insert_nxv1i1_nxv16i1_5( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_5: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 5) + ret %res +} + +define @insert_nxv1i1_nxv16i1_6( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_6: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 6) + ret %res +} + +define @insert_nxv1i1_nxv16i1_7( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_7: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpklo p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpkhi p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p1.b, p0.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 7) + ret %res +} + +define @insert_nxv1i1_nxv16i1_8( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 8) + ret %res +} + +define @insert_nxv1i1_nxv16i1_9( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_9: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 9) + ret %res +} + +define @insert_nxv1i1_nxv16i1_10( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_10: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 10) + ret %res +} + +define @insert_nxv1i1_nxv16i1_11( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_11: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpklo p3.h, p2.b +; CHECK-NEXT: punpkhi p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p1.h, p2.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 11) + ret %res +} + +define @insert_nxv1i1_nxv16i1_12( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_12: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 12) + ret %res +} + +define @insert_nxv1i1_nxv16i1_13( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_13: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpklo p4.h, p3.b +; CHECK-NEXT: punpkhi p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p1.s, p3.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 13) + ret %res +} + +define @insert_nxv1i1_nxv16i1_14( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_14: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpkhi p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p1.d, p4.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 14) + ret %res +} + +define @insert_nxv1i1_nxv16i1_15( %vec, %sv) { +; CHECK-LABEL: insert_nxv1i1_nxv16i1_15: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x08, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 8 * VG +; CHECK-NEXT: punpkhi p2.h, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: punpkhi p3.h, p2.b +; CHECK-NEXT: punpklo p2.h, p2.b +; CHECK-NEXT: punpkhi p4.h, p3.b +; CHECK-NEXT: punpklo p3.h, p3.b +; CHECK-NEXT: punpklo p4.h, p4.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: uzp1 p1.d, p4.d, p1.d +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: uzp1 p1.s, p3.s, p1.s +; CHECK-NEXT: uzp1 p1.h, p2.h, p1.h +; CHECK-NEXT: uzp1 p0.b, p0.b, p1.b +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %res = call @llvm.vector.insert.nxv16i1.nxv1i1( %vec, %sv, i64 15) + ret %res +} + attributes #0 = { vscale_range(2,2) } declare @llvm.vector.insert.nxv16i8.v16i8(, <16 x i8>, i64) @@ -747,6 +1411,10 @@ declare @llvm.vector.insert.nxv2i1.v8i1(, <8 x i1>, i64) declare @llvm.vector.insert.nxv4i1.v16i1(, <16 x i1>, i64) declare @llvm.vector.insert.nxv8i1.v32i1(, <32 x i1>, i64) +declare @llvm.vector.insert.nxv16i1.nxv1i1(, , i64) +declare @llvm.vector.insert.nxv8i1.nxv1i1(, , i64) +declare @llvm.vector.insert.nxv4i1.nxv1i1(, , i64) +declare @llvm.vector.insert.nxv2i1.nxv1i1(, , i64) declare @llvm.vector.insert.nx16i1.nxv4i1(, , i64) declare @llvm.vector.insert.nx16i1.nxv8i1(, , i64) declare @llvm.vector.insert.nxv16i1.v64i1(, <64 x i1>, i64)