diff --git a/llvm/include/llvm/IR/IntrinsicsAArch64.td b/llvm/include/llvm/IR/IntrinsicsAArch64.td
--- a/llvm/include/llvm/IR/IntrinsicsAArch64.td
+++ b/llvm/include/llvm/IR/IntrinsicsAArch64.td
@@ -2773,6 +2773,10 @@
                               [llvm_aarch64_svcount_ty, llvm_i32_ty],
                               [IntrNoMem, ImmArg<ArgIndex<1>>]>;
+  def int_aarch64_sve_pext_x2
+      : DefaultAttrsIntrinsic<[llvm_anyvector_ty, LLVMMatchType<0>],
+                              [llvm_aarch64_svcount_ty, llvm_i32_ty],
+                              [IntrNoMem, ImmArg<ArgIndex<1>>]>;
 
   def int_aarch64_sve_ptrue_c8
       : DefaultAttrsIntrinsic<[llvm_aarch64_svcount_ty], [], [IntrNoMem]>;
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -372,6 +372,7 @@
                                      bool IsIntr = false);
   void SelectDestructiveMultiIntrinsic(SDNode *N, unsigned NumVecs,
                                        bool IsZmMulti, unsigned Opcode);
+  void SelectPExtPair(SDNode *N, unsigned Opc);
   void SelectWhilePair(SDNode *N, unsigned Opc);
   void SelectCVTIntrinsic(SDNode *N, unsigned NumVecs, unsigned Opcode);
   void SelectClamp(SDNode *N, unsigned NumVecs, unsigned Opcode);
@@ -1652,6 +1653,28 @@
   return (Opcodes.size() <= Offset) ? 0 : Opcodes[Offset];
 }
 
+// This function is almost identical to SelectWhilePair, but has an
+// extra check on the range of the immediate operand.
+// TODO: Merge these two functions together at some point?
+void AArch64DAGToDAGISel::SelectPExtPair(SDNode *N, unsigned Opc) {
+  // Immediate can be either 0 or 1.
+  if (ConstantSDNode *Imm = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+    if (Imm->getZExtValue() > 1)
+      return;
+
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue Ops[] = {N->getOperand(1), N->getOperand(2)};
+  SDNode *WhilePair = CurDAG->getMachineNode(Opc, DL, MVT::Untyped, Ops);
+  SDValue SuperReg = SDValue(WhilePair, 0);
+
+  for (unsigned I = 0; I < 2; ++I)
+    ReplaceUses(SDValue(N, I), CurDAG->getTargetExtractSubreg(
+                                   AArch64::psub0 + I, DL, VT, SuperReg));
+
+  CurDAG->RemoveDeadNode(N);
+}
+
 void AArch64DAGToDAGISel::SelectWhilePair(SDNode *N, unsigned Opc) {
   SDLoc DL(N);
   EVT VT = N->getValueType(0);
@@ -5359,6 +5382,14 @@
                  AArch64::UUNPK_VG4_4Z2Z_D}))
           SelectUnaryMultiIntrinsic(Node, 4, /*IsTupleInput=*/true, Op);
         return;
+      case Intrinsic::aarch64_sve_pext_x2: {
+        if (auto Op = SelectOpcodeFromVT<SelectTypeKind::AnyType>(
+                Node->getValueType(0),
+                {AArch64::PEXT_2PCI_B, AArch64::PEXT_2PCI_H, AArch64::PEXT_2PCI_S,
+                 AArch64::PEXT_2PCI_D}))
+          SelectPExtPair(Node, Op);
+        return;
+      }
       }
       break;
     }
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-predicate-as-counter.ll
@@ -71,6 +71,75 @@
 declare <vscale x 4 x i1> @llvm.aarch64.sve.pext.nxv4i1(target("aarch64.svcount"), i32)
 declare <vscale x 2 x i1> @llvm.aarch64.sve.pext.nxv2i1(target("aarch64.svcount"), i32)
 
+define {<vscale x 16 x i1>, <vscale x 16 x i1>} @pext_x2_b(target("aarch64.svcount") %x) nounwind {
+; CHECK-LABEL: pext_x2_b:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    pext { p0.b, p1.b }, pn8[1]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") %x, i32 1)
+  ret {<vscale x 16 x i1>, <vscale x 16 x i1>} %res
+}
+
+define {<vscale x 8 x i1>, <vscale x 8 x i1>} @pext_x2_h(target("aarch64.svcount") %x) nounwind {
+; CHECK-LABEL: pext_x2_h:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    pext { p0.h, p1.h }, pn8[1]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount") %x, i32 1)
+  ret {<vscale x 8 x i1>, <vscale x 8 x i1>} %res
+}
+
+define {<vscale x 4 x i1>, <vscale x 4 x i1>} @pext_x2_s(target("aarch64.svcount") %x) nounwind {
+; CHECK-LABEL: pext_x2_s:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    pext { p0.s, p1.s }, pn8[1]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount") %x, i32 1)
+  ret {<vscale x 4 x i1>, <vscale x 4 x i1>} %res
+}
+
+define {<vscale x 2 x i1>, <vscale x 2 x i1>} @pext_x2_d(target("aarch64.svcount") %x) nounwind {
+; CHECK-LABEL: pext_x2_d:
+; CHECK:       // %bb.0:
+; CHECK-NEXT:    str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT:    addvl sp, sp, #-1
+; CHECK-NEXT:    str p8, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT:    mov p8.b, p0.b
+; CHECK-NEXT:    pext { p0.d, p1.d }, pn8[1]
+; CHECK-NEXT:    ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT:    addvl sp, sp, #1
+; CHECK-NEXT:    ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT:    ret
+  %res = call {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount") %x, i32 1)
+  ret {<vscale x 2 x i1>, <vscale x 2 x i1>} %res
+}
+
+declare {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount"), i32)
+declare {<vscale x 8 x i1>, <vscale x 8 x i1>} @llvm.aarch64.sve.pext.x2.nxv8i1(target("aarch64.svcount"), i32)
+declare {<vscale x 4 x i1>, <vscale x 4 x i1>} @llvm.aarch64.sve.pext.x2.nxv4i1(target("aarch64.svcount"), i32)
+declare {<vscale x 2 x i1>, <vscale x 2 x i1>} @llvm.aarch64.sve.pext.x2.nxv2i1(target("aarch64.svcount"), i32)
+
 define target("aarch64.svcount") @ptrue_b() nounwind {
 ; CHECK-LABEL: ptrue_b:
 ; CHECK:       // %bb.0:
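
Note (reviewer aid, not part of the patch): a minimal IR sketch of how a caller consumes the paired result of the new intrinsic. The function name @pext_x2_use and the final `and` are illustrative assumptions only; the immediate operand must be 0 or 1, matching the guard in SelectPExtPair.

define <vscale x 16 x i1> @pext_x2_use(target("aarch64.svcount") %pn) {
  ; Extract both predicates produced by the x2 intrinsic, then combine them.
  %pair = call {<vscale x 16 x i1>, <vscale x 16 x i1>} @llvm.aarch64.sve.pext.x2.nxv16i1(target("aarch64.svcount") %pn, i32 0)
  %p0 = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pair, 0
  %p1 = extractvalue {<vscale x 16 x i1>, <vscale x 16 x i1>} %pair, 1
  %r = and <vscale x 16 x i1> %p0, %p1
  ret <vscale x 16 x i1> %r
}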