diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -12755,6 +12755,14 @@
       if (isSplatShuffle(II->getOperand(1)))
         Ops.push_back(&II->getOperandUse(1));
       return !Ops.empty();
+    case Intrinsic::aarch64_sve_ptest_first:
+    case Intrinsic::aarch64_sve_ptest_last:
+      // Sink a ptrue used as operand 0 of the ptest next to its user so that
+      // the ptest can be removed (see sve-ptest-removal-sink.ll).
+      if (auto *IIOp = dyn_cast<IntrinsicInst>(II->getOperand(0)))
+        if (IIOp->getIntrinsicID() == Intrinsic::aarch64_sve_ptrue)
+          Ops.push_back(&II->getOperandUse(0));
+      return !Ops.empty();
     case Intrinsic::aarch64_sme_write_horiz:
     case Intrinsic::aarch64_sme_write_vert:
     case Intrinsic::aarch64_sme_writeq_horiz:
diff --git a/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-ptest-removal-sink.ll
@@ -0,0 +1,46 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64 -mattr=+sve < %s | FileCheck %s
+
+;
+; Ensure that the %ptrue from the preheader is sunk into the loop such that the ptest is removed.
+;
+
+define void @test_sink_ptrue_into_ptest(ptr %a, ptr %b, i32 %n) {
+; CHECK-LABEL: test_sink_ptrue_into_ptest:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    whilelt p0.s, wzr, w2
+; CHECK-NEXT:    b.pl .LBB0_3
+; CHECK-NEXT:  // %bb.1: // %for.body.preheader
+; CHECK-NEXT:    mov w9, wzr
+; CHECK-NEXT:    cntw x8
+; CHECK-NEXT:  .LBB0_2: // %for.body
+; CHECK-NEXT:    // =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    add w10, w9, w8
+; CHECK-NEXT:    whilelt p0.s, w9, w2
+; CHECK-NEXT:    mov w9, w10
+; CHECK-NEXT:    b.mi .LBB0_2
+; CHECK-NEXT:  .LBB0_3: // %exit
+; CHECK-NEXT:    ret
+entry:
+  %vscale = tail call i32 @llvm.vscale.i32()
+  %step = shl nuw nsw i32 %vscale, 2
+  %ptrue.ph = tail call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 31)
+  %while.ph = tail call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 0, i32 %n)
+  %ptest.ph = tail call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %ptrue.ph, <vscale x 4 x i1> %while.ph)
+  br i1 %ptest.ph, label %for.body, label %exit
+
+for.body:
+  %i = phi i32 [ 0, %entry ], [ %i.next, %for.body ]
+  %i.next = add i32 %i, %step
+  %while = call <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32 %i, i32 %n)
+  %ptest = call i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1> %ptrue.ph, <vscale x 4 x i1> %while)
+  br i1 %ptest, label %for.body, label %exit
+
+exit:
+  ret void
+}
+
+declare i32 @llvm.vscale.i32()
+declare <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 immarg)
+declare <vscale x 4 x i1> @llvm.aarch64.sve.whilelt.nxv4i1.i32(i32, i32)
+declare i1 @llvm.aarch64.sve.ptest.first.nxv4i1(<vscale x 4 x i1>, <vscale x 4 x i1>)