Index: llvm/lib/CodeGen/ReachingDefAnalysis.cpp =================================================================== --- llvm/lib/CodeGen/ReachingDefAnalysis.cpp +++ llvm/lib/CodeGen/ReachingDefAnalysis.cpp @@ -433,7 +433,6 @@ if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI)) return LocalDef; - SmallPtrSet VisitedBBs; SmallPtrSet Incoming; MachineBasicBlock *Parent = MI->getParent(); for (auto *Pred : Parent->predecessors()) Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -233,12 +233,51 @@ return isPredicatedOnVCTP(Insts.front(), Exclusive); } - static bool isValid() { + // If this block begins with a VPT, we can check whether it's using + // at least one predicated input, with any other input being loop invariant, + // which would result in it being implicitly predicated. + static bool hasImplicitlyValidVPT(VPTBlock &Block, + ReachingDefAnalysis &RDA) { + SmallVectorImpl &Insts = Block.getInsts(); + MachineInstr *VPT = Insts.front(); + if (VPT->getOpcode() == ARM::MVE_VPST) + return false; + + auto IsOperandPredicated = [&](MachineInstr *MI, unsigned Idx) { + MachineInstr *Op = RDA.getMIOperand(MI, MI->getOperand(Idx)); + return Op && PredicatedInsts.count(Op) && isPredicatedOnVCTP(Op); + }; + + auto IsOperandInvariant = [&](MachineInstr *MI, unsigned Idx) { + MachineOperand &MO = MI->getOperand(Idx); + if (!MO.isReg() || !MO.getReg()) + return true; + + SmallPtrSet Defs; + RDA.getGlobalReachingDefs(MI, MO.getReg(), Defs); + if (Defs.empty()) + return true; + + for (auto *Def : Defs) + if (Def->getParent() == VPT->getParent()) + return false; + return true; + }; + + // Check that at least one of the operands is directly predicated on a + // vctp; the other operand may instead be an invariant value.
+ return (IsOperandPredicated(VPT, 1) || IsOperandPredicated(VPT, 2)) && + (IsOperandPredicated(VPT, 1) || IsOperandInvariant(VPT, 1)) && + (IsOperandPredicated(VPT, 2) || IsOperandInvariant(VPT, 2)); + } + + static bool isValid(ReachingDefAnalysis &RDA) { // All predication within the loop should be based on vctp. If the block // isn't predicated on entry, check whether the vctp is within the block // and that all other instructions are then predicated on it. for (auto &Block : Blocks) { - if (isEntryPredicatedOnVCTP(Block)) + if (isEntryPredicatedOnVCTP(Block, false) || + hasImplicitlyValidVPT(Block, RDA)) continue; SmallVectorImpl &Insts = Block.getInsts(); @@ -489,7 +528,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) { assert(!VCTPs.empty() && "VCTP instruction expected but is not set"); - if (!VPTBlock::isValid()) + if (!VPTBlock::isValid(RDA)) return false; if (!ValidateLiveOuts()) { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll @@ -7,23 +7,13 @@ ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: ldrd r5, r12, [sp, #80] -; CHECK-NEXT: cmp.w r12, #4 -; CHECK-NEXT: mov r4, r12 ; CHECK-NEXT: vmvn.i32 q0, #0x80000000 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge r4, #4 ; CHECK-NEXT: vmov.i32 q1, #0x3f -; CHECK-NEXT: sub.w r4, r12, r4 ; CHECK-NEXT: vmov.i32 q2, #0x1 -; CHECK-NEXT: add.w lr, r4, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: add.w lr, r4, lr, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB0_1: @ %bb6 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q4, [r1], #16 +; CHECK-NEXT: vldrw.u32 q4, [r1], #16 ; CHECK-NEXT: vabs.s32 q5, q4 ; CHECK-NEXT: vcls.s32 q3, 
q5 ; CHECK-NEXT: vshl.u32 q5, q5, q3 @@ -41,15 +31,11 @@ ; CHECK-NEXT: vqshl.s32 q5, q5, #1 ; CHECK-NEXT: vpt.s32 lt, q4, zr ; CHECK-NEXT: vnegt.s32 q5, q5 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 ; CHECK-NEXT: vqrdmulh.s32 q4, q4, q5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vstrwt.32 q4, [r2], #16 -; CHECK-NEXT: vstrwt.32 q3, [r3], #16 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: vstrw.32 q4, [r2], #16 +; CHECK-NEXT: vstrw.32 q3, [r3], #16 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %bb44 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -593,26 +593,17 @@ ; CHECK: bb.1.vector.ph: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2 - ; CHECK: renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 ; CHECK: bb.2.vector.body: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3 - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 
0, $noreg - ; CHECK: MVE_VPST 8, implicit $vpr - ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr - ; CHECK: MVE_VPTv4s32r 14, renamable $q1, renamable $r2, 10, implicit-def $vpr + ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg + ; CHECK: MVE_VPTv4s32r 12, renamable $q1, renamable $r2, 10, implicit-def $vpr ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r1, 2, killed renamable $vpr ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr - ; CHECK: renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.for.cond.cleanup: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc ;