Index: llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -76,6 +76,8 @@
   bool ReplaceConstByVPNOTs(MachineBasicBlock &MBB, MachineDominatorTree *DT);
   bool ConvertVPSEL(MachineBasicBlock &MBB);
   bool HintDoLoopStartReg(MachineBasicBlock &MBB);
+  MachineInstr *CheckForLRUseInPreheader(MachineBasicBlock *PreHeader,
+                                         MachineInstr *LoopStart);
 };
 
 char MVETPAndVPTOptimisations::ID = 0;
@@ -253,6 +255,55 @@
   return true;
 }
 
+// Return true if this instruction is invalid in a low overhead loop, usually
+// because it clobbers LR.
+static bool IsInvalidTPInstruction(MachineInstr &MI) {
+  return MI.isCall() || isLoopStart(MI);
+}
+
+// Starting from PreHeader, search for invalid instructions back until the
+// LoopStart block is reached. If invalid instructions are found, the loop start
+// is reverted from a WhileLoopStart to a DoLoopStart on the same loop. Will
+// return the new DLS LoopStart if updated, otherwise the original LoopStart.
+MachineInstr *
+MVETPAndVPTOptimisations::CheckForLRUseInPreheader(MachineBasicBlock *PreHeader,
+                                                   MachineInstr *LoopStart) {
+  SmallVector<MachineBasicBlock *> Worklist;
+  SmallPtrSet<MachineBasicBlock *, 4> Visited;
+  Worklist.push_back(PreHeader);
+  // Seed Visited with the LoopStart block so that the backwards walk over
+  // predecessors stops there and never scans it.
+  Visited.insert(LoopStart->getParent());
+
+  while (!Worklist.empty()) {
+    MachineBasicBlock *MBB = Worklist.pop_back_val();
+    if (Visited.count(MBB))
+      continue;
+
+    for (MachineInstr &MI : *MBB) {
+      if (IsInvalidTPInstruction(MI)) {
+        LLVM_DEBUG(dbgs() << "Found LR use in preheader, reverting: " << MI);
+
+        // Create a t2DoLoopStart at the end of the preheader.
+        MachineInstrBuilder MIB =
+            BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
+                    LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
+        MIB.add(LoopStart->getOperand(0));
+        MIB.add(LoopStart->getOperand(1));
+
+        // Revert the t2WhileLoopStartLR to a CMP and Br.
+        RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
+        return MIB;
+      }
+    }
+
+    Visited.insert(MBB);
+    for (auto *Pred : MBB->predecessors())
+      Worklist.push_back(Pred);
+  }
+  return LoopStart;
+}
+
 // This function converts loops with t2LoopEnd and t2LoopEnd instructions into
 // a single t2LoopEndDec instruction. To do that it needs to make sure that LR
 // will be valid to be used for the low overhead loop, which means nothing else
@@ -275,29 +326,13 @@
   // and if so revert it now before we get any further. While loops also need to
   // check the preheaders, but can be reverted to a DLS loop if needed.
   auto *PreHeader = ML->getLoopPreheader();
-  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader &&
-      LoopStart->getParent() != PreHeader) {
-    for (MachineInstr &MI : *PreHeader) {
-      if (MI.isCall()) {
-        // Create a t2DoLoopStart at the end of the preheader.
-        MachineInstrBuilder MIB =
-            BuildMI(*PreHeader, PreHeader->getFirstTerminator(),
-                    LoopStart->getDebugLoc(), TII->get(ARM::t2DoLoopStart));
-        MIB.add(LoopStart->getOperand(0));
-        MIB.add(LoopStart->getOperand(1));
-
-        // Revert the t2WhileLoopStartLR to a CMP and Br.
-        RevertWhileLoopStartLR(LoopStart, TII, ARM::t2Bcc, true);
-        LoopStart = MIB;
-        break;
-      }
-    }
-  }
+  if (LoopStart->getOpcode() == ARM::t2WhileLoopStartLR && PreHeader)
+    LoopStart = CheckForLRUseInPreheader(PreHeader, LoopStart);
 
   for (MachineBasicBlock *MBB : ML->blocks()) {
     for (MachineInstr &MI : *MBB) {
-      if (MI.isCall()) {
-        LLVM_DEBUG(dbgs() << "Found call in loop, reverting: " << MI);
+      if (IsInvalidTPInstruction(MI)) {
+        LLVM_DEBUG(dbgs() << "Found LR use in loop, reverting: " << MI);
         if (LoopStart->getOpcode() == ARM::t2DoLoopStart)
           RevertDoLoopStart(LoopStart, TII);
         else
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/memcall.ll
@@ -276,6 +276,62 @@
   br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
 }
 
+define void @test_memset_preheader(i8* %x, i8* %y, i32 %n) {
+; CHECK-LABEL: test_memset_preheader:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r7, lr}
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    cbz r2, .LBB6_5
+; CHECK-NEXT:  @ %bb.1: @ %prehead
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:    mov r3, r2
+; CHECK-NEXT:    wlstp.8 lr, r3, .LBB6_3
+; CHECK-NEXT:  .LBB6_2: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r12], #16
+; CHECK-NEXT:    letp lr, .LBB6_2
+; CHECK-NEXT:  .LBB6_3: @ %prehead
+; CHECK-NEXT:    dls lr, r2
+; CHECK-NEXT:    mov r12, r0
+; CHECK-NEXT:  .LBB6_4: @ %for.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    ldrb r3, [r12], #1
+; CHECK-NEXT:    strb r3, [r1], #1
+; CHECK-NEXT:    le lr, .LBB6_4
+; CHECK-NEXT:  .LBB6_5: @ %for.cond.cleanup
+; CHECK-NEXT:    vmov.i32 q0, #0x0
+; CHECK-NEXT:    wlstp.8 lr, r2, .LBB6_7
+; CHECK-NEXT:  .LBB6_6: @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vstrb.8 q0, [r0], #16
+; CHECK-NEXT:    letp lr, .LBB6_6
+; CHECK-NEXT:  .LBB6_7: @ %for.cond.cleanup
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  %cmp6 = icmp ne i32 %n, 0
+  br i1 %cmp6, label %prehead, label %for.cond.cleanup
+
+prehead:
+  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  br label %for.body
+
+for.body:                                         ; preds = %prehead, %for.body
+  %i.09 = phi i32 [ %inc, %for.body ], [ 0, %prehead ]
+  %x.addr.08 = phi i8* [ %add.ptr, %for.body ], [ %x, %prehead ]
+  %y.addr.07 = phi i8* [ %add.ptr1, %for.body ], [ %y, %prehead ]
+  %add.ptr = getelementptr inbounds i8, i8* %x.addr.08, i32 1
+  %add.ptr1 = getelementptr inbounds i8, i8* %y.addr.07, i32 1
+  %l = load i8, i8* %x.addr.08
+  store i8 %l, i8* %y.addr.07
+  %inc = add nuw nsw i32 %i.09, 1
+  %exitcond.not = icmp eq i32 %inc, %n
+  br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+
+for.cond.cleanup:                                 ; preds = %for.body, %entry
+  call void @llvm.memset.p0i8.i32(i8* %x, i8 0, i32 %n, i1 false)
+  ret void
+}
+
+
 declare void @llvm.memcpy.p0i8.p0i8.i32(i8* noalias nocapture writeonly, i8* noalias nocapture readonly, i32, i1 immarg)
 declare void @llvm.memset.p0i8.i32(i8* nocapture writeonly, i8, i32, i1 immarg)