Index: llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp =================================================================== --- llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp +++ llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp @@ -59,7 +59,7 @@ } private: - bool RevertLoopWithCall(MachineLoop *ML); + bool MergeLoopEnd(MachineLoop *ML); bool ConvertTailPredLoop(MachineLoop *ML, MachineDominatorTree *DT); MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB, MachineInstr &Instr, @@ -158,7 +158,7 @@ return true; } -bool MVETPAndVPTOptimisations::RevertLoopWithCall(MachineLoop *ML) { +bool MVETPAndVPTOptimisations::MergeLoopEnd(MachineLoop *ML) { LLVM_DEBUG(dbgs() << "MergeLoopEnd on loop " << ML->getHeader()->getName() << "\n"); @@ -180,7 +180,55 @@ } } - return false; + // Remove any copies from the loop, to ensure the phi that remains is simpler. + Register PhiReg = LoopPhi->getOperand(0).getReg(); + Register DecReg = LoopDec->getOperand(0).getReg(); + Register StartReg = LoopStart->getOperand(0).getReg(); + // Ensure the uses are expected, and collect any copies we want to remove. + SmallVector Copies; + auto CheckUsers = [&Copies](Register BaseReg, ArrayRef OK, + MachineRegisterInfo *MRI) { + SmallVector Worklist; + Worklist.push_back(BaseReg); + while (!Worklist.empty()) { + Register Reg = Worklist.pop_back_val(); + for (MachineInstr &MI : MRI->use_nodbg_instructions(Reg)) { + if (count(OK, &MI)) + continue; + if (MI.getOpcode() != TargetOpcode::COPY || + !MI.getOperand(0).getReg().isVirtual()) { + LLVM_DEBUG(dbgs() << "Extra users of register found: " << MI); + return false; + } + Worklist.push_back(MI.getOperand(0).getReg()); + Copies.push_back(&MI); + } + } + return true; + }; + if (!CheckUsers(PhiReg, {LoopDec}, MRI) || + !CheckUsers(DecReg, {LoopPhi, LoopEnd}, MRI) || + !CheckUsers(StartReg, {LoopPhi}, MRI)) + return false; + + MRI->constrainRegClass(StartReg, &ARM::GPRlrRegClass); + MRI->constrainRegClass(PhiReg, &ARM::GPRlrRegClass); + MRI->constrainRegClass(DecReg, &ARM::GPRlrRegClass); + + if (LoopPhi->getOperand(2).getMBB() == ML->getLoopLatch()) { + LoopPhi->getOperand(3).setReg(StartReg); + LoopPhi->getOperand(1).setReg(DecReg); + } else { + LoopPhi->getOperand(1).setReg(StartReg); + LoopPhi->getOperand(3).setReg(DecReg); + } + + LoopDec->getOperand(1).setReg(PhiReg); + LoopEnd->getOperand(0).setReg(DecReg); + + for (auto *MI : Copies) + MI->eraseFromParent(); + return true; } // Convert t2DoLoopStart to t2DoLoopStartTP if the loop contains VCTP @@ -697,7 +745,7 @@ bool Modified = false; for (MachineLoop *ML : MLI->getBase().getLoopsInPreorder()) { - Modified |= RevertLoopWithCall(ML); + Modified |= MergeLoopEnd(ML); Modified |= ConvertTailPredLoop(ML, DT); } Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll @@ -157,10 +157,10 @@ ; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: movw lr, #1250 ; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vmov.i32 q2, #0x3 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: adds r1, r3, #4 -; CHECK-NEXT: vmov.i32 q2, #0x3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vctp.32 r3 @@ -231,65 +231,55 @@ ; CHECK-NEXT: cmp r2, #0 ; CHECK-NEXT: beq.w .LBB3_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adds r3, r2, #3 -; CHECK-NEXT: adr r7, .LCPI3_5 -; CHECK-NEXT: bic r3, r3, #3 ; CHECK-NEXT: vmov.i32 q0, #0x8000 -; CHECK-NEXT: sub.w r12, r3, #4 -; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: adr r6, .LCPI3_4 -; CHECK-NEXT: adr r5, .LCPI3_3 -; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: adr r4, .LCPI3_2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: adr r7, .LCPI3_5 ; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: adr.w r8, .LCPI3_1 -; CHECK-NEXT: adr.w r12, .LCPI3_0 -; CHECK-NEXT: adr r3, .LCPI3_6 +; CHECK-NEXT: adr r6, .LCPI3_4 +; CHECK-NEXT: adr r5, .LCPI3_3 ; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: adr r3, .LCPI3_7 +; CHECK-NEXT: adr r4, .LCPI3_2 ; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adr r6, .LCPI3_10 -; CHECK-NEXT: adr r7, .LCPI3_9 +; CHECK-NEXT: adr.w r8, .LCPI3_1 ; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r4] -; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r8] +; CHECK-NEXT: adr.w r12, .LCPI3_0 ; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: adr r3, .LCPI3_6 +; CHECK-NEXT: adr r6, .LCPI3_10 +; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vmov.i32 q0, #0x7fff +; CHECK-NEXT: adr r3, .LCPI3_7 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r7, .LCPI3_9 ; CHECK-NEXT: adr r3, .LCPI3_8 ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q1, [sp, #192] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB3_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q4, [r0, q0] +; CHECK-NEXT: vldrb.u32 q4, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q7, [r0, q0] +; CHECK-NEXT: vldrb.u32 q7, [r0, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmul.i32 q6, q7, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrbt.u32 q1, [r0, q5] +; CHECK-NEXT: vldrb.u32 q1, [r0, q5] ; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vmul.i32 q3, q4, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload @@ -320,14 +310,12 @@ ; CHECK-NEXT: vadd.i32 q1, q1, q0 ; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload ; CHECK-NEXT: vshr.u32 q1, q1, #16 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrbt.32 q1, [r1, q0] +; CHECK-NEXT: vstrb.32 q1, [r1, q0] ; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vpstt -; CHECK-NEXT: vstrbt.32 q2, [r1, q0] -; CHECK-NEXT: vstrbt.32 q6, [r1, q5] +; CHECK-NEXT: vstrb.32 q2, [r1, q0] +; CHECK-NEXT: vstrb.32 q6, [r1, q5] ; CHECK-NEXT: adds r1, #12 -; CHECK-NEXT: le lr, .LBB3_2 +; CHECK-NEXT: letp lr, .LBB3_2 ; CHECK-NEXT: .LBB3_3: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #216 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -703,7 +703,7 @@ ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r2, [sp, #88] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldr r4, [sp, #72] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 @@ -726,27 +726,27 @@ ; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r9] -; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: strb.w r0, [r1, r11] +; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r0 +; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: beq .LBB5_8 ; CHECK-NEXT: .LBB5_5: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_7 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] +; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] ; CHECK-NEXT: subs r0, r2, r2 ; CHECK-NEXT: ble .LBB5_3 ; CHECK-NEXT: @ %bb.6: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB5_5 Depth=1 -; CHECK-NEXT: ldr.w r11, [sp, #88] +; CHECK-NEXT: ldr.w r9, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: dlstp.16 lr, r11 +; CHECK-NEXT: dlstp.16 lr, r9 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: mla r3, r11, r9, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 @@ -891,7 +891,7 @@ ; CHECK-NEXT: beq .LBB6_8 ; CHECK-NEXT: @ %bb.2: @ %for.body.lr.ph ; CHECK-NEXT: ldr r2, [sp, #88] -; CHECK-NEXT: mov.w r9, #0 +; CHECK-NEXT: mov.w r11, #0 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldr r4, [sp, #72] ; CHECK-NEXT: add.w r0, r1, r2, lsl #1 @@ -907,18 +907,18 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_5 Depth 2 ; CHECK-NEXT: ldr r0, [sp, #92] -; CHECK-NEXT: ldr.w r12, [r0, r9, lsl #2] +; CHECK-NEXT: ldr.w r12, [r0, r11, lsl #2] ; CHECK-NEXT: subs r0, r2, r2 ; CHECK-NEXT: ble .LBB6_6 ; CHECK-NEXT: @ %bb.4: @ %for.body24.preheader ; CHECK-NEXT: @ in Loop: Header=BB6_3 Depth=1 -; CHECK-NEXT: ldr.w r11, [sp, #88] +; CHECK-NEXT: ldr.w r9, [sp, #88] ; CHECK-NEXT: mov r6, r12 ; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: dlstp.16 lr, r11 +; CHECK-NEXT: dlstp.16 lr, r9 ; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: mla r3, r9, r11, r1 +; CHECK-NEXT: mla r3, r11, r9, r1 ; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrd r7, r5, [sp] @ 8-byte Folded Reload ; CHECK-NEXT: mov r10, r12 @@ -950,10 +950,10 @@ ; CHECK-NEXT: ldr r1, [sp, #96] ; CHECK-NEXT: add r0, r6 ; CHECK-NEXT: add r0, r12 -; CHECK-NEXT: strb.w r0, [r1, r9] -; CHECK-NEXT: add.w r9, r9, #1 +; CHECK-NEXT: strb.w r0, [r1, r11] +; CHECK-NEXT: add.w r11, r11, #1 ; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: cmp r9, r0 +; CHECK-NEXT: cmp r11, r0 ; CHECK-NEXT: bne .LBB6_3 ; CHECK-NEXT: .LBB6_8: @ %if.end ; CHECK-NEXT: ldr r0, [sp, #96] Index: llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll +++ llvm/test/CodeGen/Thumb2/mve-pred-vctpvpsel.ll @@ -9,32 +9,21 @@ ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: vidup.u32 q2, r6, #1 -; CHECK-NEXT: cmp r1, #4 -; CHECK-NEXT: it ge -; CHECK-NEXT: movge.w r12, #4 -; CHECK-NEXT: sub.w r6, r1, r12 -; CHECK-NEXT: adds r6, #3 -; CHECK-NEXT: mov.w lr, #1 ; CHECK-NEXT: adr r4, .LCPI0_0 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w lr, lr, r6, lsr #2 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: vmov.i32 q3, #0x4 +; CHECK-NEXT: dlstp.32 lr, r1 ; CHECK-NEXT: mov r12, r1 ; CHECK-NEXT: .LBB0_1: @ %do.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: vpstttt -; CHECK-NEXT: vldrwt.u32 q4, [r0], #16 -; CHECK-NEXT: vcmpt.f32 ge, q1, q4 +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vptt.f32 ge, q1, q4 ; CHECK-NEXT: vmovt q1, q4 ; CHECK-NEXT: vmovt q0, q2 ; CHECK-NEXT: vadd.i32 q2, q2, q3 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %do.end ; CHECK-NEXT: vldr s8, .LCPI0_1 ; CHECK-NEXT: vdup.32 q3, r1