Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -613,6 +613,23 @@ return false; } + // Check that creating a [W|D]LSTP, which will define LR with an element + // count instead of iteration count, won't affect any other instructions + // than the LoopStart and LoopDec. + // TODO: We should try to insert the [W|D]LSTP after any of the other uses. + if (StartInsertPt == Start && Start->getOperand(0).getReg() == ARM::LR) { + if (auto *IterCount = RDA.getMIOperand(Start, 0)) { + SmallPtrSet Uses; + RDA.getGlobalUses(IterCount, ARM::LR, Uses); + for (auto *Use : Uses) { + if (Use != Start && Use != Dec) { + LLVM_DEBUG(dbgs() << " ARM Loops: Found LR use: " << *Use); + return false; + } + } + } + } + // For tail predication, we need to provide the number of elements, instead // of the iteration count, to the loop start instruction. The number of // elements is provided to the vctp instruction, so we need to check that Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain-store.mir @@ -142,18 +142,22 @@ ; CHECK: renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: t2STRi12 killed renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 + ; CHECK: t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: liveins: $r0, $r1, $r2, $r12 ; CHECK: $lr = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg ; CHECK: renamable $r2, dead $cpsr = nsw tSUBi8 killed $r2, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.02, align 4) + ; CHECK: renamable $r12 = nsw t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.02, align 4) ; CHECK: renamable $q0 = MVE_VMULf32 killed renamable $q0, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 0, killed $noreg :: (store 16 into %ir.pDst.addr.01, align 4) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 1, killed renamable $vpr :: (store 16 into %ir.pDst.addr.01, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK: bb.2.do.end: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: @@ -242,19 +246,23 @@ ; CHECK: renamable $r2 = t2RSBrs killed renamable $lr, killed renamable $r2, 10, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $lr = t2ADDri killed renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg - ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r12 + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $lr, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: t2STRi12 renamable $lr, killed renamable $r3, 0, 14 /* CC::al */, $noreg :: (store 4 into %ir.iter.addr) ; CHECK: $r2 = tMOVr killed $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $r0, $r1, $r2 + ; CHECK: liveins: $r0, $r1, $r2, $r12 ; CHECK: $lr = tMOVr $r2, 14 /* CC::al */, $noreg + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r12, 0, $noreg ; CHECK: renamable $r2, dead $cpsr = nsw tSUBi8 killed $r2, 1, 14 /* CC::al */, $noreg - ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.02, align 4) + ; CHECK: renamable $r12 = nsw t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r0, renamable $q0 = MVE_VLDRWU32_post killed renamable $r0, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.02, align 4) ; CHECK: renamable $q0 = MVE_VMULf32 killed renamable $q0, killed renamable $q0, 0, $noreg, undef renamable $q0 - ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 0, killed $noreg :: (store 16 into %ir.pDst.addr.01, align 4) - ; CHECK: dead $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r1 = MVE_VSTRWU32_post killed renamable $q0, killed renamable $r1, 16, 1, killed renamable $vpr :: (store 16 into %ir.pDst.addr.01, align 4) + ; CHECK: dead $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK: bb.2.do.end: ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc bb.0.entry: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-after-dlstp.mir @@ -160,17 +160,20 @@ ; CHECK: renamable $r3, dead $cpsr = tSUBrr renamable $r1, killed renamable $r3, 14 /* CC::al */, $noreg ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 ; CHECK: renamable $r3, dead $cpsr = tADDi8 killed renamable $r3, 3, 14 /* CC::al */, $noreg - ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg + ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $r12, killed renamable $r3, 19, 14 /* CC::al */, $noreg, $noreg ; CHECK: $r3 = tMOVr $r1, 14 /* CC::al */, $noreg ; CHECK: $r12 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 + ; CHECK: $lr = t2DLS killed renamable $lr ; CHECK: $r4 = tMOVr $lr, 14 /* CC::al */, $noreg ; CHECK: bb.1.do.body.i: ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) - ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r4, $r12 - ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.pSrc.addr.0.i2, align 4) - ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 0, killed $noreg, killed renamable $q0 - ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r12 + ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg + ; CHECK: renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 4, implicit $vpr + ; CHECK: renamable $r12, renamable $q1 = MVE_VLDRWU32_post killed renamable $r12, 16, 1, renamable $vpr :: (load 16 from %ir.pSrc.addr.0.i2, align 4) + ; CHECK: renamable $q0 = nnan ninf nsz arcp contract afn reassoc MVE_VADDf32 killed renamable $q0, killed renamable $q1, 1, killed renamable $vpr, killed renamable $q0 + ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1 ; CHECK: bb.2.arm_mean_f32_mve.exit: ; CHECK: successors: %bb.3(0x80000000) ; CHECK: liveins: $q0, $r0, $r1, $r2, $r4 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mov-operand.ll @@ -17,13 +17,16 @@ ; CHECK-NEXT: add.w lr, r12, r3, lsr #2 ; CHECK-NEXT: mov r3, r1 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: dlstp.32 lr, r3 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: mov r4, lr ; CHECK-NEXT: .LBB0_1: @ %do.body.i ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q1, [r12], #16 -; CHECK-NEXT: vadd.f32 q0, q0, q1 -; CHECK-NEXT: letp lr, .LBB0_1 +; CHECK-NEXT: vctp.32 r3 +; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vldrwt.u32 q1, [r12], #16 +; CHECK-NEXT: vaddt.f32 q0, q0, q1 +; CHECK-NEXT: le lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %arm_mean_f32_mve.exit ; CHECK-NEXT: vmov s4, r1 ; CHECK-NEXT: vadd.f32 s0, s3, s3