diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp --- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -306,6 +306,7 @@ void Expand(LowOverheadLoop &LoLoop); + void IterationCountDCE(LowOverheadLoop &LoLoop); }; } @@ -818,38 +819,100 @@ MI->eraseFromParent(); } -MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { - LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); - // When using tail-predication, try to delete the dead code that was used to - // calculate the number of loop iterations. - if (LoLoop.IsTailPredicationLegal()) { - SmallVector Killed; - SmallVector Dead; - if (auto *Def = RDA->getReachingMIDef(LoLoop.Start, - LoLoop.Start->getOperand(0).getReg())) { - SmallPtrSet Remove; - SmallPtrSet Ignore = { LoLoop.Start, LoLoop.Dec, - LoLoop.End, LoLoop.InsertPt }; - SmallVector Chain = { Def }; - while (!Chain.empty()) { - MachineInstr *MI = Chain.back(); - Chain.pop_back(); - if (TII->getPredicate(*MI) != ARMCC::AL) - continue; +// Perform dead code elimination on the loop iteration count setup expression. +// If we are tail-predicating, the number of elements to be processed is the +// operand of the VCTP instruction in the vector body, see getCount(), which is +// register $r3 in this example: +// +// $lr = big-itercount-expression +// .. +// t2DoLoopStart renamable $lr +// vector.body: +// .. +// $vpr = MVE_VCTP32 renamable $r3 +// renamable $lr = t2LoopDec killed renamable $lr, 1 +// t2LoopEnd renamable $lr, %vector.body +// tB %end +// +// What we would like to achieve here is to replace the do-loop start pseudo +// instruction t2DoLoopStart with: +// +// $lr = MVE_DLSTP_32 killed renamable $r3 +// +// Thus, $r3, which defines the number of elements, is written to $lr, +// and then we want to delete the whole chain that used to define $lr, +// see the comment below for what this chain could look like. 
+// +void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) { + if (!LoLoop.IsTailPredicationLegal()) + return; - if (RDA->isSafeToRemove(MI, Remove, Ignore)) { - for (auto &MO : MI->operands()) { - if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) - continue; - if (auto *Op = RDA->getReachingMIDef(MI, MO.getReg())) - Chain.push_back(Op); - } - Ignore.insert(MI); + if (auto *Def = RDA->getReachingMIDef(LoLoop.Start, + LoLoop.Start->getOperand(0).getReg())) { + SmallPtrSet Remove; + SmallPtrSet Ignore = { LoLoop.Start, LoLoop.Dec, + LoLoop.End, LoLoop.InsertPt }; + SmallVector Chain = { Def }; + while (!Chain.empty()) { + MachineInstr *MI = Chain.back(); + Chain.pop_back(); + + // If an instruction is conditionally executed, we assume here that this + // is an IT-block with just this single instruction in it, otherwise we + // continue and can't perform dead-code elimination on it. This will + // capture most cases, because the loop iteration count expression + // that performs a round-up to next multiple of the vector length will + // look like this: + // + // %mul = .. + // %0 = add i32 %mul, 3 + // %1 = icmp slt i32 %mul, 4 + // %smin = select i1 %1, i32 %mul, i32 4 + // %2 = sub i32 %0, %smin + // %3 = lshr i32 %2, 2 + // %4 = add nuw nsw i32 %3, 1 + // + // There can be a select instruction, checking if we need to execute only + // 1 vector iteration (in this example that means 4 elements). Thus, + // we conditionally execute one instruction to materialise the iteration + // count. The neighbour lookups below yield null at block boundaries. + MachineInstr *IT = nullptr; + if (TII->getPredicate(*MI) != ARMCC::AL) { + MachineInstr *PrevMI = MI->getPrevNode(); + MachineInstr *NextMI = MI->getNextNode(); + + if (PrevMI && PrevMI->getOpcode() == ARM::t2IT && + (!NextMI || TII->getPredicate(*NextMI) == ARMCC::AL)) + IT = PrevMI; + else + // We can't analyse IT-blocks with multiple statements. Be + // conservative here: bail out before anything is added to ToRemove, + // so no statements are removed at all. 
+ return; + } + + if (RDA->isSafeToRemove(MI, Remove, Ignore)) { + for (auto &MO : MI->operands()) { + if (!MO.isReg() || !MO.isUse() || MO.getReg() == 0) + continue; + if (auto *Op = RDA->getReachingMIDef(MI, MO.getReg())) + Chain.push_back(Op); } + Ignore.insert(MI); + + if (IT) + Remove.insert(IT); } - LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); } + LoLoop.ToRemove.insert(Remove.begin(), Remove.end()); } +} + +MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { + LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n"); + // When using tail-predication, try to delete the dead code that was used to + // calculate the number of loop iterations. + IterationCountDCE(LoLoop); MachineInstr *InsertPt = LoLoop.InsertPt; MachineInstr *Start = LoLoop.Start; diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir @@ -100,11 +100,7 @@ ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 - ; CHECK: renamable $r3, dead $cpsr = tLSLri renamable $r2, 1, 14, $noreg - ; CHECK: renamable $r12 = t2MOVi 4, 14, $noreg, $noreg - ; CHECK: tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr - ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: dead $r12 = t2LSLri killed renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: renamable $r3, dead $cpsr = tLSLri killed renamable $r2, 1, 14, $noreg ; CHECK: renamable $r2 = tLEApcrel %const.0, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 diff --git 
a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir copy from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir copy to llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-chain.mir @@ -1,8 +1,10 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s +# IT-block with 3 statements, all chained together. + --- | - define hidden arm_aapcs_vfpcc void @dont_ignore_vctp(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { + define hidden arm_aapcs_vfpcc void @it_block_2_stmts(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { entry: %mul = shl i32 %blockSize, 1 %0 = add i32 %mul, 3 @@ -43,7 +45,7 @@ ... 
--- -name: dont_ignore_vctp +name: it_block_2_stmts alignment: 16 exposesReturnsTwice: false legalized: false @@ -92,19 +94,25 @@ isTargetSpecific: false machineFunctionInfo: {} body: | - ; CHECK-LABEL: name: dont_ignore_vctp + ; CHECK-LABEL: name: it_block_2_stmts ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: liveins: $lr, $r0, $r2, $r7 ; CHECK: frame-setup tPUSH 14, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 ; CHECK: renamable $r3, dead $cpsr = tLSLri renamable $r2, 1, 14, $noreg ; CHECK: renamable $r12 = t2MOVi 4, 14, $noreg, $noreg - ; CHECK: tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr + ; CHECK: tCMPi8 killed renamable $r3, 4, 14, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: dead $r12 = t2LSLri killed renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: $r1 = t2ADDri renamable $r0, 3, 11, $noreg, $noreg, implicit $itstate + ; CHECK: $r3 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit renamable $r12, implicit $itstate + ; CHECK: $r12 = t2LSLri renamable $r3, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r2 = tLEApcrel %const.0, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 @@ 
-133,7 +141,9 @@ renamable $r12 = t2MOVi 4, 14, $noreg, $noreg tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + $r1 = t2ADDri killed renamable $r0, 3, 11, $noreg, $noreg, implicit $itstate + $r3 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit renamable $r12, implicit $itstate + $r12 = t2LSLri renamable $r3, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir copy from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir copy to llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-itercount.mir @@ -1,8 +1,11 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s +# IT-block with 2 statements, which we don't support yet, so check that we do +# not remove any of the iteration count statements. + --- | - define hidden arm_aapcs_vfpcc void @dont_ignore_vctp(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { + define hidden arm_aapcs_vfpcc void @it_block_2_stmts(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { entry: %mul = shl i32 %blockSize, 1 %0 = add i32 %mul, 3 @@ -43,7 +46,7 @@ ... 
--- -name: dont_ignore_vctp +name: it_block_2_stmts alignment: 16 exposesReturnsTwice: false legalized: false @@ -92,7 +95,7 @@ isTargetSpecific: false machineFunctionInfo: {} body: | - ; CHECK-LABEL: name: dont_ignore_vctp + ; CHECK-LABEL: name: it_block_2_stmts ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 @@ -104,7 +107,12 @@ ; CHECK: renamable $r12 = t2MOVi 4, 14, $noreg, $noreg ; CHECK: tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: dead $r12 = t2LSLri killed renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: $r12 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit killed renamable $r12, implicit $itstate + ; CHECK: $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r2 = tLEApcrel %const.0, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 @@ -133,6 +141,7 @@ renamable $r12 = t2MOVi 4, 14, $noreg, $noreg tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate + $r12 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit renamable $r12, implicit $itstate $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg renamable $r12 = t2ADDri killed 
renamable $r2, 3, 14, $noreg, $noreg diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir copy from llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir copy to llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/dont-ignore-vctp.mir +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/it-block-random.mir @@ -1,8 +1,12 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - --verify-machineinstrs | FileCheck %s +# IT-block with 2 statements, with the last instruction not connected to the +# use-def chain of the iteration counter; make sure we don't remove the +# IT block and any of its instructions. + --- | - define hidden arm_aapcs_vfpcc void @dont_ignore_vctp(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { + define hidden arm_aapcs_vfpcc void @it_block_2_stmts(float* %pSrc, float* %pDst, i32 %blockSize) local_unnamed_addr #0 { entry: %mul = shl i32 %blockSize, 1 %0 = add i32 %mul, 3 @@ -43,7 +47,7 @@ ... 
--- -name: dont_ignore_vctp +name: it_block_2_stmts alignment: 16 exposesReturnsTwice: false legalized: false @@ -92,7 +96,7 @@ isTargetSpecific: false machineFunctionInfo: {} body: | - ; CHECK-LABEL: name: dont_ignore_vctp + ; CHECK-LABEL: name: it_block_2_stmts ; CHECK: bb.0.entry: ; CHECK: successors: %bb.1(0x80000000) ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 @@ -104,7 +108,12 @@ ; CHECK: renamable $r12 = t2MOVi 4, 14, $noreg, $noreg ; CHECK: tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr ; CHECK: t2IT 11, 8, implicit-def $itstate - ; CHECK: dead $r12 = t2LSLri killed renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + ; CHECK: $r12 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit killed renamable $r12, implicit $itstate + ; CHECK: $r0 = t2ADDri killed renamable $r0, 42, 11, killed $cpsr, $noreg, implicit killed renamable $r0, implicit killed $itstate + ; CHECK: renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg + ; CHECK: renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg + ; CHECK: renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg + ; CHECK: dead renamable $lr = nuw nsw t2ADDrs killed renamable $r2, killed renamable $r12, 19, 14, $noreg, $noreg ; CHECK: renamable $r2 = tLEApcrel %const.0, 14, $noreg ; CHECK: renamable $q0 = MVE_VLDRWU32 killed renamable $r2, 0, 0, $noreg :: (load 16 from constant-pool) ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r3 @@ -133,7 +142,8 @@ renamable $r12 = t2MOVi 4, 14, $noreg, $noreg tCMPi8 renamable $r3, 4, 14, $noreg, implicit-def $cpsr t2IT 11, 8, implicit-def $itstate - $r12 = t2LSLri renamable $r2, 1, 11, killed $cpsr, $noreg, implicit killed renamable $r12, implicit killed $itstate + $r12 = t2LSLri renamable $r2, 1, 11, $cpsr, $noreg, implicit renamable $r12, implicit $itstate + $r0 = t2ADDri renamable $r0, 42, 11, killed $cpsr, $noreg, implicit killed renamable $r0, implicit killed $itstate 
renamable $r2 = t2RSBrs killed renamable $r12, killed renamable $r2, 10, 14, $noreg, $noreg renamable $r12 = t2ADDri killed renamable $r2, 3, 14, $noreg, $noreg renamable $r2, dead $cpsr = tMOVi8 1, 14, $noreg