Index: lib/CodeGen/HardwareLoops.cpp =================================================================== --- lib/CodeGen/HardwareLoops.cpp +++ lib/CodeGen/HardwareLoops.cpp @@ -294,6 +294,7 @@ // Check that the icmp is checking for equality of Count and zero and that // a non-zero value results in entering the loop. auto ICmp = cast(BI->getCondition()); + LLVM_DEBUG(dbgs() << " - Found condition: " << *ICmp << "\n"); if (!ICmp->isEquality()) return false; Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -2998,6 +2998,16 @@ // Other cases are autogenerated. break; } + case ARMISD::WLS: { + SDValue Ops[] = { N->getOperand(1), // Loop count + N->getOperand(2), // Exit target + N->getOperand(0) }; + SDNode *LoopStart = + CurDAG->getMachineNode(ARM::t2WhileLoopStart, dl, MVT::Other, Ops); + ReplaceUses(N, LoopStart); + CurDAG->RemoveDeadNode(N); + return; + } case ARMISD::BRCOND: { // Pattern: (ARMbrcond:void (bb:Other):$dst, (imm:i32):$cc) // Emits: (Bcc:void (bb:Other):$dst, (imm:i32):$cc) Index: lib/Target/ARM/ARMISelLowering.h =================================================================== --- lib/Target/ARM/ARMISelLowering.h +++ lib/Target/ARM/ARMISelLowering.h @@ -121,6 +121,8 @@ WIN__CHKSTK, // Windows' __chkstk call to do stack probing. WIN__DBZCHK, // Windows' divide by zero check + WLS, // Low-overhead loops, While Loop Start + VCEQ, // Vector compare equal. VCEQZ, // Vector compare equal to zero. VCGE, // Vector compare greater than or equal. Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -597,6 +597,10 @@ if (Subtarget->hasMVEIntegerOps()) addMVEVectorTypes(Subtarget->hasMVEFloatOps()); + // Combine low-overhead loop intrinsics so that we can lower i1 types. 
+  if (Subtarget->hasLOB())
+    setTargetDAGCombine(ISD::BRCOND);
+
   if (Subtarget->hasNEON()) {
     addDRTypeForNEON(MVT::v2f32);
     addDRTypeForNEON(MVT::v8i8);
@@ -1497,6 +1501,7 @@
   case ARMISD::VST2LN_UPD:    return "ARMISD::VST2LN_UPD";
   case ARMISD::VST3LN_UPD:    return "ARMISD::VST3LN_UPD";
   case ARMISD::VST4LN_UPD:    return "ARMISD::VST4LN_UPD";
+  case ARMISD::WLS:           return "ARMISD::WLS";
   }
   return nullptr;
 }
@@ -12794,6 +12799,42 @@
   return V;
 }
 
+/// Look for (brcond (xor|setcc test.set.loop.iterations, 1), Exit) and turn it
+/// into ARMISD::WLS(Chain, Elements, Exit); returns SDValue() when no match.
+static SDValue PerformHWLoopCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI,
+                                    const ARMSubtarget *ST) {
+  SDValue CC = N->getOperand(1);
+  if (CC->getOpcode() != ISD::XOR && CC->getOpcode() != ISD::SETCC)
+    return SDValue();
+
+  if (CC->getOperand(0)->getOpcode() != ISD::INTRINSIC_W_CHAIN)
+    return SDValue();
+
+  SDValue Int = CC->getOperand(0);
+  unsigned IntOp = cast<ConstantSDNode>(Int.getOperand(1))->getZExtValue();
+  if (IntOp != Intrinsic::test_set_loop_iterations)
+    return SDValue();
+
+  // isa<> first: a failed dyn_cast yields null and must not be dereferenced.
+  assert(isa<ConstantSDNode>(CC->getOperand(1)) &&
+         cast<ConstantSDNode>(CC->getOperand(1))->isOne() &&
+         "Expected to compare against 1");
+
+  SDLoc dl(Int);
+  SDValue Chain = N->getOperand(0);
+  SDValue Elements = Int.getOperand(2);
+  SDValue ExitBlock = N->getOperand(2);
+
+  // TODO: Once we start supporting tail predication, we can add another
+  // operand to WLS for the number of elements processed in a vector loop.
+  SDValue Ops[] = { Chain, Elements, ExitBlock };
+  SDValue Res = DCI.DAG.getNode(ARMISD::WLS, dl, MVT::Other, Ops);
+  // Users of the intrinsic's chain result are redirected to its input chain.
+  DCI.DAG.ReplaceAllUsesOfValueWith(Int.getValue(1), Int.getOperand(0));
+  return Res;
+}
+
 /// PerformBRCONDCombine - Target-specific DAG combining for ARMISD::BRCOND.
SDValue ARMTargetLowering::PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const { @@ -13025,6 +13066,7 @@ case ISD::OR: return PerformORCombine(N, DCI, Subtarget); case ISD::XOR: return PerformXORCombine(N, DCI, Subtarget); case ISD::AND: return PerformANDCombine(N, DCI, Subtarget); + case ISD::BRCOND: return PerformHWLoopCombine(N, DCI, Subtarget); case ARMISD::ADDC: case ARMISD::SUBC: return PerformAddcSubcCombine(N, DCI, Subtarget); case ARMISD::SUBE: return PerformAddeSubeCombine(N, DCI, Subtarget); Index: lib/Target/ARM/ARMInstrInfo.td =================================================================== --- lib/Target/ARM/ARMInstrInfo.td +++ lib/Target/ARM/ARMInstrInfo.td @@ -99,6 +99,11 @@ SDTCisSameAs<0, 4>, SDTCisSameAs<0, 5>]>; +// TODO Add another operand for 'Size' so that we can re-use this node when we +// start supporting *TP versions. +def SDT_ARMWhileLoop : SDTypeProfile<0, 2, [SDTCisVT<0, i32>, + SDTCisVT<1, OtherVT>]>; + def ARMSmlald : SDNode<"ARMISD::SMLALD", SDT_LongMac>; def ARMSmlaldx : SDNode<"ARMISD::SMLALDX", SDT_LongMac>; def ARMSmlsld : SDNode<"ARMISD::SMLSLD", SDT_LongMac>; @@ -233,6 +238,9 @@ def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>; def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; +def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMWhileLoop, + [SDNPHasChain]>; + //===----------------------------------------------------------------------===// // ARM Flag Definitions. 
Index: lib/Target/ARM/ARMInstrThumb2.td =================================================================== --- lib/Target/ARM/ARMInstrThumb2.td +++ lib/Target/ARM/ARMInstrThumb2.td @@ -5216,11 +5216,19 @@ t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size), 4, IIC_Br, []>, Sched<[WriteBr]>; -let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in +let isBranch = 1, isTerminator = 1, hasSideEffects = 1 in { +def t2WhileLoopStart : + t2PseudoInst<(outs), + (ins rGPR:$elts, brtarget:$target), + 4, IIC_Br, []>, + Sched<[WriteBr]>; + def t2LoopEnd : t2PseudoInst<(outs), (ins GPRlr:$elts, brtarget:$target), 8, IIC_Br, []>, Sched<[WriteBr]>; +} // end isBranch, isTerminator, hasSideEffects + } // end isNotDuplicable class CS opcode, list pattern=[]> Index: lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- lib/Target/ARM/ARMLowOverheadLoops.cpp +++ lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -105,15 +105,20 @@ LLVM_DEBUG(dbgs() << "ARM Loops: Processing " << *ML); auto IsLoopStart = [](MachineInstr &MI) { - return MI.getOpcode() == ARM::t2DoLoopStart; + return MI.getOpcode() == ARM::t2DoLoopStart || + MI.getOpcode() == ARM::t2WhileLoopStart; }; - auto SearchForStart = - [&IsLoopStart](MachineBasicBlock *MBB) -> MachineInstr* { + // Search the given block for a loop start instruction. If one isn't found, + // and there's only one predecessor block, search that one too. + std::function SearchForStart = + [&IsLoopStart, &SearchForStart](MachineBasicBlock *MBB) -> MachineInstr* { for (auto &MI : *MBB) { if (IsLoopStart(MI)) return &MI; } + if (MBB->pred_size() == 1) + return SearchForStart(*MBB->pred_begin()); return nullptr; }; @@ -122,8 +127,28 @@ MachineInstr *End = nullptr; bool Revert = false; - if (auto *Preheader = ML->getLoopPreheader()) + // Search the preheader for the start intrinsic, or look through the + // predecessors of the header to find exactly one set.iterations intrinsic. 
+ // FIXME: I don't see why we shouldn't be supporting multiple predecessors + // with potentially multiple set.loop.iterations, so we need to enable this. + if (auto *Preheader = ML->getLoopPreheader()) { Start = SearchForStart(Preheader); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" + << " - Performing manual predecessor search.\n"); + MachineBasicBlock *Pred = nullptr; + for (auto *MBB : ML->getHeader()->predecessors()) { + if (!ML->contains(MBB)) { + if (Pred) { + LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); + Start = nullptr; + break; + } + Pred = MBB; + Start = SearchForStart(MBB); + } + } + } // Find the low-overhead loop components and decide whether or not to fall // back to a normal loop. @@ -158,12 +183,11 @@ break; } - if (Start || Dec || End) { - if (!Start || !Dec || !End) - report_fatal_error("Failed to find all loop components"); - } else { + if (!Start && !Dec && !End) { LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); return Changed; + } if (!(Start && Dec && End)) { + report_fatal_error("Failed to find all loop components"); } if (!End->getOperand(1).isMBB() || @@ -212,15 +236,21 @@ break; } + unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? + ARM::t2DLS : ARM::t2WLS; MachineInstrBuilder MIB = - BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(ARM::t2DLS)); - if (InsertPt != Start) - InsertPt->eraseFromParent(); + BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); MIB.addDef(ARM::LR); MIB.add(Start->getOperand(0)); - LLVM_DEBUG(dbgs() << "ARM Loops: Inserted DLS: " << *MIB); + if (Opc == ARM::t2WLS) + MIB.add(Start->getOperand(1)); + + if (InsertPt != Start) + InsertPt->eraseFromParent(); Start->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); + return &*MIB; }; // Combine the LoopDec and LoopEnd instructions into LE(TP). 
@@ -234,20 +264,9 @@ MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - // If there is a branch after loop end, which branches to the fallthrough - // block, remove the branch. - MachineBasicBlock *Latch = End->getParent(); - MachineInstr *Terminator = &Latch->instr_back(); - if (End != Terminator) { - MachineBasicBlock *Exit = ML->getExitBlock(); - if (Latch->isLayoutSuccessor(Exit)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop exit branch: " - << *Terminator); - Terminator->eraseFromParent(); - } - } End->eraseFromParent(); Dec->eraseFromParent(); + return &*MIB; }; // Generate a subs, or sub and cmp, and a branch instead of an LE. @@ -282,12 +301,31 @@ Dec->eraseFromParent(); }; + // TODO: We should be able to automatically remove these branches before we + // get here - probably by teaching analyzeBranch about the pseudo + // instructions. + // If there is an unconditional branch, after I, that just branches to the + // next block, remove it. 
+ auto RemoveDeadBranch = [](MachineInstr *I) { + MachineBasicBlock *BB = I->getParent(); + MachineInstr *Terminator = &BB->instr_back(); + if (Terminator->isUnconditionalBranch() && I != Terminator) { + MachineBasicBlock *Succ = Terminator->getOperand(0).getMBB(); + if (BB->isLayoutSuccessor(Succ)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing branch: " << *Terminator); + Terminator->eraseFromParent(); + } + } + }; + if (Revert) { Start->eraseFromParent(); ExpandBranch(Dec, End); } else { - ExpandLoopStart(ML, Start); - ExpandLoopEnd(ML, Dec, End); + Start = ExpandLoopStart(ML, Start); + RemoveDeadBranch(Start); + End = ExpandLoopEnd(ML, Dec, End); + RemoveDeadBranch(End); } } Index: lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- lib/Target/ARM/ARMTargetTransformInfo.cpp +++ lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -806,6 +806,7 @@ default: break; case Intrinsic::set_loop_iterations: + case Intrinsic::test_set_loop_iterations: case Intrinsic::loop_decrement: case Intrinsic::loop_decrement_reg: return true; @@ -841,6 +842,7 @@ LLVMContext &C = L->getHeader()->getContext(); HWLoopInfo.CounterInReg = true; HWLoopInfo.IsNestingLegal = false; + HWLoopInfo.PerformEntryTest = true; HWLoopInfo.CountType = Type::getInt32Ty(C); HWLoopInfo.LoopDecrement = ConstantInt::get(HWLoopInfo.CountType, 1); return true; Index: test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll =================================================================== --- /dev/null +++ test/CodeGen/Thumb2/LowOverheadLoops/loop-guards.ll @@ -0,0 +1,213 @@ +; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s +; RUN: llc -mtriple=thumbv8.1m.main -disable-arm-loloops=false -mattr=+lob -stop-after=arm-low-overhead-loops --verify-machineinstrs %s -o - | FileCheck %s --check-prefix=CHECK-GLOBAL + +; Not implemented as a mir test so that 
changes to the generic HardwareLoops pass can +; also be tested. These functions have been taken from +; Transforms/HardwareLoops/loop-guards.ll in which can be seen the generation +; of a few test.set intrinsics, but only one (ne_trip_count) gets generated +; here. Simplifications result in icmps changing and maybe also the CFG. So, +; TODO: Teach the HardwareLoops pass some better pattern recognition. + +; CHECK-GLOBAL-NOT: DoLoopStart +; CHECK-GLOBAL-NOT: WhileLoopStart +; CHECK-GLOBAL-NOT: LoopEnd + +; CHECK: ne_and_guard +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tBcc %bb.3 +; CHECK: bb.1.while.body.preheader: +; CHECK: $lr = t2DLS renamable $lr +; CHECK: bb.2.while.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +define void @ne_and_guard(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + %cmp6 = icmp ne i32 %N, 0 + %or.cond = and i1 %brmerge.demorgan, %cmp6 + br i1 %or.cond, label %while.body, label %if.end + +while.body: ; preds = %while.body, %entry + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %entry ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %entry ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %entry + ret void +} + +; TODO: This could generate WLS +; CHECK: ne_preheader +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tBcc %bb.3 +; CHECK: bb.1.while.body.preheader: +; CHECK: $lr = t2DLS renamable $lr +; CHECK: bb.2.while.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +define void 
@ne_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %while.body, label %if.end + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; TODO: This could generate WLS +; CHECK: eq_preheader +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tBcc %bb.3 +; CHECK: bb.1.while.body.preheader: +; CHECK: $lr = t2DLS renamable $lr +; CHECK: bb.2.while.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +define void @eq_preheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %cmp = icmp eq i32 %N, 0 + br i1 %cmp, label %if.end, label %while.body + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 
+ %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; TODO: This could generate WLS +; CHECK: ne_prepreheader +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: t2CMPri renamable $lr, 0 +; CHECK: tBcc %bb.3 +; CHECK: bb.1.while.body.preheader: +; CHECK: $lr = t2DLS renamable $lr +; CHECK: bb.2.while.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +define void @ne_prepreheader(i1 zeroext %t1, i1 zeroext %t2, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %while.preheader, label %if.end + +while.preheader: ; preds = %entry + %brmerge.demorgan = and i1 %t1, %t2 + br i1 %brmerge.demorgan, label %while.body, label %if.end + +while.body: ; preds = %while.body, %while.preheader + %i.09 = phi i32 [ %inc, %while.body ], [ 0, %while.preheader ] + %a.addr.08 = phi i32* [ %incdec.ptr3, %while.body ], [ %a, %while.preheader ] + %b.addr.07 = phi i32* [ %incdec.ptr, %while.body ], [ %b, %while.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.07, i32 1 + %tmp = load i32, i32* %b.addr.07, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.08, i32 1 + store i32 %tmp, i32* %a.addr.08, align 4 + %inc = add nuw i32 %i.09, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %if.end, label %while.body + +if.end: ; preds = %while.body, %while.preheader, %entry + ret void +} + +; CHECK: be_ne +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: $lr = t2DLS renamable $lr +; CHECK: bb.1.do.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.1 +define void @be_ne(i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + %cmp = icmp ne i32 %N, 0 + %sub = sub i32 %N, 1 + %be = select i1 
%cmp, i32 0, i32 %sub + %cmp.1 = icmp ne i32 %be, 0 + br i1 %cmp.1, label %do.body, label %if.end + +do.body: ; preds = %do.body, %entry + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %entry ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %entry ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %entry ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp.2 = icmp ult i32 %inc, %N + br i1 %cmp.2, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} + +; TODO: Remove the tMOVr in the preheader! +; CHECK: ne_trip_count +; CHECK: body: +; CHECK: bb.0.entry: +; CHECK: $lr = t2WLS $r3, %bb.3 +; CHECK: bb.1.do.body.preheader: +; CHECK: $lr = tMOVr +; CHECK: bb.2.do.body: +; CHECK: $lr = t2LEUpdate renamable $lr, %bb.2 +define void @ne_trip_count(i1 zeroext %t1, i32* nocapture %a, i32* nocapture readonly %b, i32 %N) { +entry: + br label %do.body.preheader + +do.body.preheader: + %cmp = icmp ne i32 %N, 0 + br i1 %cmp, label %do.body, label %if.end + +do.body: + %b.addr.0 = phi i32* [ %incdec.ptr, %do.body ], [ %b, %do.body.preheader ] + %a.addr.0 = phi i32* [ %incdec.ptr3, %do.body ], [ %a, %do.body.preheader ] + %i.0 = phi i32 [ %inc, %do.body ], [ 0, %do.body.preheader ] + %incdec.ptr = getelementptr inbounds i32, i32* %b.addr.0, i32 1 + %tmp = load i32, i32* %b.addr.0, align 4 + %incdec.ptr3 = getelementptr inbounds i32, i32* %a.addr.0, i32 1 + store i32 %tmp, i32* %a.addr.0, align 4 + %inc = add nuw i32 %i.0, 1 + %cmp.1 = icmp ult i32 %inc, %N + br i1 %cmp.1, label %do.body, label %if.end + +if.end: ; preds = %do.body, %entry + ret void +} Index: test/Transforms/HardwareLoops/ARM/do-rem.ll =================================================================== --- test/Transforms/HardwareLoops/ARM/do-rem.ll +++ 
test/Transforms/HardwareLoops/ARM/do-rem.ll @@ -3,10 +3,14 @@ @g = common local_unnamed_addr global i32* null, align 4 ; CHECK-LABEL: do_with_i32_urem +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end + ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) ; CHECK-NEXT: br label %while.body +; CHECK: while.body: ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 @@ -38,10 +42,14 @@ } ; CHECK-LABEL: do_with_i32_srem +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end + ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) ; CHECK-NEXT: br label %while.body +; CHECK: while.body: ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 @@ -73,10 +81,14 @@ } ; CHECK-LABEL: do_with_i32_udiv +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end + ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) ; CHECK-NEXT: br label %while.body +; CHECK: while.body: ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 @@ -108,10 +120,14 @@ } ; CHECK-LABEL: 
do_with_i32_sdiv +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) +; CHECK: br i1 [[TEST]], label %while.body.preheader, label %while.end + ; CHECK: while.body.preheader: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) ; CHECK-NEXT: br label %while.body +; CHECK: while.body: ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.preheader ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) ; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 @@ -143,7 +159,7 @@ } ; CHECK-LABEL: do_with_i64_urem -; CHECK-NOT: llvm.set.loop.iterations +; CHECK-NOT: llvm.{{.*}}.loop.iterations ; CHECK-NOT: llvm.loop.decrement define i64 @do_with_i64_urem(i32 %n) { entry: @@ -172,7 +188,7 @@ } ; CHECK-LABEL: do_with_i64_srem -; CHECK-NOT: llvm.set.loop.iterations +; CHECK-NOT: llvm.{{.*}}.loop.iterations ; CHECK-NOT: llvm.loop.decrement define i64 @do_with_i64_srem(i32 %n) { entry: @@ -201,7 +217,7 @@ } ; CHECK-LABEL: do_with_i64_udiv -; CHECK-NOT: llvm.set.loop.iterations +; CHECK-NOT: llvm.{{.*}}.loop.iterations ; CHECK-NOT: llvm.loop.decrement define i64 @do_with_i64_udiv(i32 %n) { entry: @@ -230,7 +246,7 @@ } ; CHECK-LABEL: do_with_i64_sdiv -; CHECK-NOT: call void @llvm.set.loop.iterations +; CHECK-NOT: call void @llvm.{{.*}}.loop.iterations ; CHECK-NOT: call i32 @llvm.loop.decrement define i64 @do_with_i64_sdiv(i32 %n) { entry: Index: test/Transforms/HardwareLoops/ARM/fp-emulation.ll =================================================================== --- test/Transforms/HardwareLoops/ARM/fp-emulation.ll +++ test/Transforms/HardwareLoops/ARM/fp-emulation.ll @@ -2,9 +2,13 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+soft-float -hardware-loops -disable-arm-loloops=false %s -S -o - | FileCheck %s --check-prefix=CHECK --check-prefix=CHECK-SOFT ; CHECK-LABEL: test_fptosi -; CHECK: while.body.lr.ph: +; CHECK-SOFT-NOT: call void 
@llvm.set.loop.iterations + +; CHECK: entry: ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 + +; CHECK: while.body.lr.ph: ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-FP-NEXT: br label %while.body @@ -13,8 +17,6 @@ ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 ; CHECK-FP: br i1 [[CMP]], label %while.body, label %cleanup.loopexit -; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations - define void @test_fptosi(i32 %n, i32** %g, double** %d) { entry: %n.off = add i32 %n, -1 @@ -53,9 +55,10 @@ } ; CHECK-LABEL: test_fptoui -; CHECK-FP: while.body.lr.ph: +; CHECK: entry: ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 +; CHECK-FP: while.body.lr.ph: ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-FP-NEXT: br label %while.body @@ -104,10 +107,11 @@ } ; CHECK-LABEL: load_store_float +; CHECK: entry: +; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 +; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 ; CHECK: while.body.lr.ph: -; CHECK: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 -; CHECK: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) ; CHECK-NEXT: br label %while.body ; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %if.end4 ] @@ -152,12 +156,11 @@ } ; CHECK-LABEL: fp_add -; CHECK: while.body.lr.ph: - ; CHECK-SOFT-NOT: call void @llvm.set.loop.iterations - +; CHECK: entry: ; CHECK-FP: [[CMP:%[^ ]+]] = icmp ugt i32 %n, 1 ; CHECK-FP: [[COUNT:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 1 +; CHECK: while.body.lr.ph: ; CHECK-FP: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) ; CHECK: br label %while.body Index: test/Transforms/HardwareLoops/ARM/simple-do.ll 
=================================================================== --- test/Transforms/HardwareLoops/ARM/simple-do.ll +++ test/Transforms/HardwareLoops/ARM/simple-do.ll @@ -3,7 +3,7 @@ ; RUN: opt -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=-lob -hardware-loops %s -S -o - | FileCheck %s --check-prefix=DISABLED ; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -disable-arm-loloops=false %s -o - | FileCheck %s --check-prefix=CHECK-LLC -; DISABLED-NOT: llvm.set.loop.iterations +; DISABLED-NOT: llvm.{{.*}}.loop.iterations ; DISABLED-NOT: llvm.loop.decrement @g = common local_unnamed_addr global i32* null, align 4 @@ -46,9 +46,12 @@ } ; CHECK-LABEL: do_inc1 +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) +; CHECK: br i1 [[TEST]], label %while.body.lr.ph, label %while.end + ; CHECK: while.body.lr.ph: -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %n) -; CHECK-NEXT: br label %while.body +; CHECK: br label %while.body ; CHECK: [[REM:%[^ ]+]] = phi i32 [ %n, %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) @@ -56,12 +59,12 @@ ; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit ; CHECK-LLC-LABEL:do_inc1: -; CHECK-LLC: dls lr, +; CHECK-LLC: wls lr, {{.*}}, [[LOOP_EXIT:\.LBB[0-9_]+]] ; CHECK-LLC-NOT: mov lr, ; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9_]+]]: ; CHECK-LLC: le lr, [[LOOP_HEADER]] ; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9_]+]] -; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]: +; CHECK-LLC: [[LOOP_EXIT]]: define i32 @do_inc1(i32 %n) { entry: @@ -91,26 +94,26 @@ } ; CHECK-LABEL: do_inc2 -; CHECK: while.body.lr.ph: +; CHECK: entry: ; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, -1 ; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[ROUND]], 1 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1 -; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) -; CHECK-NEXT: br label %while.body -; CHECK: [[REM:%[^ ]+]] = phi i32 [ 
[[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] -; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) -; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 -; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit +; CHECK: while.body.lr.ph: +; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) +; CHECK: br label %while.body +; CHECK: while.body: +; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] +; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %while.body, label %while.end.loopexit ; CHECK-LLC: do_inc2: ; CHECK-LLC-NOT: mov lr, -; CHECK-LLC: dls lr, +; CHECK-LLC: dls lr, {{.*}} ; CHECK-LLC-NOT: mov lr, ; CHECK-LLC: [[LOOP_HEADER:\.LBB[0-9._]+]]: ; CHECK-LLC: le lr, [[LOOP_HEADER]] -; CHECK-LLC-NOT: b [[LOOP_EXIT:\.LBB[0-9._]+]] -; CHECK-LLC: [[LOOP_EXIT:\.LBB[0-9_]+]]: define i32 @do_inc2(i32 %n) { entry: @@ -141,15 +144,17 @@ ; CHECK-LABEL: do_dec2 -; CHECK: while.body.lr.ph: +; CHECK: entry: ; CHECK: [[ROUND:%[^ ]+]] = add i32 %n, 1 ; CHECK: [[CMP:%[^ ]+]] = icmp slt i32 %n, 2 ; CHECK: [[SMIN:%[^ ]+]] = select i1 [[CMP]], i32 %n, i32 2 ; CHECK: [[SUB:%[^ ]+]] = sub i32 [[ROUND]], [[SMIN]] ; CHECK: [[HALVE:%[^ ]+]] = lshr i32 [[SUB]], 1 ; CHECK: [[COUNT:%[^ ]+]] = add nuw i32 [[HALVE]], 1 + +; CHECK: while.body.lr.ph: ; CHECK: call void @llvm.set.loop.iterations.i32(i32 [[COUNT]]) -; CHECK-NEXT: br label %while.body +; CHECK: br label %while.body ; CHECK: [[REM:%[^ ]+]] = phi i32 [ [[COUNT]], %while.body.lr.ph ], [ [[LOOP_DEC:%[^ ]+]], %while.body ] ; CHECK: [[LOOP_DEC]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 [[REM]], i32 1) @@ -158,12 +163,11 @@ ; CHECK-LLC: do_dec2 ; CHECK-LLC-NOT: mov lr, -; CHECK-LLC: dls lr, +; CHECK-LLC: dls lr, {{.*}} ; CHECK-LLC-NOT: mov lr, ; CHECK-LLC: 
[[LOOP_HEADER:\.LBB[0-9_]+]]: ; CHECK-LLC: le lr, [[LOOP_HEADER]] ; CHECK-LLC-NOT: b . -; CHECK-LLC: @ %while.end define i32 @do_dec2(i32 %n) { entry: %cmp6 = icmp sgt i32 %n, 0 Index: test/Transforms/HardwareLoops/ARM/structure.ll =================================================================== --- test/Transforms/HardwareLoops/ARM/structure.ll +++ test/Transforms/HardwareLoops/ARM/structure.ll @@ -109,6 +109,35 @@ ret i32 0 } +; CHECK-LABEL: pre_existing_test_set +; CHECK: call i1 @llvm.test.set.loop.iterations +; CHECK-NOT: llvm.set{{.*}}.loop.iterations +; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) +; CHECK-NOT: call i32 @llvm.loop.decrement.reg +define i32 @pre_existing_test_set(i32 %n, i32* nocapture %p, i32* nocapture readonly %q) { +entry: + %guard = call i1 @llvm.test.set.loop.iterations.i32(i32 %n) + br i1 %guard, label %while.preheader, label %while.end + +while.preheader: + br label %while.body + +while.body: ; preds = %while.body, %entry + %q.addr.05 = phi i32* [ %incdec.ptr, %while.body ], [ %q, %while.preheader ] + %p.addr.04 = phi i32* [ %incdec.ptr1, %while.body ], [ %p, %while.preheader ] + %0 = phi i32 [ %n, %while.preheader ], [ %2, %while.body ] + %incdec.ptr = getelementptr inbounds i32, i32* %q.addr.05, i32 1 + %1 = load i32, i32* %q.addr.05, align 4 + %incdec.ptr1 = getelementptr inbounds i32, i32* %p.addr.04, i32 1 + store i32 %1, i32* %p.addr.04, align 4 + %2 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %0, i32 1) + %3 = icmp ne i32 %2, 0 + br i1 %3, label %while.body, label %while.end + +while.end: ; preds = %while.body + ret i32 0 +} + ; CHECK-LABEL: pre_existing_inner ; CHECK-NOT: llvm.set.loop.iterations ; CHECK: while.cond1.preheader.us: @@ -223,14 +252,16 @@ } ; CHECK-LABEL: search +; CHECK: entry: +; CHECK: [[TEST:%[^ ]+]] = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) +; CHECK: br i1 [[TEST]], label %for.body.preheader, label %for.cond.cleanup ; CHECK: for.body.preheader: -; CHECK: call void 
@llvm.set.loop.iterations.i32(i32 %N) -; CHECK: br label %for.body +; CHECK: br label %for.body ; CHECK: for.body: ; CHECK: for.inc: -; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32 -; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 -; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup +; CHECK: [[LOOP_DEC:%[^ ]+]] = call i32 @llvm.loop.decrement.reg.i32.i32.i32 +; CHECK: [[CMP:%[^ ]+]] = icmp ne i32 [[LOOP_DEC]], 0 +; CHECK: br i1 [[CMP]], label %for.body, label %for.cond.cleanup define i32 @search(i8* nocapture readonly %c, i32 %N) { entry: %cmp11 = icmp eq i32 %N, 0 @@ -276,16 +307,16 @@ ; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32( ; TODO: We should be able to support the unrolled loop body. -; CHECK-UNROLL-LABEL: unroll_inc_int: +; CHECK-UNROLL-LABEL: unroll_inc_int ; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader ; CHECK-UNROLL-NOT: dls ; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body ; CHECK-UNROLL-NOT: le lr, [[LOOP]] ; CHECK-UNROLL: bne [[LOOP]] -; CHECK-UNROLL: %for.body.epil.preheader -; CHECK-UNROLL: dls -; CHECK-UNROLL: %for.body.epil -; CHECK-UNROLL: le +; CHECK-UNROLL: wls lr, lr, [[EXIT:.LBB[0-9_]+]] +; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]: +; CHECK-UNROLL: le lr, [[EPIL]] +; CHECK-UNROLL-NEXT: [[EXIT]] define void @unroll_inc_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: @@ -310,24 +341,27 @@ } ; CHECK-LABEL: unroll_inc_unsigned -; CHECK: call void @llvm.set.loop.iterations.i32(i32 %N) +; CHECK: call i1 @llvm.test.set.loop.iterations.i32(i32 %N) ; CHECK: call i32 @llvm.loop.decrement.reg.i32.i32.i32( ; CHECK-LLC-LABEL: unroll_inc_unsigned: -; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]] -; CHECK-LLC: le lr +; CHECK-LLC: wls lr, r3, [[EXIT:.LBB[0-9_]+]] +; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[HEADER]] +; CHECK-LLC-NEXT: [[EXIT]]: ; TODO: We should be able to support the unrolled loop body. 
-; CHECK-UNROLL-LABEL: unroll_inc_unsigned: +; CHECK-UNROLL-LABEL: unroll_inc_unsigned ; CHECK-UNROLL: [[PREHEADER:.LBB[0-9_]+]]: @ %for.body.preheader ; CHECK-UNROLL-NOT: dls ; CHECK-UNROLL: [[LOOP:.LBB[0-9_]+]]: @ %for.body ; CHECK-UNROLL-NOT: le lr, [[LOOP]] ; CHECK-UNROLL: bne [[LOOP]] -; CHECK-UNROLL: %for.body.epil.preheader -; CHECK-UNROLL: dls -; CHECK-UNROLL: %for.body.epil -; CHECK-UNROLL: le +; CHECK-UNROLL: wls lr, lr, [[EPIL_EXIT:.LBB[0-9_]+]] +; CHECK-UNROLL: [[EPIL:.LBB[0-9_]+]]: +; CHECK-UNROLL: le lr, [[EPIL]] +; CHECK-UNROLL: [[EPIL_EXIT]]: +; CHECK-UNROLL: pop define void @unroll_inc_unsigned(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: %cmp8 = icmp eq i32 %N, 0 @@ -357,15 +391,21 @@ ; TODO: An unnecessary register is being held to hold COUNT, lr should just ; be used instead. ; CHECK-LLC-LABEL: unroll_dec_int: -; CHECK-LLC: dls lr, [[COUNT:r[0-9]+]] -; CHECK-LLC: subs [[COUNT]], #1 -; CHECK-LLC: le lr - -; CHECK-UNROLL-LABEL: unroll_dec_int -; CHECK-UNROLL: dls lr -; CHECK-UNROLL: le lr -; CHECK-UNROLL: dls lr -; CHECK-UNROLL: le lr +; CHECK-LLC: dls lr, r3 +; CHECK-LLC-NOT: mov lr, r3 +; CHECK-LLC: [[HEADER:.LBB[0-9_]+]]: +; CHECK-LLC: le lr, [[HEADER]] + +; CHECK-UNROLL-LABEL: unroll_dec_int: +; CHECK-UNROLL: wls lr, {{.*}}, [[PROLOGUE_EXIT:.LBB[0-9_]+]] +; CHECK-UNROLL-NEXT: [[PROLOGUE:.LBB[0-9_]+]]: +; CHECK-UNROLL: le lr, [[PROLOGUE]] +; CHECK-UNROLL-NEXT: [[PROLOGUE_EXIT]]: +; CHECK-UNROLL: dls lr, lr +; CHECK-UNROLL: [[BODY:.LBB[0-9_]+]]: +; CHECK-UNROLL: le lr, [[BODY]] +; CHECK-UNROLL-NOT: b +; CHECK-UNROLL: pop define void @unroll_dec_int(i32* nocapture %a, i32* nocapture readonly %b, i32* nocapture readonly %c, i32 %N) { entry: %cmp8 = icmp sgt i32 %N, 0 @@ -389,5 +429,6 @@ } declare void @llvm.set.loop.iterations.i32(i32) #0 +declare i1 @llvm.test.set.loop.iterations.i32(i32) #0 declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #0 Index: 
test/Transforms/HardwareLoops/ARM/while.mir =================================================================== --- /dev/null +++ test/Transforms/HardwareLoops/ARM/while.mir @@ -0,0 +1,131 @@ +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+lob %s -run-pass=arm-low-overhead-loops --verify-machineinstrs -o - | FileCheck %s + +# TODO: Remove the redundant $lr = tMOVr in the preheader; t2WLS already defines $lr. +# CHECK: body: +# CHECK: $lr = t2WLS $r2, [[EXIT:%bb[.0-9]+]] +# CHECK: [[PREHEADER:bb[.0-9a-z]+]]: +# CHECK: $lr = tMOVr killed $r2 +# CHECK: [[BODY:bb[.0-9a-z]+]]: +# CHECK: $lr = t2LEUpdate renamable $lr + +--- | + target datalayout = "e-m:e-p:32:32-Fi8-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8.1m.main-arm-unknown" + + ; Function Attrs: norecurse nounwind optsize + define dso_local arm_aapcscc void @copy(i16* nocapture %a, i16* nocapture readonly %b, i32 %N) { + entry: + %cmp4 = icmp eq i32 %N, 0 + %0 = call i1 @llvm.test.set.loop.iterations.i32(i32 %N) + br i1 %0, label %while.body.preheader, label %while.end + + while.body.preheader: ; preds = %entry + br label %while.body + + while.body: ; preds = %while.body, %while.body.preheader + %a.addr.06 = phi i16* [ %incdec.ptr1, %while.body ], [ %a, %while.body.preheader ] + %b.addr.05 = phi i16* [ %incdec.ptr, %while.body ], [ %b, %while.body.preheader ] + %1 = phi i32 [ %N, %while.body.preheader ], [ %3, %while.body ] + %incdec.ptr = getelementptr inbounds i16, i16* %b.addr.05, i32 1 + %2 = load i16, i16* %b.addr.05, align 2, !tbaa !3 + %incdec.ptr1 = getelementptr inbounds i16, i16* %a.addr.06, i32 1 + store i16 %2, i16* %a.addr.06, align 2, !tbaa !3 + %3 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %1, i32 1) + %4 = icmp ne i32 %3, 0 + br i1 %4, label %while.body, label %while.end + + while.end: ; preds = %while.body, %entry + ret void + } + + declare i1 @llvm.test.set.loop.iterations.i32(i32) #1 + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) #1 + + attributes #1 = { noduplicate nounwind } + attributes #2 = { nounwind } + +
!llvm.module.flags = !{!0, !1} + + !0 = !{i32 1, !"wchar_size", i32 4} + !1 = !{i32 1, !"min_enum_size", i32 4} + !3 = !{!4, !4, i64 0} + !4 = !{!"short", !5, i64 0} + !5 = !{!"omnipotent char", !6, i64 0} + !6 = !{!"Simple C/C++ TBAA"} + +... +--- +name: copy +alignment: 1 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: false +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +constants: [] +machineFunctionInfo: {} +body: | + bb.0.entry: + successors: %bb.1(0x40000000), %bb.3(0x40000000) + + frame-setup tPUSH 14, $noreg, $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + $r7 = frame-setup tMOVr $sp, 14, $noreg + frame-setup CFI_INSTRUCTION def_cfa_register $r7 + t2WhileLoopStart $r2, %bb.3 + tB %bb.1, 14, $noreg + + bb.1.while.body.preheader: + 
successors: %bb.2(0x80000000) + + $lr = tMOVr killed $r2, 14, $noreg + + bb.2.while.body: + successors: %bb.2(0x7c000000), %bb.3(0x04000000) + + renamable $r2, renamable $r1 = t2LDRH_POST killed renamable $r1, 2, 14, $noreg :: (load 2 from %ir.b.addr.05, !tbaa !3) + early-clobber renamable $r0 = t2STRH_POST killed renamable $r2, killed renamable $r0, 2, 14, $noreg :: (store 2 into %ir.a.addr.06, !tbaa !3) + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.2 + tB %bb.3, 14, $noreg + + bb.3.while.end: + tPOP_RET 14, $noreg, def $r7, def $pc + +...