Index: lib/Target/Hexagon/HexagonHardwareLoops.cpp =================================================================== --- lib/Target/Hexagon/HexagonHardwareLoops.cpp +++ lib/Target/Hexagon/HexagonHardwareLoops.cpp @@ -159,7 +159,7 @@ MachineOperand *InitialValue, const MachineOperand *Endvalue, int64_t IVBump) const; - + /// \brief Analyze the statements in a loop to determine if the loop /// has a computable trip count and, if so, return a value that represents /// the trip count expression. @@ -179,15 +179,16 @@ /// \brief Return true if the instruction is not valid within a hardware /// loop. - bool isInvalidLoopOperation(const MachineInstr *MI) const; + bool isInvalidLoopOperation(const MachineInstr *MI, + bool IsInnerHWLoop) const; /// \brief Return true if the loop contains an instruction that inhibits /// using the hardware loop. - bool containsInvalidInstruction(MachineLoop *L) const; + bool containsInvalidInstruction(MachineLoop *L, bool IsInnerHWLoop) const; /// \brief Given a loop, check if we can convert it to a hardware loop. /// If so, then perform the conversion and return true. - bool convertToHardwareLoop(MachineLoop *L); + bool convertToHardwareLoop(MachineLoop *L, bool &L0used, bool &L1used); /// \brief Return true if the instruction is now dead. bool isDead(const MachineInstr *MI, @@ -307,18 +308,10 @@ INITIALIZE_PASS_END(HexagonHardwareLoops, "hwloops", "Hexagon Hardware Loops", false, false) - -/// \brief Returns true if the instruction is a hardware loop instruction. -static bool isHardwareLoop(const MachineInstr *MI) { - return MI->getOpcode() == Hexagon::J2_loop0r || - MI->getOpcode() == Hexagon::J2_loop0i; -} - FunctionPass *llvm::createHexagonHardwareLoops() { return new HexagonHardwareLoops(); } - bool HexagonHardwareLoops::runOnMachineFunction(MachineFunction &MF) { DEBUG(dbgs() << "********* Hexagon Hardware Loops *********\n"); @@ -329,12 +322,12 @@ MDT = &getAnalysis(); TII = MF.getSubtarget().getInstrInfo(); - for (MachineLoopInfo::iterator I = MLI->begin(), E = MLI->end(); - I != E; ++I) { - MachineLoop *L = *I; - if (!L->getParentLoop()) - Changed |= convertToHardwareLoop(L); - } + for (auto &L : *MLI) + if (!L->getParentLoop()) { + bool L0Used = false; + bool L1Used = false; + Changed |= convertToHardwareLoop(L, L0Used, L1Used); + } return Changed; } @@ -467,27 +460,27 @@ case Hexagon::C2_cmpeqi: case Hexagon::C2_cmpeq: case Hexagon::C2_cmpeqp: - Cmp = Comparison::Kind::EQ; + Cmp = Comparison::EQ; break; case Hexagon::C4_cmpneq: case Hexagon::C4_cmpneqi: - Cmp = Comparison::Kind::NE; + Cmp = Comparison::NE; break; case Hexagon::C4_cmplte: - Cmp = Comparison::Kind::LEs; + Cmp = Comparison::LEs; break; case Hexagon::C4_cmplteu: - Cmp = Comparison::Kind::LEu; + Cmp = Comparison::LEu; break; case Hexagon::C2_cmpgtui: case Hexagon::C2_cmpgtu: case Hexagon::C2_cmpgtup: - Cmp = Comparison::Kind::GTu; + Cmp = Comparison::GTu; break; case Hexagon::C2_cmpgti: case Hexagon::C2_cmpgt: case Hexagon::C2_cmpgtp: - Cmp = Comparison::Kind::GTs; + Cmp = Comparison::GTs; break; default: return (Comparison::Kind)0; @@ -749,7 +742,7 @@ MachineBasicBlock::iterator InsertPos = PH->getFirstTerminator(); DebugLoc DL; if (InsertPos != PH->end()) - InsertPos->getDebugLoc(); + DL = InsertPos->getDebugLoc(); // If Start is an immediate and End is a register, the trip count // will be "reg - imm". Hexagon's "subtract immediate" instruction @@ -828,7 +821,7 @@ const MCInstrDesc &SubD = RegToReg ? TII->get(Hexagon::A2_sub) : (RegToImm ? TII->get(Hexagon::A2_subri) : TII->get(Hexagon::A2_addi)); - if (RegToReg || RegToImm) { + if (RegToReg || RegToImm) { unsigned SubR = MRI->createVirtualRegister(IntRC); MachineInstrBuilder SubIB = BuildMI(*PH, InsertPos, DL, SubD, SubR); @@ -902,51 +895,50 @@ return new CountValue(CountValue::CV_Register, CountR, CountSR); } - /// \brief Return true if the operation is invalid within hardware loop. -bool HexagonHardwareLoops::isInvalidLoopOperation( - const MachineInstr *MI) const { +bool HexagonHardwareLoops::isInvalidLoopOperation(const MachineInstr *MI, + bool IsInnerHWLoop) const { // Call is not allowed because the callee may use a hardware loop except for // the case when the call never returns. if (MI->getDesc().isCall() && MI->getOpcode() != Hexagon::CALLv3nr) return true; - // do not allow nested hardware loops - if (isHardwareLoop(MI)) - return true; - - // check if the instruction defines a hardware loop register + // Check if the instruction defines a hardware loop register. for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); if (!MO.isReg() || !MO.isDef()) continue; unsigned R = MO.getReg(); - if (R == Hexagon::LC0 || R == Hexagon::LC1 || - R == Hexagon::SA0 || R == Hexagon::SA1) + if (IsInnerHWLoop && (R == Hexagon::LC0 || R == Hexagon::SA0 || + R == Hexagon::LC1 || R == Hexagon::SA1)) + return true; + if (!IsInnerHWLoop && (R == Hexagon::LC1 || R == Hexagon::SA1)) return true; } return false; } - -/// \brief - Return true if the loop contains an instruction that inhibits -/// the use of the hardware loop function. -bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L) const { +/// \brief Return true if the loop contains an instruction that inhibits +/// the use of the hardware loop instruction. +bool HexagonHardwareLoops::containsInvalidInstruction(MachineLoop *L, + bool IsInnerHWLoop) const { const std::vector &Blocks = L->getBlocks(); + DEBUG(dbgs() << "\nhw_loop head, BB#" << Blocks[0]->getNumber();); for (unsigned i = 0, e = Blocks.size(); i != e; ++i) { MachineBasicBlock *MBB = Blocks[i]; for (MachineBasicBlock::iterator MII = MBB->begin(), E = MBB->end(); MII != E; ++MII) { const MachineInstr *MI = &*MII; - if (isInvalidLoopOperation(MI)) + if (isInvalidLoopOperation(MI, IsInnerHWLoop)) { + DEBUG(dbgs()<< "\nCannot convert to hw_loop due to:"; MI->dump();); return true; + } } } return false; } - /// \brief Returns true if the instruction is dead. This was essentially /// copied from DeadMachineInstructionElim::isDead, but with special cases /// for inline asm, physical registers and instructions with side effects @@ -1041,19 +1033,47 @@ /// /// The code makes several assumptions about the representation of the loop /// in llvm. -bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L) { +bool HexagonHardwareLoops::convertToHardwareLoop(MachineLoop *L, + bool &RecL0used, + bool &RecL1used) { // This is just for sanity. assert(L->getHeader() && "Loop without a header?"); bool Changed = false; + bool L0Used = false; + bool L1Used = false; + // Process nested loops first. - for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) - Changed |= convertToHardwareLoop(*I); + for (MachineLoop::iterator I = L->begin(), E = L->end(); I != E; ++I) { + Changed |= convertToHardwareLoop(*I, RecL0used, RecL1used); + L0Used |= RecL0used; + L1Used |= RecL1used; + } // If a nested loop has been converted, then we can't convert this loop. - if (Changed) + if (Changed && L0Used && L1Used) return Changed; + unsigned LOOP_i; + unsigned LOOP_r; + unsigned ENDLOOP; + + // Flag used to track loopN instruction: + // 1 - Hardware loop is being generated for the inner most loop. + // 0 - Hardware loop is being generated for the outer loop. + unsigned IsInnerHWLoop = 1; + + if (L0Used) { + LOOP_i = Hexagon::J2_loop1i; + LOOP_r = Hexagon::J2_loop1r; + ENDLOOP = Hexagon::ENDLOOP1; + IsInnerHWLoop = 0; + } else { + LOOP_i = Hexagon::J2_loop0i; + LOOP_r = Hexagon::J2_loop0r; + ENDLOOP = Hexagon::ENDLOOP0; + } + #ifndef NDEBUG // Stop trying after reaching the limit (if any). int Limit = HWLoopLimit; @@ -1065,10 +1085,10 @@ #endif // Does the loop contain any invalid instructions? - if (containsInvalidInstruction(L)) + if (containsInvalidInstruction(L, IsInnerHWLoop)) return false; - MachineBasicBlock *LastMBB = L->getExitingBlock(); + MachineBasicBlock *LastMBB = getExitingBlock(L); // Don't generate hw loop if the loop has more than one exit. if (!LastMBB) return false; @@ -1141,8 +1161,7 @@ BuildMI(*Preheader, InsertPos, DL, TII->get(TargetOpcode::COPY), CountReg) .addReg(TripCount->getReg(), 0, TripCount->getSubReg()); // Add the Loop instruction to the beginning of the loop. - BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0r)) - .addMBB(LoopStart) + BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)).addMBB(LoopStart) .addReg(CountReg); } else { assert(TripCount->isImm() && "Expecting immediate value for trip count"); @@ -1150,14 +1169,14 @@ // if the immediate fits in the instructions. Otherwise, we need to // create a new virtual register. int64_t CountImm = TripCount->getImm(); - if (!TII->isValidOffset(Hexagon::J2_loop0i, CountImm)) { + if (!TII->isValidOffset(LOOP_i, CountImm)) { unsigned CountReg = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass); BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::A2_tfrsi), CountReg) .addImm(CountImm); - BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0r)) + BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_r)) .addMBB(LoopStart).addReg(CountReg); } else - BuildMI(*Preheader, InsertPos, DL, TII->get(Hexagon::J2_loop0i)) + BuildMI(*Preheader, InsertPos, DL, TII->get(LOOP_i)) .addMBB(LoopStart).addImm(CountImm); } @@ -1171,8 +1190,7 @@ // Replace the loop branch with an endloop instruction. DebugLoc LastIDL = LastI->getDebugLoc(); - BuildMI(*LastMBB, LastI, LastIDL, - TII->get(Hexagon::ENDLOOP0)).addMBB(LoopStart); + BuildMI(*LastMBB, LastI, LastIDL, TII->get(ENDLOOP)).addMBB(LoopStart); // The loop ends with either: // - a conditional branch followed by an unconditional branch, or @@ -1200,6 +1218,15 @@ removeIfDead(OldInsts[i]); ++NumHWLoops; + + // Set RecL1used and RecL0used only after hardware loop has been + // successfully generated. Doing it earlier can cause wrong loop instruction + // to be used. + if (L0Used) // Loop0 was already used. So, the correct loop must be loop1. + RecL1used = true; + else + RecL0used = true; + return true; } @@ -1533,7 +1560,7 @@ if (Header->pred_size() > 2) { // Ensure that the header has only two predecessors: the preheader and // the loop latch. Any additional predecessors of the header should - // join at the newly created preheader. Inspect all PHI nodes from the + // join at the newly created preheader. Inspect all PHI nodes from the // header and create appropriate corresponding PHI nodes in the preheader. for (instr_iterator I = Header->instr_begin(), E = Header->instr_end(); Index: test/CodeGen/Hexagon/hwloop-loop1.ll =================================================================== --- /dev/null +++ test/CodeGen/Hexagon/hwloop-loop1.ll @@ -0,0 +1,68 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv5 < %s | FileCheck %s +; +; Generate loop1 instruction for double loop sequence. + +; CHECK: loop0(.LBB{{.}}_{{.}}, #100) +; CHECK: endloop0 +; CHECK: loop1(.LBB{{.}}_{{.}}, #100) +; CHECK: loop0(.LBB{{.}}_{{.}}, #100) +; CHECK: endloop0 +; CHECK: endloop1 + +define i32 @main() #0 { +entry: + %array = alloca [100 x i32], align 8 + %doublearray = alloca [100 x [100 x i32]], align 8 + %0 = bitcast [100 x i32]* %array to i8* + call void @llvm.lifetime.start(i64 400, i8* %0) #1 + %1 = bitcast [100 x [100 x i32]]* %doublearray to i8* + call void @llvm.lifetime.start(i64 40000, i8* %1) #1 + %arrayidx1 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %doublearray, i32 0, i32 10, i32 10 + %arrayidx2.gep = getelementptr [100 x i32], [100 x i32]* %array, i32 0, i32 0 + br label %for.body + +for.body: + %2 = phi i32 [ undef, %entry ], [ %.pre, %for.body.for.body_crit_edge ] + %sum.031 = phi i32 [ undef, %entry ], [ %add, %for.body.for.body_crit_edge ] + %arrayidx2.phi = phi i32* [ %arrayidx2.gep, %entry ], [ %arrayidx2.inc, %for.body.for.body_crit_edge ] + %i.030 = phi i32 [ 1, %entry ], [ %phitmp, %for.body.for.body_crit_edge ] + %add = add nsw i32 %2, %sum.031 + %exitcond33 = icmp eq i32 %i.030, 100 + %arrayidx2.inc = getelementptr i32, i32* %arrayidx2.phi, i32 1 + br i1 %exitcond33, label %for.cond7.preheader.preheader, label %for.body.for.body_crit_edge + +for.cond7.preheader.preheader: + br label %for.cond7.preheader + +for.body.for.body_crit_edge: + %.pre = load i32, i32* %arrayidx2.inc, align 4 + %phitmp = add i32 %i.030, 1 + br label %for.body + +for.cond7.preheader: + %i.129 = phi i32 [ %inc16, %for.inc15 ], [ 0, %for.cond7.preheader.preheader ] + br label %for.body9 + +for.body9: + %j.028 = phi i32 [ 0, %for.cond7.preheader ], [ %inc13, %for.body9 ] + %arrayidx11 = getelementptr inbounds [100 x [100 x i32]], [100 x [100 x i32]]* %doublearray, i32 0, i32 %i.129, i32 %j.028 + store i32 %add, i32* %arrayidx11, align 4 + %inc13 = add nsw i32 %j.028, 1 + %exitcond = icmp eq i32 %inc13, 100 + br i1 %exitcond, label %for.inc15, label %for.body9 + +for.inc15: + %inc16 = add nsw i32 %i.129, 1 + %exitcond32 = icmp eq i32 %inc16, 100 + br i1 %exitcond32, label %for.end17, label %for.cond7.preheader + +for.end17: + %3 = load i32, i32* %arrayidx1, align 8 + call void @llvm.lifetime.end(i64 40000, i8* %1) #1 + call void @llvm.lifetime.end(i64 400, i8* %0) #1 + ret i32 %3 +} + +declare void @llvm.lifetime.start(i64, i8* nocapture) #1 + +declare void @llvm.lifetime.end(i64, i8* nocapture) #1