diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -394,6 +394,17 @@
     } else if (I->isReturn()) {
       // Returns can't be analyzed, but we should run cleanup.
       CantAnalyze = true;
+    } else if (I->getOpcode() == ARM::t2LoopEnd &&
+               MBB.getParent()
+                   ->getSubtarget<ARMSubtarget>()
+                   .enableMachinePipeliner()) {
+      if (!Cond.empty())
+        return true;
+      FBB = TBB;
+      TBB = I->getOperand(1).getMBB();
+      Cond.push_back(MachineOperand::CreateImm(I->getOpcode()));
+      Cond.push_back(I->getOperand(0));
+      Cond.push_back(MachineOperand::CreateImm(0)); // So it becomes 3 operands
     } else {
       // We encountered other unrecognized terminator. Bail out immediately.
       return true;
@@ -457,7 +468,7 @@
     return 0;
 
   if (!isUncondBranchOpcode(I->getOpcode()) &&
-      !isCondBranchOpcode(I->getOpcode()))
+      !isCondBranchOpcode(I->getOpcode()) && I->getOpcode() != ARM::t2LoopEnd)
     return 0;
 
   // Remove the branch.
@@ -467,7 +478,7 @@
   if (I == MBB.begin())
     return 1;
   --I;
-  if (!isCondBranchOpcode(I->getOpcode()))
+  if (!isCondBranchOpcode(I->getOpcode()) && I->getOpcode() != ARM::t2LoopEnd)
     return 1;
 
   // Remove the branch.
@@ -491,8 +502,8 @@
 
   // Shouldn't be a fall through.
   assert(TBB && "insertBranch must not be told to insert a fallthrough");
-  assert((Cond.size() == 2 || Cond.size() == 0) &&
-         "ARM branch conditions have two components!");
+  assert((Cond.size() == 2 || Cond.size() == 0 || Cond.size() == 3) &&
+         "ARM branch conditions have two or three components!");
 
   // For conditional branches, we use addOperand to preserve CPSR flags.
@@ -502,19 +513,24 @@
         BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB).add(predOps(ARMCC::AL));
       else
         BuildMI(&MBB, DL, get(BOpc)).addMBB(TBB);
-    } else
+    } else if (Cond.size() == 2) {
       BuildMI(&MBB, DL, get(BccOpc))
           .addMBB(TBB)
          .addImm(Cond[0].getImm())
          .add(Cond[1]);
+    } else
+      BuildMI(&MBB, DL, get(Cond[0].getImm())).add(Cond[1]).addMBB(TBB);
     return 1;
   }
 
   // Two-way conditional branch.
-  BuildMI(&MBB, DL, get(BccOpc))
-      .addMBB(TBB)
-      .addImm(Cond[0].getImm())
-      .add(Cond[1]);
+  if (Cond.size() == 2)
+    BuildMI(&MBB, DL, get(BccOpc))
+        .addMBB(TBB)
+        .addImm(Cond[0].getImm())
+        .add(Cond[1]);
+  else if (Cond.size() == 3)
+    BuildMI(&MBB, DL, get(Cond[0].getImm())).add(Cond[1]).addMBB(TBB);
   if (isThumb)
     BuildMI(&MBB, DL, get(BOpc)).addMBB(FBB).add(predOps(ARMCC::AL));
   else
@@ -524,9 +540,12 @@
 
 bool ARMBaseInstrInfo::
 reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
-  ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
-  Cond[0].setImm(ARMCC::getOppositeCondition(CC));
-  return false;
+  if (Cond.size() == 2) {
+    ARMCC::CondCodes CC = (ARMCC::CondCodes)(int)Cond[0].getImm();
+    Cond[0].setImm(ARMCC::getOppositeCondition(CC));
+    return false;
+  }
+  return true;
 }
 
 bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
@@ -6730,9 +6749,11 @@
 
   // Meanings of the various stuff with loop types:
   // t2Bcc:
-  //   Loop = null -- there is no setup.
   //   EndLoop = branch at end of original BB that will become a kernel
   //   LoopCount = CC setter live into branch
+  // t2LoopEnd:
+  //   EndLoop = branch at end of original BB
+  //   LoopCount = t2LoopDec
 public:
   ARMPipelinerLoopInfo(MachineInstr *EndLoop, MachineInstr *LoopCount)
       : EndLoop(EndLoop), LoopCount(LoopCount),
@@ -6755,6 +6776,25 @@
         TII->reverseBranchCondition(Cond);
       }
       return {};
+    } else if (EndLoop->getOpcode() == ARM::t2LoopEnd) {
+      // General case just lets the unrolled t2LoopDec do the subtraction and
+      // therefore just needs to check if zero has been reached.
+      MachineInstr *LoopDec = nullptr;
+      for (auto &I : MBB.instrs())
+        if (I.getOpcode() == ARM::t2LoopDec)
+          LoopDec = &I;
+      assert(LoopDec && "Unable to find copied LoopDec");
+      // Check if we're done with the loop.
+      MachineInstr *NewCmp =
+          BuildMI(&MBB, LoopDec->getDebugLoc(), TII->get(ARM::t2CMPri))
+              .addReg(LoopDec->getOperand(0).getReg())
+              .addImm(0)
+              .addImm(ARMCC::AL)
+              .addReg(ARM::NoRegister);
+      (void)NewCmp;
+      Cond.push_back(MachineOperand::CreateImm(ARMCC::EQ));
+      Cond.push_back(MachineOperand::CreateReg(ARM::CPSR, false));
+      return {};
     } else
       llvm_unreachable("Unknown EndLoop");
   }
@@ -6793,5 +6833,35 @@
     // that pipeline will work
   }
 
+  // Recognize:
+  //   preheader:
+  //     %1 = t2DoLoopStart %0
+  //   loop:
+  //     %2 = phi %1, <preheader>, %..., %loop
+  //     %3 = t2LoopDec %2, <imm>
+  //     t2LoopEnd %3, %loop
+
+  if (I != LoopBB->end() && I->getOpcode() == ARM::t2LoopEnd) {
+    for (auto &L : LoopBB->instrs())
+      if (L.isCall())
+        return nullptr;
+      else if (L.getOpcode() == ARM::MVE_VCTP8 ||
+               L.getOpcode() == ARM::MVE_VCTP16 ||
+               L.getOpcode() == ARM::MVE_VCTP32 ||
+               L.getOpcode() == ARM::MVE_VCTP64)
+        return nullptr;
+    Register LoopDecResult = I->getOperand(0).getReg();
+    MachineRegisterInfo &MRI = LoopBB->getParent()->getRegInfo();
+    MachineInstr *LoopDec = MRI.getUniqueVRegDef(LoopDecResult);
+    if (LoopDec->getOpcode() != ARM::t2LoopDec)
+      return nullptr;
+    MachineInstr *LoopStart = nullptr;
+    for (auto &J : Preheader->instrs())
+      if (J.getOpcode() == ARM::t2DoLoopStart)
+        LoopStart = &J;
+    if (!LoopStart)
+      return nullptr;
+    return std::make_unique<ARMPipelinerLoopInfo>(&*I, LoopDec);
+  }
   return nullptr;
 }
diff --git a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
--- a/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
+++ b/llvm/lib/Target/ARM/MVETPAndVPTOptimisationsPass.cpp
@@ -404,6 +404,17 @@
     LoopPhi->getOperand(3).setReg(DecReg);
   }
 
+  SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
+  MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+  if (!TII->analyzeBranch(*(LoopEnd->getParent()), TBB, FBB, Cond) && !FBB) {
+    // If the LoopEnd falls through, need to insert a t2B to the fall-through
+    // block so that the non-analyzable t2LoopEndDec doesn't fall through.
+    MachineFunction::iterator MBBI = ++LoopEnd->getParent()->getIterator();
+    BuildMI(LoopEnd->getParent(), DebugLoc(), TII->get(ARM::t2B))
+        .addMBB(&*MBBI)
+        .add(predOps(ARMCC::AL));
+  }
+
   // Replace the loop dec and loop end as a single instruction.
   MachineInstrBuilder MI =
       BuildMI(*LoopEnd->getParent(), *LoopEnd, LoopEnd->getDebugLoc(),
diff --git a/llvm/test/CodeGen/Thumb2/swp-fixedii-le.mir b/llvm/test/CodeGen/Thumb2/swp-fixedii-le.mir
new file mode 100644
--- /dev/null
+++ b/llvm/test/CodeGen/Thumb2/swp-fixedii-le.mir
@@ -0,0 +1,186 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# NOTE: cortex-m7 is used to provide scheduling information despite not having LE instructions
+# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m7 -run-pass=pipeliner -o - %s | FileCheck %s --check-prefix=CHECK
+
+--- |
+  define hidden float @dot(float* nocapture noundef readonly %a, float* nocapture noundef readonly %b, i32 noundef %sz) local_unnamed_addr #0 {
+  entry:
+    %cmp8 = icmp sgt i32 %sz, 0
+    br i1 %cmp8, label %for.body.preheader, label %for.end
+
+  for.body.preheader:                               ; preds = %entry
+    %scevgep = getelementptr float, float* %b, i32 -1
+    %scevgep4 = getelementptr float, float* %a, i32 -1
+    br label %for.body
+
+  for.body:                                         ; preds = %for.body.preheader, %for.body
+    %lsr.iv5 = phi float* [ %scevgep4, %for.body.preheader ], [ %scevgep6, %for.body ]
+    %lsr.iv1 = phi float* [ %scevgep, %for.body.preheader ], [ %scevgep2, %for.body ]
+    %lsr.iv = phi i32 [ %sz, %for.body.preheader ], [ %lsr.iv.next, %for.body ]
+    %sum.010 = phi float [ %add, %for.body ], [ 0.000000e+00, %for.body.preheader ]
+    %scevgep7 = getelementptr float, float* %lsr.iv5, i32 1
+    %0 = load float, float* %scevgep7, align 4
+    %scevgep3 = getelementptr float, float* %lsr.iv1, i32 1
+    %1 = load float, float* %scevgep3, align 4
+    %mul = fmul fast float %1, %0
+    %add = fadd fast float %mul, %sum.010
+    %lsr.iv.next = add i32 %lsr.iv, -1
+    %scevgep2 = getelementptr float, float* %lsr.iv1, i32 1
+    %scevgep6 = getelementptr float, float* %lsr.iv5, i32 1
+    %exitcond.not = icmp eq i32 %lsr.iv.next, 0
+    br i1 %exitcond.not, label %for.end, label %for.body, !llvm.loop !0
+
+  for.end:                                          ; preds = %for.body, %entry
+    %sum.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %add, %for.body ]
+    ret float %sum.0.lcssa
+  }
+
+  !0 = distinct !{!0, !1, !2, !3}
+  !1 = !{!"llvm.loop.mustprogress"}
+  !2 = !{!"llvm.loop.unroll.disable"}
+  !3 = !{!"llvm.loop.pipeline.initiationinterval", i32 3}
+
+...
+---
+name: dot
+alignment: 2
+tracksRegLiveness: true
+constants:
+  - id: 0
+    value: 'float 0.000000e+00'
+    alignment: 4
+    isTargetSpecific: false
+body: |
+  ; CHECK-LABEL: name: dot
+  ; CHECK: bb.0.entry:
+  ; CHECK-NEXT:   successors: %bb.2(0x50000000), %bb.1(0x30000000)
+  ; CHECK-NEXT:   liveins: $r0, $r1, $r2
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[COPY:%[0-9]+]]:gprlr = COPY $r2
+  ; CHECK-NEXT:   [[COPY1:%[0-9]+]]:gprnopc = COPY $r1
+  ; CHECK-NEXT:   [[COPY2:%[0-9]+]]:gprnopc = COPY $r0
+  ; CHECK-NEXT:   t2CMPri [[COPY]], 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.2, 10 /* CC::ge */, $cpsr
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.1:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[VLDRS:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.2.for.body.preheader:
+  ; CHECK-NEXT:   successors: %bb.5(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[t2SUBri:%[0-9]+]]:rgpr = t2SUBri [[COPY1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY3:%[0-9]+]]:gprnopc = COPY [[t2SUBri]]
+  ; CHECK-NEXT:   [[t2SUBri1:%[0-9]+]]:rgpr = t2SUBri [[COPY2]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS1:%[0-9]+]]:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+  ; CHECK-NEXT:   [[t2DoLoopStart:%[0-9]+]]:gprlr = t2DoLoopStart [[COPY]]
+  ; CHECK-NEXT:   [[COPY4:%[0-9]+]]:gprnopc = COPY [[t2SUBri1]]
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.5.for.body:
+  ; CHECK-NEXT:   successors: %bb.6(0x80000000), %bb.7(0x00000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[t2ADDri:%[0-9]+]]:rgpr = t2ADDri [[COPY4]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS2:%[0-9]+]]:spr = VLDRS [[COPY4]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+  ; CHECK-NEXT:   [[t2ADDri1:%[0-9]+]]:rgpr = t2ADDri [[COPY3]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[VLDRS3:%[0-9]+]]:spr = VLDRS [[COPY3]], 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+  ; CHECK-NEXT:   [[VMULS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS3]], [[VLDRS2]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[COPY5:%[0-9]+]]:gprlr = COPY [[t2DoLoopStart]]
+  ; CHECK-NEXT:   [[t2LoopDec:%[0-9]+]]:gprlr = t2LoopDec [[COPY5]], 1
+  ; CHECK-NEXT:   [[COPY6:%[0-9]+]]:gpr = COPY [[t2LoopDec]]
+  ; CHECK-NEXT:   [[COPY7:%[0-9]+]]:gpr = COPY [[t2ADDri1]]
+  ; CHECK-NEXT:   [[COPY8:%[0-9]+]]:gpr = COPY [[t2ADDri]]
+  ; CHECK-NEXT:   t2CMPri [[t2LoopDec]], 0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+  ; CHECK-NEXT:   t2Bcc %bb.7, 0 /* CC::eq */, $cpsr
+  ; CHECK-NEXT:   t2B %bb.6, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.6.for.body:
+  ; CHECK-NEXT:   successors: %bb.7(0x04000000), %bb.6(0x7c000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[PHI:%[0-9]+]]:gprnopc = PHI [[COPY8]], %bb.5, %43, %bb.6
+  ; CHECK-NEXT:   [[PHI1:%[0-9]+]]:gprnopc = PHI [[COPY7]], %bb.5, %44, %bb.6
+  ; CHECK-NEXT:   [[PHI2:%[0-9]+]]:gpr = PHI [[COPY6]], %bb.5, %47, %bb.6
+  ; CHECK-NEXT:   [[PHI3:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, %46, %bb.6
+  ; CHECK-NEXT:   [[PHI4:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, %45, %bb.6
+  ; CHECK-NEXT:   [[VLDRS4:%[0-9]+]]:spr = VLDRS [[PHI1]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep3, align 4)
+  ; CHECK-NEXT:   [[VLDRS5:%[0-9]+]]:spr = VLDRS [[PHI]], 1, 14 /* CC::al */, $noreg :: (load unknown-size from %ir.scevgep7, align 4)
+  ; CHECK-NEXT:   [[COPY9:%[0-9]+]]:gprlr = COPY [[PHI2]]
+  ; CHECK-NEXT:   [[t2LoopDec1:%[0-9]+]]:gprlr = t2LoopDec [[COPY9]], 1
+  ; CHECK-NEXT:   [[t2ADDri2:%[0-9]+]]:rgpr = t2ADDri [[PHI]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[t2ADDri3:%[0-9]+]]:rgpr = t2ADDri [[PHI1]], 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK-NEXT:   [[COPY10:%[0-9]+]]:gpr = COPY [[t2ADDri2]]
+  ; CHECK-NEXT:   [[COPY11:%[0-9]+]]:gpr = COPY [[t2ADDri3]]
+  ; CHECK-NEXT:   [[VMULS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VMULS [[VLDRS4]], [[VLDRS5]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[VADDS:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI4]], [[PHI3]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   [[COPY12:%[0-9]+]]:gpr = COPY [[t2LoopDec1]]
+  ; CHECK-NEXT:   t2LoopEnd [[t2LoopDec1]], %bb.6, implicit-def $cpsr
+  ; CHECK-NEXT:   t2B %bb.7, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.7:
+  ; CHECK-NEXT:   successors: %bb.4(0x80000000)
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT:   [[PHI5:%[0-9]+]]:spr = PHI [[VLDRS1]], %bb.5, [[VADDS]], %bb.6
+  ; CHECK-NEXT:   [[PHI6:%[0-9]+]]:spr = PHI [[VMULS]], %bb.5, [[VMULS1]], %bb.6
+  ; CHECK-NEXT:   [[VADDS1:%[0-9]+]]:spr = nnan ninf nsz arcp contract afn reassoc VADDS [[PHI6]], [[PHI5]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   t2B %bb.4, 14 /* CC::al */, $noreg
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: bb.4.for.end:
+  ; CHECK-NEXT:   [[PHI7:%[0-9]+]]:spr = PHI [[VLDRS]], %bb.1, [[VADDS1]], %bb.7
+  ; CHECK-NEXT:   [[VMOVRS:%[0-9]+]]:gpr = VMOVRS [[PHI7]], 14 /* CC::al */, $noreg
+  ; CHECK-NEXT:   $r0 = COPY [[VMOVRS]]
+  ; CHECK-NEXT:   tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+  bb.0.entry:
+    successors: %bb.1(0x50000000), %bb.4(0x30000000)
+    liveins: $r0, $r1, $r2
+
+    %13:gprlr = COPY $r2
+    %12:gprnopc = COPY $r1
+    %11:gprnopc = COPY $r0
+    t2CMPri %13, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    t2Bcc %bb.1, 10 /* CC::ge */, $cpsr
+
+  bb.4:
+    successors: %bb.3(0x80000000)
+
+    %14:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.1.for.body.preheader:
+    successors: %bb.2(0x80000000)
+
+    %16:rgpr = t2SUBri %12, 4, 14 /* CC::al */, $noreg, $noreg
+    %0:gpr = COPY %16
+    %17:rgpr = t2SUBri %11, 4, 14 /* CC::al */, $noreg, $noreg
+    %15:spr = VLDRS %const.0, 0, 14 /* CC::al */, $noreg :: (load (s32) from constant-pool)
+    %44:gprlr = t2DoLoopStart %13
+    %1:gpr = COPY %17
+
+  bb.2.for.body:
+    successors: %bb.3(0x04000000), %bb.2(0x7c000000)
+
+    %2:gprnopc = PHI %1, %bb.1, %9, %bb.2
+    %3:gprnopc = PHI %0, %bb.1, %8, %bb.2
+    %4:gpr = PHI %44, %bb.1, %7, %bb.2
+    %5:spr = PHI %15, %bb.1, %6, %bb.2
+    %18:rgpr = t2ADDri %2, 4, 14 /* CC::al */, $noreg, $noreg
+    %19:spr = VLDRS %2, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep7)
+    %20:rgpr = t2ADDri %3, 4, 14 /* CC::al */, $noreg, $noreg
+    %21:spr = VLDRS %3, 1, 14 /* CC::al */, $noreg :: (load (s32) from %ir.scevgep3)
+    %22:spr = nnan ninf nsz arcp contract afn reassoc VMULS killed %21, killed %19, 14 /* CC::al */, $noreg
+    %6:spr = nnan ninf nsz arcp contract afn reassoc VADDS killed %22, %5, 14 /* CC::al */, $noreg
+    %42:gprlr = COPY %4
+    %23:gprlr = t2LoopDec %42:gprlr, 1
+    %7:gpr = COPY %23
+    %8:gpr = COPY %20
+    %9:gpr = COPY %18
+    t2LoopEnd %23:gprlr, %bb.2, implicit-def dead $cpsr
+    t2B %bb.3, 14 /* CC::al */, $noreg
+
+  bb.3.for.end:
+    %10:spr = PHI %14, %bb.4, %6, %bb.2
+    %24:gpr = VMOVRS %10, 14 /* CC::al */, $noreg
+    $r0 = COPY %24
+    tBX_RET 14 /* CC::al */, $noreg, implicit $r0
+
+...