Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -127,23 +127,25 @@ MI(I) { Predicates.insert(Preds.begin(), Preds.end()); } }; - // Represent a VPT block, a list of instructions that begins with a VPST and - // has a maximum of four proceeding instructions. All instructions within the - // block are predicated upon the vpr and we allow instructions to define the - // vpr within in the block too. + // Represent a VPT block, a list of instructions that begins with a VPT/VPST + // and has a maximum of four proceeding instructions. All instructions within + // the block are predicated upon the vpr and we allow instructions to define + // the vpr within in the block too. class VPTBlock { - std::unique_ptr VPST; + // The predicate then instruction, which is either a VPT, or a VPST + // instruction. + std::unique_ptr PredicateThen; PredicatedMI *Divergent = nullptr; SmallVector Insts; public: VPTBlock(MachineInstr *MI, SetVector &Preds) { - VPST = std::make_unique(MI, Preds); + PredicateThen = std::make_unique(MI, Preds); } void addInst(MachineInstr *MI, SetVector &Preds) { LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI); - if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) { + if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) { Divergent = &Insts.back(); LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI); } @@ -160,18 +162,27 @@ // Is the given instruction part of the predicate set controlling the entry // to the block. bool IsPredicatedOn(MachineInstr *MI) const { - return VPST->Predicates.count(MI); + return PredicateThen->Predicates.count(MI); + } + + // Returns true if this is a VPT instruction, which is a predicate-then + // instruction that performs a comparison. + bool isVPT() const { return !isVPT(); } + + // Returns true if this is a VPST instruction. + bool isVPST() const { + return PredicateThen->MI->getOpcode() == ARM::MVE_VPST; } // Is the given instruction the only predicate which controls the entry to // the block. bool IsOnlyPredicatedOn(MachineInstr *MI) const { - return IsPredicatedOn(MI) && VPST->Predicates.size() == 1; + return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1; } unsigned size() const { return Insts.size(); } SmallVectorImpl &getInsts() { return Insts; } - MachineInstr *getVPST() const { return VPST->MI; } + MachineInstr *getPredicateThen() const { return PredicateThen->MI; } PredicatedMI *getDivergent() const { return Divergent; } }; @@ -187,6 +198,7 @@ MachineInstr *Dec = nullptr; MachineInstr *End = nullptr; MachineInstr *VCTP = nullptr; + SmallPtrSet SecondaryVCTPs; VPTBlock *CurrentBlock = nullptr; SetVector CurrentPredicate; SmallVector VPTBlocks; @@ -485,6 +497,8 @@ if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) { SmallPtrSet ElementChain; SmallPtrSet Ignore = { VCTP }; + Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end()); + unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode()); if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) { @@ -508,6 +522,7 @@ ToRemove.insert(ElementChain.begin(), ElementChain.end()); } } + return true; } @@ -781,29 +796,35 @@ if (CannotTailPredicate) return false; - // Only support a single vctp. - if (isVCTP(MI) && VCTP) - return false; - - // Start a new vpt block when we discover a vpt. - if (MI->getOpcode() == ARM::MVE_VPST) { + if (isVCTP(MI)) { + // If we find another VCTP, check whether it's identical to the "main" VCTP. + // If it is, store it in the SecondaryVCTPs set. + if (VCTP) { + if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1))) + return false; + LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI); + SecondaryVCTPs.insert(MI); + } else { + LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI); + VCTP = MI; + } + } else if (isVPTOpcode(MI->getOpcode())) { + // Start a new vpt block when we discover a VPT or a VPST. VPTBlocks.emplace_back(MI, CurrentPredicate); CurrentBlock = &VPTBlocks.back(); return true; - } else if (isVCTP(MI)) - VCTP = MI; - else if (MI->getOpcode() == ARM::MVE_VPSEL || - MI->getOpcode() == ARM::MVE_VPNOT) - return false; - + } // TODO: Allow VPSEL and VPNOT, we currently cannot because: // 1) It will use the VPR as a predicate operand, but doesn't have to be // instead a VPT block, which means we can assert while building up - // the VPT block because we don't find another VPST to being a new + // the VPT block because we don't find another VPT or VPST to being a new // one. // 2) VPSEL still requires a VPR operand even after tail predicating, // which means we can't remove it unless there is another // instruction, such as vcmp, that can provide the VPR def. + else if (MI->getOpcode() == ARM::MVE_VPSEL || + MI->getOpcode() == ARM::MVE_VPNOT) + return false; bool IsUse = false; bool IsDef = false; @@ -1197,12 +1218,12 @@ if (Block.HasNonUniformPredicate()) { PredicatedMI *Divergent = Block.getDivergent(); if (isVCTP(Divergent->MI)) { - // The vctp will be removed, so the block mask of the VPST/VPT will need + // The vctp will be removed, so the block mask of the vp(s)t will need // to be recomputed. - LoLoop.BlockMasksToRecompute.insert(Block.getVPST()); - } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { - // The VPT block has a non-uniform predicate but it's entry is guarded - // only by a vctp, which means we: + LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); + } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { + // The VPT block has a non-uniform predicate but it uses a vpst and its + // entry is guarded only by a vctp, which means we: // - Need to remove the original vpst. // - Then need to unpredicate any following instructions, until // we come across the divergent vpr def. @@ -1210,7 +1231,7 @@ // the divergent vpr def. // TODO: We could be producing more VPT blocks than necessary and could // fold the newly created one into a proceeding one. - for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()), + for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()), E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I) RemovePredicate(&*I); @@ -1223,29 +1244,57 @@ ++Size; ++I; } - // Create a VPST with a null mask, we'll recompute it later. + // Create a VPST (with a null mask for now, we'll recompute it later). MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt, InsertAt->getDebugLoc(), TII->get(ARM::MVE_VPST)); MIB.addImm(0); - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB); - LoLoop.ToRemove.insert(Block.getVPST()); + LoLoop.ToRemove.insert(Block.getPredicateThen()); LoLoop.BlockMasksToRecompute.insert(MIB.getInstr()); } - } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) { - // A vpt block which is only predicated upon vctp and has no internal vpr - // defs: + // Else, iterate over the block, removing the extra VCTPs it may contain. + else { + bool RemovedVCTP = false; + for(PredicatedMI &Elt : Block.getInsts()) { + MachineInstr* MI = Elt.MI; + if(isVCTP(MI)) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI); + LoLoop.ToRemove.insert(MI); + RemovedVCTP = true; + continue; + } + } + if(RemovedVCTP) + LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen()); + } + } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) { + // A vpt block starting with VPST, is only predicated upon vctp and has no + // internal vpr defs: // - Remove vpst. // - Unpredicate the remaining instructions. - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST()); - LoLoop.ToRemove.insert(Block.getVPST()); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen()); + LoLoop.ToRemove.insert(Block.getPredicateThen()); for (auto &PredMI : Insts) RemovePredicate(PredMI.MI); } } - LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); + LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n"); + // Remove the "main" VCTP LoLoop.ToRemove.insert(LoLoop.VCTP); + LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP); + // Remove remaining secondary VCTPs + for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) { + // All VCTPs that aren't marked for removal yet should be unpredicated ones. + // The predicated ones should have already been marked for removal when + // visiting the VPT blocks. + if (LoLoop.ToRemove.insert(VCTP).second) { + assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None && + "Removing Predicated VCTP without updating the block mask!"); + LLVM_DEBUG(dbgs() << " " << *VCTP); + } + } } void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll @@ -449,26 +449,17 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r7, pc} -; CHECK-NEXT: add.w r12, r3, #3 -; CHECK-NEXT: mov.w lr, #1 -; CHECK-NEXT: bic r12, r12, #3 -; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, r3 ; CHECK-NEXT: .LBB5_1: @ %bb12 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: vpttt.i32 ne, q0, zr +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vptt.i32 ne, q0, zr ; CHECK-NEXT: vcmpt.s32 le, q0, r2 -; CHECK-NEXT: vctpt.32 r3 ; CHECK-NEXT: vldrwt.u32 q1, [r1], #16 -; CHECK-NEXT: subs r3, #4 ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrwt.32 q0, [r0], #16 -; CHECK-NEXT: le lr, .LBB5_1 +; CHECK-NEXT: letp lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %bb32 ; CHECK-NEXT: pop {r7, pc} bb: Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir @@ -118,24 +118,16 @@ ; CHECK: bb.1.bb3: ; CHECK: successors: %bb.2(0x80000000) ; CHECK: liveins: $r0, $r1, $r2, $r3 - ; CHECK: renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg - ; CHECK: renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg ; CHECK: $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg - ; CHECK: renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg ; CHECK: VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0) ; CHECK: $r3 = tMOVr $r0, 14 /* CC::al */, $noreg - ; CHECK: renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg - ; CHECK: $lr = t2DLS killed renamable $lr + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r2 ; CHECK: bb.2.bb9: ; CHECK: successors: %bb.2(0x7c000000), %bb.3(0x04000000) - ; CHECK: liveins: $lr, $r0, $r1, $r2, $r3 + ; CHECK: liveins: $lr, $r0, $r1, $r3 ; CHECK: renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0) - ; CHECK: MVE_VPST 4, implicit $vpr - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr - ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) - ; CHECK: renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg - ; CHECK: renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg + ; CHECK: MVE_VPST 8, implicit $vpr + ; CHECK: renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4) ; CHECK: MVE_VPST 4, implicit $vpr ; CHECK: renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr ; CHECK: renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4) @@ -143,7 +135,7 @@ ; CHECK: MVE_VPST 8, implicit $vpr ; CHECK: MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4) ; CHECK: $r0 = tMOVr $r3, 14 /* CC::al */, $noreg - ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.2 + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.2 ; CHECK: bb.3.bb27: ; CHECK: $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg ; CHECK: tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir =================================================================== --- /dev/null +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir @@ -0,0 +1,165 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s + +--- | + @_ZL3arr = internal global [10 x i32] [i32 1, i32 2, i32 3, i32 5, i32 5, i32 5, i32 -2, i32 0, i32 -8, i32 -1], align 4 + @.str = private unnamed_addr constant [5 x i8] c"%d, \00", align 1 + + define arm_aapcs_vfpcc void @vpt_block(i32* nocapture %A, i32 %n, i32 %x) { + entry: + %cmp9 = icmp sgt i32 %n, 0 + %0 = add i32 %n, 3 + %1 = lshr i32 %0, 2 + %2 = shl nuw i32 %1, 2 + %3 = add i32 %2, -4 + %4 = lshr i32 %3, 2 + %5 = add nuw nsw i32 %4, 1 + br i1 %cmp9, label %vector.ph, label %for.cond.cleanup + + vector.ph: ; preds = %entry + %sub = sub nsw i32 0, %x + call void @llvm.set.loop.iterations.i32(i32 %5) + br label %vector.body + + vector.body: ; preds = %vector.body, %vector.ph + %lsr.iv1 = phi i32* [ %scevgep, %vector.body ], [ %A, %vector.ph ] + %6 = phi i32 [ %5, %vector.ph ], [ %18, %vector.body ] + %7 = phi i32 [ %n, %vector.ph ], [ %9, %vector.body ] + %lsr.iv12 = bitcast i32* %lsr.iv1 to <4 x i32>* + %8 = call <4 x i1> @llvm.arm.mve.vctp32(i32 %7) + %9 = sub i32 %7, 4 + %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %lsr.iv12, i32 4, <4 x i1> %8, <4 x i32> undef) + %10 = insertelement <4 x i32> undef, i32 %x, i32 0 + %11 = shufflevector <4 x i32> %10, <4 x i32> undef, <4 x i32> zeroinitializer + %12 = icmp slt <4 x i32> %wide.masked.load, %11 + %13 = insertelement <4 x i32> undef, i32 %sub, i32 0 + %14 = shufflevector <4 x i32> %13, <4 x i32> undef, <4 x i32> zeroinitializer + %15 = icmp sgt <4 x i32> %wide.masked.load, %14 + %16 = and <4 x i1> %12, %15 + %17 = and <4 x i1> %16, %8 + call void @llvm.masked.store.v4i32.p0v4i32(<4 x i32> zeroinitializer, <4 x i32>* %lsr.iv12, i32 4, <4 x i1> %17) + %scevgep = getelementptr i32, i32* %lsr.iv1, i32 4 + %18 = call i32 @llvm.loop.decrement.reg.i32.i32.i32(i32 %6, i32 1) + %19 = icmp ne i32 %18, 0 + br i1 %19, label %vector.body, label %for.cond.cleanup + + for.cond.cleanup: ; preds = %vector.body, %entry + ret void + } + + declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) + declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) + declare void @llvm.set.loop.iterations.i32(i32) + declare i32 @llvm.loop.decrement.reg.i32.i32.i32(i32, i32) + declare <4 x i1> @llvm.arm.mve.vctp32(i32) +... +--- +name: vpt_block +alignment: 2 +exposesReturnsTwice: false +legalized: false +regBankSelected: false +selected: false +failedISel: false +tracksRegLiveness: true +hasWinCFI: false +registers: [] +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } + - { reg: '$r2', virtual-reg: '' } +frameInfo: + isFrameAddressTaken: false + isReturnAddressTaken: false + hasStackMap: false + hasPatchPoint: false + stackSize: 8 + offsetAdjustment: 0 + maxAlignment: 4 + adjustsStack: false + hasCalls: false + stackProtector: '' + maxCallFrameSize: 0 + cvBytesOfCalleeSavedRegisters: 0 + hasOpaqueSPAdjustment: false + hasVAStart: false + hasMustTailInVarArgFunc: false + localFrameSize: 0 + savePoint: '' + restorePoint: '' +fixedStack: [] +stack: + - { id: 0, name: '', type: spill-slot, offset: -4, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$lr', callee-saved-restored: false, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } + - { id: 1, name: '', type: spill-slot, offset: -8, size: 4, alignment: 4, + stack-id: default, callee-saved-register: '$r7', callee-saved-restored: true, + debug-info-variable: '', debug-info-expression: '', debug-info-location: '' } +callSites: [] +constants: [] +machineFunctionInfo: {} +body: | + ; CHECK-LABEL: name: vpt_block + ; CHECK: bb.0.entry: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: liveins: $lr, $r0, $r1, $r2, $r7 + ; CHECK: frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + ; CHECK: frame-setup CFI_INSTRUCTION def_cfa_offset 8 + ; CHECK: frame-setup CFI_INSTRUCTION offset $lr, -4 + ; CHECK: frame-setup CFI_INSTRUCTION offset $r7, -8 + ; CHECK: tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + ; CHECK: t2IT 11, 8, implicit-def $itstate + ; CHECK: frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + ; CHECK: renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + ; CHECK: renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg + ; CHECK: $lr = MVE_DLSTP_32 killed renamable $r1 + ; CHECK: bb.1.vector.body: + ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK: liveins: $lr, $q0, $r0, $r2, $r3 + ; CHECK: renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg + ; CHECK: MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr + ; CHECK: renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr + ; CHECK: renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr + ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1 + ; CHECK: bb.2.for.cond.cleanup: + ; CHECK: frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc + bb.0.entry: + successors: %bb.1(0x80000000) + liveins: $r0, $r1, $r2, $r7, $lr + + frame-setup tPUSH 14 /* CC::al */, $noreg, killed $r7, killed $lr, implicit-def $sp, implicit $sp + frame-setup CFI_INSTRUCTION def_cfa_offset 8 + frame-setup CFI_INSTRUCTION offset $lr, -4 + frame-setup CFI_INSTRUCTION offset $r7, -8 + tCMPi8 renamable $r1, 1, 14 /* CC::al */, $noreg, implicit-def $cpsr + t2IT 11, 8, implicit-def $itstate + frame-destroy tPOP_RET 11 /* CC::lt */, killed $cpsr, def $r7, def $pc, implicit killed $itstate + renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg + renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0 + renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg + renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg + renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg + renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg + t2DoLoopStart renamable $lr + + bb.1.vector.body: + successors: %bb.1(0x7c000000), %bb.2(0x04000000) + liveins: $lr, $q0, $r0, $r1, $r2, $r3 + + renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg + MVE_VPST 8, implicit $vpr + renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr + MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr + renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr + renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr + renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr + renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg + renamable $lr = t2LoopDec killed renamable $lr, 1 + t2LoopEnd renamable $lr, %bb.1, implicit-def dead $cpsr + tB %bb.2, 14 /* CC::al */, $noreg + + bb.2.for.cond.cleanup: + frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc + +...