Index: llvm/lib/Target/ARM/ARMInstrVFP.td
===================================================================
--- llvm/lib/Target/ARM/ARMInstrVFP.td
+++ llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -2845,7 +2845,8 @@
   }
 }
 
-let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+let Predicates = [HasV8_1MMainline, HasMVEInt],
+    D = MVEDomain, validForTailPredication=1 in {
   let Uses = [VPR] in {
     defm VSTR_VPR : vfp_vstrldr_sysreg<0b0,0b1100, "vpr">;
   }
@@ -2863,7 +2864,8 @@
   }
 }
 
-let Predicates = [HasV8_1MMainline, HasMVEInt] in {
+let Predicates = [HasV8_1MMainline, HasMVEInt],
+    D = MVEDomain, validForTailPredication=1 in {
   let Defs = [VPR] in {
     defm VLDR_VPR : vfp_vstrldr_sysreg<0b1,0b1100, "vpr">;
   }
Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -160,7 +160,6 @@
   }
 
   void addInst(MachineInstr *MI, SetVector<MachineInstr *> &Preds) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
     if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
       Divergent = &Insts.back();
       LLVM_DEBUG(dbgs() << "  - has divergent predicate: " << *Divergent->MI);
@@ -850,61 +849,88 @@
     dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
 }
 
+// Check whether this instruction is predicated and whether it would be legal
+// to tail predicate it. Predication considerations:
+// - VPR writes reset the VPR when the instruction is not in a VPT block.
+// - VPR writes are AND'd when the instruction is in a VPT block, or when it
+//   is in a tail predicated loop.
+// This means that predicated VPR defs build up an AND chain of sequential
+// predicates. When we perform the tail predication conversion, we effectively
+// have a loop contained within a large VPT, where the VPR is reset for each
+// loop iteration and the rest of the defs for the iteration build up by
+// ANDing their result with the current VPR.P0 value. So, we need to figure
+// out whether the predication will be equivalent in the current loop and in
+// its tail predicated form:
+// - Each VPT block will have an entry predicate that controls it.
+// - A VPT instruction will generate a new value, whereas a VPST will use the
+//   current value.
+// - We may have multiple VCTP instructions, but they need to use the same
+//   input. They don't have to be completely equivalent, as we may have an
+//   unpredicated VCTP as well as another within a VPT block.
+// - There shouldn't be any predicated instructions that aren't predicated
+//   upon the VCTP, but this doesn't mean that all VPT blocks need an entry
+//   predicate based upon a VCTP. Illustration:
+//   VCTP
+//   VPSTT    <- VPR = VCTP
+//   VLDRT    <- predicated only on VCTP
+//   VLDRT    <- predicated only on VCTP
+//   VPTT     <- VPR = VPT
+//   VCTPT    <- predicated only on VPT
+//            <- VPR = VPT && VCTP (this VCTP isn't identical to the first)
+//   VSTRT    <- predicated upon VPT && VCTP
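+// As a rough sketch (these are not instructions the pass emits), the tail
+// predicated form of the illustration above behaves as if the whole body
+// were nested under one implicit loop-wide VCTP:
+//   VLDR     <- implicitly predicated by the loop's VCTP
+//   VLDR     <- implicitly predicated by the loop's VCTP
+//   VPTT     <- VPR = VPT, ANDed with the implicit VCTP
+//   VCTPT
+//   VSTRT    <- still predicated upon VPT && VCTP
+// The checks below verify that both forms compute equivalent predicates.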
+
 bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
   if (CannotTailPredicate)
     return false;
 
-  if (isVCTP(MI)) {
-    // If we find another VCTP, check whether it uses the same value as the main VCTP.
-    // If it does, store it in the SecondaryVCTPs set, else refuse it.
-    if (VCTP) {
-      if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
-          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
-        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
-                             "definition from the main VCTP");
-        return false;
-      }
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
-      SecondaryVCTPs.insert(MI);
-    } else {
-      LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
-      VCTP = MI;
-    }
-  } else if (isVPTOpcode(MI->getOpcode())) {
+  // Don't check non-MVE instructions.
+  const MCInstrDesc &MCID = MI->getDesc();
+  uint64_t Flags = MCID.TSFlags;
+  if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+    return true;
+
+  // TODO: Allow VPSEL and VPNOT; we currently cannot, because:
+  // 1) It will use the VPR as a predicate operand, but doesn't have to be
+  //    inside a VPT block, which means we can assert while building up
+  //    the VPT block because we don't find another VPT or VPST to begin a
+  //    new one.
+  // 2) VPSEL still requires a VPR operand even after tail predicating,
+  //    which means we can't remove it unless there is another
+  //    instruction, such as vcmp, that can provide the VPR def.
+  if (MI->getOpcode() == ARM::MVE_VPSEL ||
+      MI->getOpcode() == ARM::MVE_VPNOT)
+    return false;
+
+  // Beginning a new VPT block.
+  if (isVPTOpcode(MI->getOpcode())) {
     if (MI->getOpcode() != ARM::MVE_VPST) {
       assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
              "VPT does not implicitly define VPR?!");
+      CurrentPredicate.clear();
       CurrentPredicate.insert(MI);
     }
-
+    assert(CurrentPredicate.size() && "Can't begin VPT without predicate");
     VPTBlocks.emplace_back(MI, CurrentPredicate);
     CurrentBlock = &VPTBlocks.back();
+    LLVM_DEBUG(dbgs() << "ARM Loops: Created new VPT block with predicate:\n";
+               for (auto *PI : CurrentPredicate)
+                 dbgs() << "   - " << *PI;
+               dbgs() << " - at: " << *MI);
     return true;
-  } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
-             MI->getOpcode() == ARM::MVE_VPNOT) {
-    // TODO: Allow VPSEL and VPNOT, we currently cannot because:
-    // 1) It will use the VPR as a predicate operand, but doesn't have to be
-    // instead a VPT block, which means we can assert while building up
-    // the VPT block because we don't find another VPT or VPST to being a new
-    // one.
-    // 2) VPSEL still requires a VPR operand even after tail predicating,
-    // which means we can't remove it unless there is another
-    // instruction, such as vcmp, that can provide the VPR def.
-    return false;
   }
 
+  // Inspect the instruction for uses of the VPR, where we only allow it to
+  // be used as a predicate register. We allow instructions to define the
+  // VPR, without necessarily being predicated upon the VCTP, because we can
+  // build up the ANDed predicate.
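+  // For example, a predicated VCMP both uses and defines the VPR: inside a
+  // VPT block its result is ANDed into the current predicate, extending the
+  // chain rather than breaking it.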
   bool IsUse = false;
-  bool IsDef = false;
-  const MCInstrDesc &MCID = MI->getDesc();
+  bool IsDef = MI->findRegisterDefOperandIdx(ARM::VPR) != -1;
   for (int i = MI->getNumOperands() - 1; i >= 0; --i) {
     const MachineOperand &MO = MI->getOperand(i);
-    if (!MO.isReg() || MO.getReg() != ARM::VPR)
+    if (!MO.isReg() || MO.isDef() || MO.getReg() != ARM::VPR)
       continue;
 
-    if (MO.isDef()) {
-      CurrentPredicate.insert(MI);
-      IsDef = true;
-    } else if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
+    if (ARM::isVpred(MCID.OpInfo[i].OperandType)) {
       CurrentBlock->addInst(MI, CurrentPredicate);
       IsUse = true;
     } else {
@@ -913,18 +939,6 @@
     }
   }
 
-  // If we find a vpr def that is not already predicated on the vctp, we've
-  // got disjoint predicates that may not be equivalent when we do the
-  // conversion.
-  if (IsDef && !IsUse && VCTP && !isVCTP(MI)) {
-    LLVM_DEBUG(dbgs() << "ARM Loops: Found disjoint vpr def: " << *MI);
-    return false;
-  }
-
-  uint64_t Flags = MCID.TSFlags;
-  if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
-    return true;
-
   // If we find an instruction that has been marked as not valid for tail
   // predication, only allow the instruction if it's contained within a valid
   // VPT block.
@@ -935,7 +949,37 @@
 
   // If the instruction is already explicitly predicated, then the conversion
   // will be fine, but ensure that all store operations are predicated.
-  return !IsUse && MI->mayStore() ? false : true;
+  if (!IsUse && MI->mayStore())
+    return false;
+
+  // Assign the first VCTP and collect any 'secondary' ones, checking that
+  // they're generating the same predicate as the first.
+  if (isVCTP(MI)) {
+    if (!VCTP)
+      VCTP = MI;
+    else {
+      if (VCTP->getOpcode() != MI->getOpcode() ||
+          !VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+          !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
+        LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+                             "definition from the main VCTP\n");
+        return false;
+      }
+      LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
+      SecondaryVCTPs.insert(MI);
+    }
+  }
+
+  // If this instruction defines the VPR, update the predicate for the
+  // following instructions.
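+  // For example, an unpredicated VCMP resets the VPR and so begins a fresh
+  // predicate, whereas a predicated def is ANDed with, and appended to, the
+  // current predicate set.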
+  if (IsDef) {
+    // Clear the existing predicate when we're not in VPT Active state.
+    if (!isVectorPredicated(MI))
+      CurrentPredicate.clear();
+    CurrentPredicate.insert(MI);
+    LLVM_DEBUG(dbgs() << "ARM Loops: Adding Predicate: " << *MI);
+  }
+
+  return true;
 }
 
 bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/cond-vector-reduce-mve-codegen.ll
@@ -465,19 +465,28 @@
 ; CHECK-NEXT:    it eq
 ; CHECK-NEXT:    popeq {r7, pc}
 ; CHECK-NEXT:  .LBB5_1: @ %bb4
+; CHECK-NEXT:    add.w r12, r3, #3
+; CHECK-NEXT:    mov.w lr, #1
+; CHECK-NEXT:    bic r12, r12, #3
+; CHECK-NEXT:    sub.w r12, r12, #4
+; CHECK-NEXT:    add.w lr, lr, r12, lsr #2
 ; CHECK-NEXT:    mov.w r12, #0
-; CHECK-NEXT:    dlstp.32 lr, r3
+; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB5_2: @ %bb12
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vptt.i32 ne, q0, zr
+; CHECK-NEXT:    vctp.32 r3
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q0, [r0]
+; CHECK-NEXT:    vpttt.i32 ne, q0, zr
 ; CHECK-NEXT:    vcmpt.s32 le, q0, r2
+; CHECK-NEXT:    vctpt.32 r3
 ; CHECK-NEXT:    vldrwt.u32 q1, [r1], #16
 ; CHECK-NEXT:    add.w r12, r12, #4
+; CHECK-NEXT:    subs r3, #4
 ; CHECK-NEXT:    vmul.i32 q0, q1, q0
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vstrwt.32 q0, [r0], #16
-; CHECK-NEXT:    letp lr, .LBB5_2
+; CHECK-NEXT:    le lr, .LBB5_2
 ; CHECK-NEXT:  @ %bb.3: @ %bb32
 ; CHECK-NEXT:    pop {r7, pc}
 bb:
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/disjoint-vcmp.mir
@@ -135,27 +135,34 @@
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
   ; CHECK:   $r12 = t2MOVi16 target-flags(arm-lo16) @mask, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r4, dead $cpsr = tADDi3 renamable $r2, 3, 14 /* CC::al */, $noreg
   ; CHECK:   $r12 = t2MOVTi16 killed $r12, target-flags(arm-hi16) @mask, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r4 = t2BICri killed renamable $r4, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r5 = t2LDRHi12 killed renamable $r12, 0, 14 /* CC::al */, $noreg :: (dereferenceable load 2 from %ir.mask.gep9)
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r4, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r4, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
   ; CHECK:   $vpr = VMSR_P0 $r5, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r4, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 16, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
   ; CHECK:   renamable $q0 = MVE_VDUP32 killed renamable $r5, 0, $noreg, undef renamable $q0
   ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK: bb.2.bb9:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r3, $r12
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r12
   ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 4, implicit $vpr
+  ; CHECK:   MVE_VPST 2, implicit $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
   ; CHECK:   renamable $r1, renamable $q1 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
   ; CHECK:   renamable $r3, renamable $q2 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
+  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q1 = nsw MVE_VMULi32 killed renamable $q2, killed renamable $q1, 0, $noreg, undef renamable $q1
   ; CHECK:   renamable $r12, renamable $q2 = MVE_VLDRWU32_pre killed renamable $r12, 16, 0, $noreg :: (load 16 from %ir.scevgep2, align 8)
   ; CHECK:   MVE_VPTv4u32 8, renamable $q0, killed renamable $q2, 2, implicit-def $vpr
   ; CHECK:   MVE_VSTRWU32 killed renamable $q1, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
   ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.bb27:
   ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r4, def $r5, def $r7, def $pc
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/remat-vctp.ll
@@ -7,13 +7,23 @@
 ; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    ldrd r5, r12, [sp, #80]
+; CHECK-NEXT:    cmp.w r12, #4
+; CHECK-NEXT:    mov r4, r12
 ; CHECK-NEXT:    vmvn.i32 q0, #0x80000000
+; CHECK-NEXT:    it ge
+; CHECK-NEXT:    movge r4, #4
 ; CHECK-NEXT:    vmov.i32 q1, #0x3f
+; CHECK-NEXT:    sub.w r4, r12, r4
 ; CHECK-NEXT:    vmov.i32 q2, #0x1
-; CHECK-NEXT:    dlstp.32 lr, r12
+; CHECK-NEXT:    add.w lr, r4, #3
+; CHECK-NEXT:    movs r4, #1
+; CHECK-NEXT:    add.w lr, r4, lr, lsr #2
+; CHECK-NEXT:    dls lr, lr
 ; CHECK-NEXT:  .LBB0_1: @ %bb6
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q4, [r1], #16
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q4, [r1], #16
 ; CHECK-NEXT:    vabs.s32 q5, q4
 ; CHECK-NEXT:    vcls.s32 q3, q5
 ; CHECK-NEXT:    vshl.u32 q5, q5, q3
@@ -31,13 +41,15 @@
 ; CHECK-NEXT:    vqshl.s32 q5, q5, #1
 ; CHECK-NEXT:    vpt.s32 lt, q4, zr
 ; CHECK-NEXT:    vnegt.s32 q5, q5
+; CHECK-NEXT:    vctp.32 r12
+; CHECK-NEXT:    sub.w r12, r12, #4
 ; CHECK-NEXT:    vpst
 ; CHECK-NEXT:    vldrwt.u32 q4, [r0], #16
 ; CHECK-NEXT:    vqrdmulh.s32 q4, q4, q5
 ; CHECK-NEXT:    vpstt
 ; CHECK-NEXT:    vstrwt.32 q4, [r2], #16
 ; CHECK-NEXT:    vstrwt.32 q3, [r3], #16
-; CHECK-NEXT:    letp lr, .LBB0_1
+; CHECK-NEXT:    le lr, .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %bb44
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, r5, r7, pc}
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/tp-multiple-vpst.ll
@@ -0,0 +1,67 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -tail-predication=enabled %s -o - | FileCheck %s
+
+define dso_local arm_aapcs_vfpcc i32 @minmaxval4(i32* nocapture readonly %x, i32* nocapture %minp) {
+; CHECK-LABEL: minmaxval4:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    push {r7, lr}
+; CHECK-NEXT:    mov.w lr, #3
+; CHECK-NEXT:    vmov.i32 q0, #0x80000000
+; CHECK-NEXT:    vmvn.i32 q1, #0x80000000
+; CHECK-NEXT:    movs r2, #10
+; CHECK-NEXT:    dls lr, lr
+; CHECK-NEXT:  .LBB0_1: @ %vector.body
+; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
+; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    vpst
+; CHECK-NEXT:    vldrwt.u32 q2, [r0], #16
+; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vcmpt.s32 gt, q2, q0
+; CHECK-NEXT:    vmovt q0, q2
+; CHECK-NEXT:    vctp.32 r2
+; CHECK-NEXT:    subs r2, #4
+; CHECK-NEXT:    vpstt
+; CHECK-NEXT:    vcmpt.s32 gt, q1, q2
+; CHECK-NEXT:    vmovt q1, q2
+; CHECK-NEXT:    le lr, .LBB0_1
+; CHECK-NEXT:  @ %bb.2: @ %middle.block
+; CHECK-NEXT:    mvn r0, #-2147483648
+; CHECK-NEXT:    vminv.s32 r0, q1
+; CHECK-NEXT:    str r0, [r1]
+; CHECK-NEXT:    mov.w r0, #-2147483648
+; CHECK-NEXT:    vmaxv.s32 r0, q0
+; CHECK-NEXT:    pop {r7, pc}
+entry:
+  br label %vector.body
+
+vector.body:                                      ; preds = %vector.body, %entry
+  %index = phi i32 [ 0, %entry ], [ %index.next, %vector.body ]
+  %vec.phi = phi <4 x i32> [ <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>, %entry ], [ %5, %vector.body ]
+  %vec.phi29 = phi <4 x i32> [ <i32 -2147483648, i32 -2147483648, i32 -2147483648, i32 -2147483648>, %entry ], [ %7, %vector.body ]
+  %0 = getelementptr inbounds i32, i32* %x, i32 %index
+  %active.lane.mask = call <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 10)
+  %1 = bitcast i32* %0 to <4 x i32>*
+  %wide.masked.load = call <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>* %1, i32 4, <4 x i1> %active.lane.mask, <4 x i32> undef)
+  %2 = icmp sgt <4 x i32> %wide.masked.load, %vec.phi29
+  %3 = icmp slt <4 x i32> %wide.masked.load, %vec.phi
+  %4 = and <4 x i1> %active.lane.mask, %3
+  %5 = select <4 x i1> %4, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi
+  %6 = and <4 x i1> %active.lane.mask, %2
+  %7 = select <4 x i1> %6, <4 x i32> %wide.masked.load, <4 x i32> %vec.phi29
+  %index.next = add i32 %index, 4
+  %8 = icmp eq i32 %index.next, 12
+  br i1 %8, label %middle.block, label %vector.body
+
+middle.block:                                     ; preds = %vector.body
+  %9 = call i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32> %7)
+  %10 = call i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32> %5)
+  store i32 %10, i32* %minp, align 4
+  ret i32 %9
+}
+
+declare <4 x i1> @llvm.get.active.lane.mask.v4i1.i32(i32, i32) #1
+declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32 immarg, <4 x i1>, <4 x i32>) #2
+declare i32 @llvm.experimental.vector.reduce.smin.v4i32(<4 x i32>) #3
+declare i32 @llvm.experimental.vector.reduce.smax.v4i32(<4 x i32>) #3
+
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vctp-in-vpt-2.mir
@@ -118,16 +118,24 @@
   ; CHECK: bb.1.bb3:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $r12 = t2ADDri renamable $r2, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $lr = t2MOVi 1, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2BICri killed renamable $r12, 3, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   $vpr = VMSR_P0 killed $r3, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r12, 4, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   VSTR_P0_off killed renamable $vpr, $sp, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.0)
   ; CHECK:   $r3 = tMOVr $r0, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r2
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $lr, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK: bb.2.bb9:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $r0, $r1, $r3
+  ; CHECK:   liveins: $lr, $r0, $r1, $r2, $r3
   ; CHECK:   renamable $vpr = VLDR_P0_off $sp, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.0)
-  ; CHECK:   MVE_VPST 8, implicit $vpr
-  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+  ; CHECK:   MVE_VPST 4, implicit $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 1, killed renamable $vpr
+  ; CHECK:   renamable $r1, renamable $q0 = MVE_VLDRWU32_post killed renamable $r1, 16, 1, killed renamable $vpr :: (load 16 from %ir.lsr.iv24, align 4)
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r2, 0, $noreg
+  ; CHECK:   renamable $r2, dead $cpsr = tSUBi8 killed renamable $r2, 4, 14 /* CC::al */, $noreg
   ; CHECK:   MVE_VPST 4, implicit $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPi32r renamable $q0, $zr, 1, 1, killed renamable $vpr
   ; CHECK:   renamable $r3, renamable $q1 = MVE_VLDRWU32_post killed renamable $r3, 16, 1, renamable $vpr :: (load 16 from %ir.lsr.iv1, align 4)
@@ -135,7 +143,7 @@
   ; CHECK:   MVE_VPST 8, implicit $vpr
   ; CHECK:   MVE_VSTRWU32 killed renamable $q0, killed renamable $r0, 0, 1, killed renamable $vpr :: (store 16 into %ir.lsr.iv1, align 4)
   ; CHECK:   $r0 = tMOVr $r3, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.bb27:
   ; CHECK:   $sp = tADDspi $sp, 1, 14 /* CC::al */, $noreg
   ; CHECK:   tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
===================================================================
--- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vpt-blocks.mir
@@ -215,17 +215,26 @@
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
-  ; CHECK:   MVE_VPTv4s32r 4, renamable $q1, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4s32r 2, renamable $q1, renamable $r2, 11, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
 bb.0.entry:
@@ -593,17 +602,26 @@
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
-  ; CHECK:   MVE_VPTv4s32r 12, renamable $q1, renamable $r2, 10, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4s32r 14, renamable $q1, renamable $r2, 10, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 13, 1, killed renamable $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 2, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 2, killed renamable $vpr
-  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
   ;
@@ -713,17 +731,26 @@
   ; CHECK: bb.1.vector.ph:
   ; CHECK:   successors: %bb.2(0x80000000)
   ; CHECK:   liveins: $r0, $r1, $r2
+  ; CHECK:   renamable $r3, dead $cpsr = tADDi3 renamable $r1, 3, 14 /* CC::al */, $noreg
   ; CHECK:   renamable $q0 = MVE_VMOVimmi32 0, 0, $noreg, undef renamable $q0
+  ; CHECK:   renamable $r3 = t2BICri killed renamable $r3, 3, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r12 = t2SUBri killed renamable $r3, 4, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK:   renamable $r3, dead $cpsr = tMOVi8 1, 14 /* CC::al */, $noreg
+  ; CHECK:   renamable $lr = nuw nsw t2ADDrs killed renamable $r3, killed renamable $r12, 19, 14 /* CC::al */, $noreg, $noreg
   ; CHECK:   renamable $r3, dead $cpsr = nsw tRSB renamable $r2, 14 /* CC::al */, $noreg
-  ; CHECK:   $lr = MVE_DLSTP_32 killed renamable $r1
+  ; CHECK:   $lr = t2DLS killed renamable $lr
   ; CHECK: bb.2.vector.body:
   ; CHECK:   successors: %bb.2(0x7c000000), %bb.3(0x04000000)
-  ; CHECK:   liveins: $lr, $q0, $r0, $r2, $r3
-  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 0, killed $noreg
-  ; CHECK:   MVE_VPTv4s32r 4, renamable $q0, renamable $r2, 11, implicit-def $vpr
+  ; CHECK:   liveins: $lr, $q0, $r0, $r1, $r2, $r3
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 0, $noreg
+  ; CHECK:   MVE_VPST 8, implicit $vpr
+  ; CHECK:   renamable $q1 = MVE_VLDRWU32 renamable $r0, 0, 1, killed renamable $vpr
+  ; CHECK:   MVE_VPTv4s32r 2, renamable $q0, renamable $r2, 11, implicit-def $vpr
   ; CHECK:   renamable $vpr = MVE_VCMPs32r killed renamable $q1, renamable $r3, 12, 1, killed renamable $vpr
+  ; CHECK:   renamable $vpr = MVE_VCTP32 renamable $r1, 1, killed renamable $vpr
   ; CHECK:   renamable $r0 = MVE_VSTRWU32_post renamable $q0, killed renamable $r0, 16, 1, killed renamable $vpr
-  ; CHECK:   $lr = MVE_LETP killed renamable $lr, %bb.2
+  ; CHECK:   renamable $r1, dead $cpsr = tSUBi8 killed renamable $r1, 4, 14 /* CC::al */, $noreg
+  ; CHECK:   $lr = t2LEUpdate killed renamable $lr, %bb.2
   ; CHECK: bb.3.for.cond.cleanup:
   ; CHECK:   frame-destroy tPOP_RET 14 /* CC::al */, $noreg, def $r7, def $pc
 bb.0.entry: