Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -1084,7 +1084,80 @@
   return true;
 }
 
-bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
+static bool ValidateMVEStore(MachineInstr *MI, MachineLoop *ML) {
+
+  auto GetFrameIndex = [](MachineMemOperand *Operand) {
+    auto PseudoValue = Operand->getPseudoValue();
+    if (PseudoValue->kind() == PseudoSourceValue::FixedStack) {
+      if (const auto *FS = dyn_cast<FixedStackPseudoSourceValue>(PseudoValue)) {
+        return FS->getFrameIndex();
+      }
+    }
+    return -1;
+  };
+
+  auto IsStackOp = [GetFrameIndex](MachineInstr *I) {
+    switch (I->getOpcode()) {
+    case ARM::MVE_VSTRWU32:
+    case ARM::MVE_VLDRWU32: {
+      return I->getOperand(1).getReg() == ARM::SP &&
+             I->memoperands().size() >= 1 &&
+             GetFrameIndex(I->memoperands().front()) >= 0;
+    }
+    default:
+      return false;
+    }
+  };
+
+  // An unpredicated vector register spill is allowed if all of the uses of the
+  // stack slot are within the loop
+  if (MI->getOpcode() != ARM::MVE_VSTRWU32 || !IsStackOp(MI))
+    return false;
+
+  // Search all blocks after the loop for accesses to the same stack slot.
+  // ReachingDefAnalysis doesn't work for sp as it relies on registers being
+  // live-out (which sp never is) to know what blocks to look in
+  if (MI->memoperands().size() == 0)
+    return false;
+  int FI = GetFrameIndex(MI->memoperands().front());
+
+  MachineFrameInfo FrameInfo = MI->getParent()->getParent()->getFrameInfo();
+  if (FI == -1 || !FrameInfo.isSpillSlotObjectIndex(FI))
+    return false;
+
+  SmallVector<MachineBasicBlock *, 4> Frontier;
+  ML->getExitBlocks(Frontier);
+  SmallPtrSet<MachineBasicBlock *, 4> Visited{MI->getParent()};
+  unsigned int Idx = 0;
+  while (Idx < Frontier.size()) {
+    MachineBasicBlock *BB = Frontier[Idx];
+    for (auto &I : *BB) {
+      if (!IsStackOp(&I) || I.memoperands().size() == 0)
+        continue;
+      if (GetFrameIndex(I.memoperands().front()) != FI)
+        continue;
+      // If this block has a store to the stack slot before any loads then we
+      // can ignore the block
+      if (I.getOpcode() == ARM::MVE_VSTRWU32)
+        break;
+      // If the store and the load are using the same stack slot then the
+      // store isn't valid for tail predication
+      if (I.getOpcode() == ARM::MVE_VLDRWU32)
+        return false;
+    }
+
+    for (auto Succ : BB->successors()) {
+      if (!Visited.contains(Succ))
+        Frontier.push_back(Succ);
+    }
+    Visited.insert(BB);
+    Idx++;
+  }
+
+  return true;
+}
+
+bool LowOverheadLoop::ValidateMVEInst(MachineInstr *MI) {
   if (CannotTailPredicate)
     return false;
 
@@ -1140,7 +1213,7 @@
 
   // If the instruction is already explicitly predicated, then the conversion
   // will be fine, but ensure that all store operations are predicated.
-  if (MI->mayStore())
+  if (MI->mayStore() && !ValidateMVEStore(MI, &ML))
     return IsUse;
 
   // If this instruction defines the VPR, update the predicate for the
Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector_spill_in_loop.mir
===================================================================
--- /dev/null
+++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector_spill_in_loop.mir
@@ -0,0 +1,166 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=thumbv8.1m.main -mattr=+mve -run-pass=arm-low-overhead-loops %s -o - | FileCheck %s
+--- |
+  define hidden void @vector_spill_in_loop() {
+  entry:
+    ret void
+  }
+
+  define hidden void @vector_spill_load_outside() {
+  entry:
+    ret void
+  }
+...
+---
+name: vector_spill_in_loop
+tracksRegLiveness: true
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+body: |
+  ; CHECK-LABEL: name: vector_spill_in_loop
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
+  ; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+  ; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+  ; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK: $lr = MVE_DLSTP_16 renamable $r3
+  ; CHECK: bb.1:
+  ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r10, $r11, $r12
+  ; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 0, $noreg
+  ; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 0, $noreg
+  ; CHECK: renamable $q5 = MVE_VSHR_immu16 killed renamable $q3, 11, 0, $noreg, undef renamable $q5
+  ; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
+  ; CHECK: dead renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
+  ; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 0, killed $noreg
+  ; CHECK: $lr = MVE_LETP killed renamable $lr, %bb.1
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+
+    $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+    $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
+    $r9 = tMOVr $r3, 14 /* CC::al */, $noreg
+    renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3
+
+  bb.1:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
+
+    renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
+    renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
+    MVE_VPST 2, implicit $vpr
+    renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
+    renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
+    MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
+    MVE_VPST 8, implicit $vpr
+    renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
+    MVE_VPST 1, implicit $vpr
+    renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2:
+    successors: %bb.3(0x04000000), %bb.0(0x7c000000)
+    liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+
+    renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg
+    renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
+    renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg
+    renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
+    renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg
+    tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    tBcc %bb.0, 1 /* CC::ne */, killed $cpsr
+
+  bb.3:
+    $sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg
+    $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15
+    $sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg
+    $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
+...
+---
+name: vector_spill_load_outside
+stack:
+  - { id: 0, name: '', type: spill-slot, offset: -120, size: 16, alignment: 8,
+      stack-id: default, callee-saved-register: '', callee-saved-restored: true,
+      debug-info-variable: '', debug-info-expression: '', debug-info-location: '' }
+tracksRegLiveness: true
+body: |
+  ; CHECK-LABEL: name: vector_spill_load_outside
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
+  ; CHECK: liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+  ; CHECK: $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+  ; CHECK: $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
+  ; CHECK: $r9 = tMOVr $r3, 14 /* CC::al */, $noreg
+  ; CHECK: $lr = t2DLS renamable $r1
+  ; CHECK: bb.1:
+  ; CHECK: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+  ; CHECK: liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
+  ; CHECK: renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
+  ; CHECK: MVE_VPST 8, implicit $vpr
+  ; CHECK: renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
+  ; CHECK: renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
+  ; CHECK: MVE_VPST 2, implicit $vpr
+  ; CHECK: renamable $q5 = MVE_VSHR_immu16 killed renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
+  ; CHECK: renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
+  ; CHECK: MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
+  ; CHECK: MVE_VPST 8, implicit $vpr
+  ; CHECK: dead renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
+  ; CHECK: MVE_VPST 1, implicit $vpr
+  ; CHECK: dead renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
+  ; CHECK: $lr = t2LEUpdate killed renamable $lr, %bb.1
+  bb.0:
+    successors: %bb.1(0x80000000)
+    liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+
+    $r6 = tMOVr $r2, 14 /* CC::al */, $noreg
+    $r0 = tMOVr $r12, 14 /* CC::al */, $noreg
+    $r9 = tMOVr $r3, 14 /* CC::al */, $noreg
+    renamable $lr = t2DoLoopStartTP renamable $r1, renamable $r3
+
+  bb.1:
+    successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+    liveins: $lr, $q0, $r0, $r1, $r2, $r3, $r4, $r5, $r6, $r7, $r8, $r9, $r10, $r11, $r12
+
+    renamable $vpr = MVE_VCTP16 renamable $r9, 0, $noreg
+    MVE_VPST 8, implicit $vpr
+    renamable $r0, renamable $q6 = MVE_VLDRHU16_post killed renamable $r0, 16, 1, renamable $vpr
+    renamable $q3 = MVE_VLDRHU16 renamable $r6, 0, 1, renamable $vpr
+    MVE_VPST 2, implicit $vpr
+    renamable $q5 = MVE_VSHR_immu16 renamable $q3, 11, 1, renamable $vpr, undef renamable $q5
+    renamable $r9 = nsw t2SUBri killed renamable $r9, 8, 14 /* CC::al */, $noreg, $noreg
+    MVE_VSTRWU32 killed renamable $q5, $sp, 80, 0, $noreg :: (store (s128) into %stack.0, align 8)
+    MVE_VPST 8, implicit $vpr
+    renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
+    MVE_VPST 1, implicit $vpr
+    renamable $vpr = MVE_VCMPi16r killed renamable $q6, renamable $r8, 1, 1, killed renamable $vpr
+    renamable $lr = t2LoopEndDec killed renamable $lr, %bb.1, implicit-def dead $cpsr
+    tB %bb.2, 14 /* CC::al */, $noreg
+
+  bb.2:
+    successors: %bb.3(0x04000000), %bb.0(0x7c000000)
+    liveins: $q0, $r1, $r2, $r3, $r4, $r5, $r7, $r8, $r10, $r11, $r12
+
+    renamable $q7 = MVE_VLDRWU32 $sp, 80, 0, $noreg :: (load (s128) from %stack.0, align 8)
+    renamable $r0 = tLDRspi $sp, 1, 14 /* CC::al */, $noreg
+    renamable $r10 = nuw t2ADDri killed renamable $r10, 1, 14 /* CC::al */, $noreg, $noreg
+    renamable $r12 = t2ADDrs killed renamable $r12, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
+    renamable $r0 = tLDRspi $sp, 3, 14 /* CC::al */, $noreg
+    renamable $r2 = t2ADDrs killed renamable $r2, killed renamable $r0, 10, 14 /* CC::al */, $noreg, $noreg
+    renamable $r0 = tLDRspi $sp, 2, 14 /* CC::al */, $noreg
+    tCMPhir renamable $r10, killed renamable $r0, 14 /* CC::al */, $noreg, implicit-def $cpsr
+    tBcc %bb.0, 1 /* CC::ne */, killed $cpsr
+
+  bb.3:
+    $sp = frame-destroy tADDspi $sp, 24, 14 /* CC::al */, $noreg
+    $sp = frame-destroy VLDMDIA_UPD $sp, 14 /* CC::al */, $noreg, def $d8, def $d9, def $d10, def $d11, def $d12, def $d13, def $d14, def $d15
+    $sp = frame-destroy tADDspi $sp, 1, 14 /* CC::al */, $noreg
+    $sp = frame-destroy t2LDMIA_RET $sp, 14 /* CC::al */, $noreg, def $r4, def $r5, def $r6, def $r7, def $r8, def $r9, def $r10, def $r11, def $pc
+
+...