Index: llvm/lib/Target/ARM/ARMInstrMVE.td =================================================================== --- llvm/lib/Target/ARM/ARMInstrMVE.td +++ llvm/lib/Target/ARM/ARMInstrMVE.td @@ -1515,6 +1515,7 @@ let Inst{12-8} = 0b01001; let Inst{4} = 0b1; let Inst{0} = 0b0; + let validForTailPredication = 1; } def MVE_VMULt1i8 : MVE_VMULt1<"i8", 0b00>; Index: llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp =================================================================== --- llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp +++ llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp @@ -25,6 +25,7 @@ #include "llvm/CodeGen/MachineFunctionPass.h" #include "llvm/CodeGen/MachineLoopInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/MC/MCInstrDesc.h" using namespace llvm; @@ -33,6 +34,97 @@ namespace { + struct LowOverheadLoop { + + MachineLoop *ML = nullptr; + MachineFunction *MF = nullptr; + MachineInstr *InsertPt = nullptr; + MachineInstr *Start = nullptr; + MachineInstr *Dec = nullptr; + MachineInstr *End = nullptr; + MachineInstr *VCTP = nullptr; + SmallVector<MachineInstr*, 4> VPTUsers; + bool Revert = false; + bool FoundOneVCTP = false; + bool CannotTailPredicate = false; + + LowOverheadLoop(MachineLoop *ML) : ML(ML) { + MF = ML->getHeader()->getParent(); + } + + // For now, only support one vctp instruction. If we find multiple then + // we shouldn't perform tail predication. + void addVCTP(MachineInstr *MI) { + if (!VCTP) { + VCTP = MI; + FoundOneVCTP = true; + } else + FoundOneVCTP = false; + } + + // Check that nothing else is writing to VPR and record any insts + // reading the VPR. + void ScanForVPR(MachineInstr *MI) { + for (auto &MO : MI->operands()) { + if (!MO.isReg() || MO.getReg() != ARM::VPR) + continue; + if (MO.isUse()) + VPTUsers.push_back(MI); + if (MO.isDef()) { + CannotTailPredicate = true; + break; + } + } + } + + // If this is an MVE instruction, check that we know how to use tail + // predication with it.
+ void CheckTPValidity(MachineInstr *MI) { + if (CannotTailPredicate) + return; + + const MCInstrDesc &MCID = MI->getDesc(); + uint64_t Flags = MCID.TSFlags; + if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE) + return; + + if ((Flags & ARMII::ValidForTailPredication) == 0) { + LLVM_DEBUG(dbgs() << "ARM Loops: Can't tail predicate: " << *MI); + CannotTailPredicate = true; + } + } + + bool IsTailPredicationLegal() const { + // For now, let's keep things really simple and only support a single + // block for tail predication. + return !Revert && FoundAllComponents() && FoundOneVCTP && + !CannotTailPredicate && ML->getNumBlocks() == 1; + } + + // Is it safe to define LR with DLS/WLS? + // LR can be defined if it is the operand to start, because it's the same + // value, or if it's going to be equivalent to the operand to Start. + MachineInstr *IsSafeToDefineLR(); + + // Check the branch targets are within range and we satisfy our restrictions. + void CheckLegality(ARMBasicBlockUtils *BBUtils); + + bool FoundAllComponents() const { + return Start && Dec && End; + } + + void dump() const { + if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; + if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; + if (End) dbgs() << "ARM Loops: Found Loop End: " << *End; + if (VCTP) dbgs() << "ARM Loops: Found VCTP: " << *VCTP; + if (!Start && !Dec && !End) + dbgs() << "ARM Loops: Not a low-overhead loop.\n"; + else if (!(Start && Dec && End)) + dbgs() << "ARM Loops: Failed to find all loop components.\n"; + } + }; + class ARMLowOverheadLoops : public MachineFunctionPass { MachineFunction *MF = nullptr; const ARMBaseInstrInfo *TII = nullptr; @@ -64,8 +156,6 @@ private: bool ProcessLoop(MachineLoop *ML); - MachineInstr * IsSafeToDefineLR(MachineInstr *MI); - bool RevertNonLoops(); void RevertWhile(MachineInstr *MI) const; @@ -74,9 +164,11 @@ void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const; - void Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr
*InsertPt, MachineInstr *Dec, - MachineInstr *End, bool Revert); + void RemoveVPTBlocks(LowOverheadLoop &LoLoop); + + MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop); + + void Expand(LowOverheadLoop &LoLoop); }; } @@ -86,31 +178,6 @@ INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME, false, false) -bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { - const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); - if (!ST.hasLOB()) - return false; - - MF = &mf; - LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); - - auto &MLI = getAnalysis<MachineLoopInfo>(); - MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); - MRI = &MF->getRegInfo(); - TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); - BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); - BBUtils->computeAllBlockSizes(); - BBUtils->adjustBBOffsetsAfter(&MF->front()); - - bool Changed = false; - for (auto ML : MLI) { - if (!ML->getParentLoop()) - Changed |= ProcessLoop(ML); - } - Changed |= RevertNonLoops(); - return Changed; -} - static bool IsLoopStart(MachineInstr &MI) { return MI.getOpcode() == ARM::t2DoLoopStart || MI.getOpcode() == ARM::t2WhileLoopStart; @@ -141,10 +208,20 @@ return nullptr; } -// Is it safe to define LR with DLS/WLS? -// LR can defined if it is the operand to start, because it's the same value, -// or if it's going to be equivalent to the operand to Start.
-MachineInstr *ARMLowOverheadLoops::IsSafeToDefineLR(MachineInstr *Start) { +static bool IsVCTP(MachineInstr *MI) { + switch (MI->getOpcode()) { + default: + break; + case ARM::MVE_VCTP8: + case ARM::MVE_VCTP16: + case ARM::MVE_VCTP32: + case ARM::MVE_VCTP64: + return true; + } + return false; +} + +MachineInstr *LowOverheadLoop::IsSafeToDefineLR() { auto IsMoveLR = [](MachineInstr *MI, unsigned Reg) { return MI->getOpcode() == ARM::tMOVr && @@ -210,6 +287,78 @@ return nullptr; } +void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) { + if (Revert) + return; + + if (!End->getOperand(1).isMBB()) + report_fatal_error("Expected LoopEnd to target basic block"); + + // TODO Maybe there's cases where the target doesn't have to be the header, + // but for now be safe and revert. + if (End->getOperand(1).getMBB() != ML->getHeader()) { + LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); + Revert = true; + return; + } + + // The WLS and LE instructions have 12-bits for the label offset. WLS + // requires a positive offset, while LE uses negative. + if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || + !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { + LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); + Revert = true; + return; + } + + if (Start->getOpcode() == ARM::t2WhileLoopStart && + (BBUtils->getOffsetOf(Start) > + BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || + !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { + LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); + Revert = true; + return; + } + + InsertPt = Revert ? 
nullptr : IsSafeToDefineLR(); + if (!InsertPt) { + LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); + Revert = true; + } else + LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); + + LLVM_DEBUG(if (IsTailPredicationLegal()) { + dbgs() << "ARM Loops: Will use tail predication to convert:\n"; + for (auto *MI : VPTUsers) + dbgs() << " - " << *MI; + }); +} + +bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) { + const ARMSubtarget &ST = static_cast<const ARMSubtarget&>(mf.getSubtarget()); + if (!ST.hasLOB()) + return false; + + MF = &mf; + LLVM_DEBUG(dbgs() << "ARM Loops on " << MF->getName() << " ------------- \n"); + + auto &MLI = getAnalysis<MachineLoopInfo>(); + MF->getProperties().set(MachineFunctionProperties::Property::TracksLiveness); + MRI = &MF->getRegInfo(); + TII = static_cast<const ARMBaseInstrInfo*>(ST.getInstrInfo()); + BBUtils = std::unique_ptr<ARMBasicBlockUtils>(new ARMBasicBlockUtils(*MF)); + BBUtils->computeAllBlockSizes(); + BBUtils->adjustBBOffsetsAfter(&MF->front()); + + bool Changed = false; + for (auto ML : MLI) { + if (!ML->getParentLoop()) + Changed |= ProcessLoop(ML); + } + Changed |= RevertNonLoops(); + return Changed; +} + bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) { bool Changed = false; @@ -233,18 +382,14 @@ return nullptr; }; - MachineInstr *Start = nullptr; - MachineInstr *Dec = nullptr; - MachineInstr *End = nullptr; - bool Revert = false; - + LowOverheadLoop LoLoop(ML); // Search the preheader for the start intrinsic, or look through the // predecessors of the header to find exactly one set.iterations intrinsic. // FIXME: I don't see why we shouldn't be supporting multiple predecessors // with potentially multiple set.loop.iterations, so we need to enable this.
- if (auto *Preheader = ML->getLoopPreheader()) { - Start = SearchForStart(Preheader); - } else { + if (auto *Preheader = ML->getLoopPreheader()) + LoLoop.Start = SearchForStart(Preheader); + else { LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find loop preheader!\n" << " - Performing manual predecessor search.\n"); MachineBasicBlock *Pred = nullptr; @@ -252,34 +397,46 @@ if (!ML->contains(MBB)) { if (Pred) { LLVM_DEBUG(dbgs() << " - Found multiple out-of-loop preds.\n"); - Start = nullptr; + LoLoop.Start = nullptr; break; } Pred = MBB; - Start = SearchForStart(MBB); + LoLoop.Start = SearchForStart(MBB); } } } // Find the low-overhead loop components and decide whether or not to fall - // back to a normal loop. + // back to a normal loop. Also look for a vctp instruction and decide + // whether we can convert that predicate using tail predication. for (auto *MBB : reverse(ML->getBlocks())) { for (auto &MI : *MBB) { if (MI.getOpcode() == ARM::t2LoopDec) - Dec = &MI; + LoLoop.Dec = &MI; else if (MI.getOpcode() == ARM::t2LoopEnd) - End = &MI; + LoLoop.End = &MI; else if (IsLoopStart(MI)) - Start = &MI; + LoLoop.Start = &MI; + else if (IsVCTP(&MI)) + LoLoop.addVCTP(&MI); else if (MI.getDesc().isCall()) { // TODO: Though the call will require LE to execute again, does this // mean we should revert? Always executing LE hopefully should be // faster than performing a sub,cmp,br or even subs,br. - Revert = true; + LoLoop.Revert = true; LLVM_DEBUG(dbgs() << "ARM Loops: Found call.\n"); + } else { + // Once we've found a vctp, record the users of vpr and check there's + // no more vpr defs. + if (LoLoop.FoundOneVCTP) + LoLoop.ScanForVPR(&MI); + // Check we know how to tail predicate any mve instructions. + LoLoop.CheckTPValidity(&MI); } - if (!Dec || End) + // We need to ensure that LR is not used or defined in between LoopDec and + // LoopEnd.
+ if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert) continue; // If we find that LR has been written or read between LoopDec and @@ -294,61 +451,19 @@ if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() && MO.getReg() == ARM::LR) { LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI); - Revert = true; + LoLoop.Revert = true; break; } } } - - if (Dec && End && Revert) - break; } - LLVM_DEBUG(if (Start) dbgs() << "ARM Loops: Found Loop Start: " << *Start; - if (Dec) dbgs() << "ARM Loops: Found Loop Dec: " << *Dec; - if (End) dbgs() << "ARM Loops: Found Loop End: " << *End;); - - if (!Start && !Dec && !End) { - LLVM_DEBUG(dbgs() << "ARM Loops: Not a low-overhead loop.\n"); - return Changed; - } else if (!(Start && Dec && End)) { - LLVM_DEBUG(dbgs() << "ARM Loops: Failed to find all loop components.\n"); + LLVM_DEBUG(LoLoop.dump()); + if (!LoLoop.FoundAllComponents()) return false; - } - - if (!End->getOperand(1).isMBB()) - report_fatal_error("Expected LoopEnd to target basic block"); - - // TODO Maybe there's cases where the target doesn't have to be the header, - // but for now be safe and revert. - if (End->getOperand(1).getMBB() != ML->getHeader()) { - LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targetting header.\n"); - Revert = true; - } - - // The WLS and LE instructions have 12-bits for the label offset. WLS - // requires a positive offset, while LE uses negative. - if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) || - !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) { - LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n"); - Revert = true; - } - if (Start->getOpcode() == ARM::t2WhileLoopStart && - (BBUtils->getOffsetOf(Start) > - BBUtils->getOffsetOf(Start->getOperand(1).getMBB()) || - !BBUtils->isBBInRange(Start, Start->getOperand(1).getMBB(), 4094))) { - LLVM_DEBUG(dbgs() << "ARM Loops: WLS offset is out-of-range!\n"); - Revert = true; - } - - MachineInstr *InsertPt = Revert ? 
nullptr : IsSafeToDefineLR(Start); - if (!InsertPt) { - LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n"); - Revert = true; - } else - LLVM_DEBUG(dbgs() << "ARM Loops: Start insertion point: " << *InsertPt); - Expand(ML, Start, InsertPt, Dec, End, Revert); + LoLoop.CheckLegality(BBUtils.get()); + Expand(LoLoop); return true; } @@ -438,44 +553,87 @@ MI->eraseFromParent(); } -void ARMLowOverheadLoops::Expand(MachineLoop *ML, MachineInstr *Start, - MachineInstr *InsertPt, - MachineInstr *Dec, MachineInstr *End, - bool Revert) { +MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) { + MachineInstr *InsertPt = LoLoop.InsertPt; + MachineInstr *Start = LoLoop.Start; + MachineBasicBlock *MBB = InsertPt->getParent(); + bool IsDo = Start->getOpcode() == ARM::t2DoLoopStart; + unsigned Opc = 0; + + if (!LoLoop.IsTailPredicationLegal()) + Opc = IsDo ? ARM::t2DLS : ARM::t2WLS; + else { + switch (LoLoop.VCTP->getOpcode()) { + case ARM::MVE_VCTP8: + Opc = IsDo ? ARM::MVE_DLSTP_8 : ARM::MVE_WLSTP_8; + break; + case ARM::MVE_VCTP16: + Opc = IsDo ? ARM::MVE_DLSTP_16 : ARM::MVE_WLSTP_16; + break; + case ARM::MVE_VCTP32: + Opc = IsDo ? ARM::MVE_DLSTP_32 : ARM::MVE_WLSTP_32; + break; + case ARM::MVE_VCTP64: + Opc = IsDo ? ARM::MVE_DLSTP_64 : ARM::MVE_WLSTP_64; + break; + } + } - auto ExpandLoopStart = [this](MachineLoop *ML, MachineInstr *Start, - MachineInstr *InsertPt) { - MachineBasicBlock *MBB = InsertPt->getParent(); - unsigned Opc = Start->getOpcode() == ARM::t2DoLoopStart ? 
- ARM::t2DLS : ARM::t2WLS; - MachineInstrBuilder MIB = - BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); + MachineInstrBuilder MIB = + BuildMI(*MBB, InsertPt, InsertPt->getDebugLoc(), TII->get(Opc)); - MIB.addDef(ARM::LR); - MIB.add(Start->getOperand(0)); - if (Opc == ARM::t2WLS) - MIB.add(Start->getOperand(1)); - - if (InsertPt != Start) - InsertPt->eraseFromParent(); - Start->eraseFromParent(); - LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); - return &*MIB; - }; + MIB.addDef(ARM::LR); + MIB.add(Start->getOperand(0)); + if (!IsDo) + MIB.add(Start->getOperand(1)); + + if (InsertPt != Start) + InsertPt->eraseFromParent(); + Start->eraseFromParent(); + LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB); + return &*MIB; +} + +void ARMLowOverheadLoops::RemoveVPTBlocks(LowOverheadLoop &LoLoop) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP); + LoLoop.VCTP->eraseFromParent(); + + for (auto *MI : LoLoop.VPTUsers) { + if (MI->getOpcode() == ARM::MVE_VPST) { + LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *MI); + MI->eraseFromParent(); + } else { + unsigned OpNum = MI->getNumOperands() - 1; + assert((MI->getOperand(OpNum).isReg() && + MI->getOperand(OpNum).getReg() == ARM::VPR) && + "Expected VPR"); + assert((MI->getOperand(OpNum-1).isImm() && + MI->getOperand(OpNum-1).getImm() == ARMVCC::Then) && + "Expected Then predicate"); + MI->getOperand(OpNum-1).setImm(ARMVCC::None); + MI->getOperand(OpNum).setReg(0); + LLVM_DEBUG(dbgs() << "ARM Loops: Removed predicate from: " << *MI); + } + } +} + +void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) { // Combine the LoopDec and LoopEnd instructions into LE(TP). - auto ExpandLoopEnd = [this](MachineLoop *ML, MachineInstr *Dec, - MachineInstr *End) { + auto ExpandLoopEnd = [this](LowOverheadLoop &LoLoop) { + MachineInstr *End = LoLoop.End; MachineBasicBlock *MBB = End->getParent(); + unsigned Opc = LoLoop.IsTailPredicationLegal() ? 
+ ARM::MVE_LETP : ARM::t2LEUpdate; MachineInstrBuilder MIB = BuildMI(*MBB, End, End->getDebugLoc(), - TII->get(ARM::t2LEUpdate)); + TII->get(Opc)); MIB.addDef(ARM::LR); MIB.add(End->getOperand(0)); MIB.add(End->getOperand(1)); LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB); - End->eraseFromParent(); - Dec->eraseFromParent(); + LoLoop.End->eraseFromParent(); + LoLoop.Dec->eraseFromParent(); return &*MIB; }; @@ -496,18 +654,20 @@ } }; - if (Revert) { - if (Start->getOpcode() == ARM::t2WhileLoopStart) - RevertWhile(Start); + if (LoLoop.Revert) { + if (LoLoop.Start->getOpcode() == ARM::t2WhileLoopStart) + RevertWhile(LoLoop.Start); else - Start->eraseFromParent(); - bool FlagsAlreadySet = RevertLoopDec(Dec, true); - RevertLoopEnd(End, FlagsAlreadySet); + LoLoop.Start->eraseFromParent(); + bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true); + RevertLoopEnd(LoLoop.End, FlagsAlreadySet); } else { - Start = ExpandLoopStart(ML, Start, InsertPt); - RemoveDeadBranch(Start); - End = ExpandLoopEnd(ML, Dec, End); - RemoveDeadBranch(End); + LoLoop.Start = ExpandLoopStart(LoLoop); + RemoveDeadBranch(LoLoop.Start); + LoLoop.End = ExpandLoopEnd(LoLoop); + RemoveDeadBranch(LoLoop.End); + if (LoLoop.IsTailPredicationLegal()) + RemoveVPTBlocks(LoLoop); } } Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/fast-fp-loops.ll @@ -40,21 +40,18 @@ ; CHECK-NEXT: bic r6, r6, #3 ; CHECK-NEXT: subs r6, #4 ; CHECK-NEXT: add.w lr, r12, r6, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB0_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r1] -; CHECK-NEXT: vldrwt.u32 q1, [r2] +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: vmul.f32 
q0, q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: adds r2, #16 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: le lr, .LBB0_5 +; CHECK-NEXT: letp lr, .LBB0_5 ; CHECK-NEXT: b .LBB0_11 ; CHECK-NEXT: .LBB0_6: @ %for.body.preheader.new ; CHECK-NEXT: subs r3, r3, r7 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/mve-tail-data-types.ll @@ -355,18 +355,16 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: vmla.u32 q0, q2, r0 -; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -1132,22 +1130,19 @@ ; CHECK-NEXT: bic r4, r4, #3 ; CHECK-NEXT: subs r4, #4 ; CHECK-NEXT: add.w lr, lr, r4, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB9_5: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: vldrwt.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: vmul.i32 q0, q1, q0 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [r3] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: 
adds r3, #16 ; CHECK-NEXT: sub.w r12, r12, #4 -; CHECK-NEXT: le lr, .LBB9_5 +; CHECK-NEXT: letp lr, .LBB9_5 ; CHECK-NEXT: b .LBB9_11 ; CHECK-NEXT: .LBB9_6: @ %for.body.preheader.new ; CHECK-NEXT: sub.w r7, r12, r5 Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/vector-arith-codegen.ll @@ -15,21 +15,19 @@ ; CHECK-NEXT: sub.w r12, r3, #4 ; CHECK-NEXT: movs r3, #1 ; CHECK-NEXT: add.w lr, r3, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vctp.32 r2 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vldrwt.u32 q0, [r0] -; CHECK-NEXT: vldrwt.u32 q2, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q2, [r1] ; CHECK-NEXT: mov r3, r2 ; CHECK-NEXT: vmul.i32 q0, q2, q0 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: letp lr, .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r3 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -90,18 +88,16 @@ ; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -158,18 +154,16 @@ ; 
CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: subs r1, #4 ; CHECK-NEXT: add.w lr, r3, r1, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: vctp.32 r2 ; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [r0] ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: letp lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %middle.block ; CHECK-NEXT: vctp.32 r1 ; CHECK-NEXT: vpsel q0, q0, q1 @@ -224,19 +218,16 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vmul.i32 q0, q0, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: letp lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -286,19 +277,16 @@ ; CHECK-NEXT: bic r12, r12, #3 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: add.w lr, lr, r12, lsr #2 -; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: dlstp.32 lr, lr ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vctp.32 r3 -; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q0, q0, r2 -; CHECK-NEXT: vpst -; CHECK-NEXT: vstrwt.32 q0, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0] ; CHECK-NEXT: adds r1, #16 ; CHECK-NEXT: adds r0, #16 ; CHECK-NEXT: subs r3, #4 -; CHECK-NEXT: le lr, .LBB4_1 +; 
CHECK-NEXT: letp lr, .LBB4_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} entry: @@ -336,7 +324,138 @@ ret void } +define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i8(i8* noalias nocapture %a, i8* noalias nocapture readonly %b, i8* noalias nocapture readonly %c, i32 %N) { +; CHECK-LABEL: vector_mul_vector_i8: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r4, pc} +; CHECK-NEXT: add.w r12, r3, #15 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #15 +; CHECK-NEXT: sub.w r12, r12, #16 +; CHECK-NEXT: add.w lr, lr, r12, lsr #4 +; CHECK-NEXT: mov.w r12, #0 +; CHECK-NEXT: dlstp.8 lr, lr +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: add.w r4, r1, r12 +; CHECK-NEXT: vldrb.u8 q0, [r4] +; CHECK-NEXT: add.w r4, r2, r12 +; CHECK-NEXT: vldrb.u8 q1, [r4] +; CHECK-NEXT: add.w r4, r0, r12 +; CHECK-NEXT: add.w r12, r12, #16 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmul.i8 q0, q1, q0 +; CHECK-NEXT: vstrb.8 q0, [r4] +; CHECK-NEXT: letp lr, .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} +entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 15 + %n.vec = and i32 %n.rnd.up, -16 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert12 = insertelement <16 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat13 = shufflevector <16 x i32> %broadcast.splatinsert12, <16 x i32> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <16 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <16 x i32> %broadcast.splatinsert, <16 x i32> undef, <16 x i32> zeroinitializer + %induction = add <16 x i32> 
%broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15> + %0 = getelementptr inbounds i8, i8* %b, i32 %index + %1 = icmp ule <16 x i32> %induction, %broadcast.splat13 + %2 = bitcast i8* %0 to <16 x i8>* + %wide.masked.load = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %2, i32 1, <16 x i1> %1, <16 x i8> undef) + %3 = getelementptr inbounds i8, i8* %c, i32 %index + %4 = bitcast i8* %3 to <16 x i8>* + %wide.masked.load14 = call <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>* %4, i32 1, <16 x i1> %1, <16 x i8> undef) + %5 = mul <16 x i8> %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i8, i8* %a, i32 %index + %7 = bitcast i8* %6 to <16 x i8>* + call void @llvm.masked.store.v16i8.p0v16i8(<16 x i8> %5, <16 x i8>* %7, i32 1, <16 x i1> %1) + %index.next = add i32 %index, 16 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +; Function Attrs: nofree norecurse nounwind +define dso_local arm_aapcs_vfpcc void @vector_mul_vector_i16(i16* noalias nocapture %a, i16* noalias nocapture readonly %b, i16* noalias nocapture readonly %c, i32 %N) local_unnamed_addr #0 { +; CHECK-LABEL: vector_mul_vector_i16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: popeq {r7, pc} +; CHECK-NEXT: add.w r12, r3, #7 +; CHECK-NEXT: mov.w lr, #1 +; CHECK-NEXT: bic r12, r12, #7 +; CHECK-NEXT: sub.w r12, r12, #8 +; CHECK-NEXT: add.w lr, lr, r12, lsr #3 +; CHECK-NEXT: dlstp.16 lr, lr +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q0, [r1] +; CHECK-NEXT: vldrh.u16 q1, [r2] +; CHECK-NEXT: vmul.i16 q0, q1, q0 +; CHECK-NEXT: vstrh.16 q0, [r0] +; CHECK-NEXT: adds r1, #16 +; CHECK-NEXT: adds r2, #16 +; CHECK-NEXT: adds r0, #16 +; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: letp lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup +; CHECK-NEXT: pop {r7, pc}
+entry: + %cmp10 = icmp eq i32 %N, 0 + br i1 %cmp10, label %for.cond.cleanup, label %vector.ph + +vector.ph: ; preds = %entry + %n.rnd.up = add i32 %N, 7 + %n.vec = and i32 %n.rnd.up, -8 + %trip.count.minus.1 = add i32 %N, -1 + %broadcast.splatinsert12 = insertelement <8 x i32> undef, i32 %trip.count.minus.1, i32 0 + %broadcast.splat13 = shufflevector <8 x i32> %broadcast.splatinsert12, <8 x i32> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %broadcast.splatinsert = insertelement <8 x i32> undef, i32 %index, i32 0 + %broadcast.splat = shufflevector <8 x i32> %broadcast.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer + %induction = add <8 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> + %0 = getelementptr inbounds i16, i16* %b, i32 %index + %1 = icmp ule <8 x i32> %induction, %broadcast.splat13 + %2 = bitcast i16* %0 to <8 x i16>* + %wide.masked.load = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %2, i32 2, <8 x i1> %1, <8 x i16> undef) + %3 = getelementptr inbounds i16, i16* %c, i32 %index + %4 = bitcast i16* %3 to <8 x i16>* + %wide.masked.load14 = call <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>* %4, i32 2, <8 x i1> %1, <8 x i16> undef) + %5 = mul <8 x i16> %wide.masked.load14, %wide.masked.load + %6 = getelementptr inbounds i16, i16* %a, i32 %index + %7 = bitcast i16* %6 to <8 x i16>* + call void @llvm.masked.store.v8i16.p0v8i16(<8 x i16> %5, <8 x i16>* %7, i32 2, <8 x i1> %1) + %index.next = add i32 %index, 8 + %8 = icmp eq i32 %index.next, %n.vec + br i1 %8, label %for.cond.cleanup, label %vector.body + +for.cond.cleanup: ; preds = %vector.body, %entry + ret void +} + +declare <16 x i8> @llvm.masked.load.v16i8.p0v16i8(<16 x i8>*, i32 immarg, <16 x i1>, <16 x i8>) +declare <8 x i16> @llvm.masked.load.v8i16.p0v8i16(<8 x i16>*, i32 immarg, <8 x i1>, <8 x i16>) declare <4 x i32> @llvm.masked.load.v4i32.p0v4i32(<4 x i32>*, i32
immarg, <4 x i1>, <4 x i32>) -declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) #4 +declare void @llvm.masked.store.v16i8.p0v16i8(<16 x i8>, <16 x i8>*, i32 immarg, <16 x i1>) +declare void @llvm.masked.store.v8i16.p0v8i16(<8 x i16>, <8 x i16>*, i32 immarg, <8 x i1>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32 immarg, <4 x i1>) declare i32 @llvm.experimental.vector.reduce.add.v4i32(<4 x i32>) Index: llvm/unittests/Target/ARM/MachineInstrTest.cpp =================================================================== --- llvm/unittests/Target/ARM/MachineInstrTest.cpp +++ llvm/unittests/Target/ARM/MachineInstrTest.cpp @@ -250,6 +250,9 @@ case MVE_VMUL_qr_i8: case MVE_VMULf16: case MVE_VMULf32: + case MVE_VMULt1i16: + case MVE_VMULt1i8: + case MVE_VMULt1i32: case MVE_VMVN: case MVE_VMVNimmi16: case MVE_VMVNimmi32: