Index: llvm/lib/Target/ARM/ARM.h =================================================================== --- llvm/lib/Target/ARM/ARM.h +++ llvm/lib/Target/ARM/ARM.h @@ -48,6 +48,7 @@ FunctionPass *createThumb2ITBlockPass(); FunctionPass *createMVEVPTBlockPass(); FunctionPass *createMVEVPTOptimisationsPass(); +FunctionPass *createARMRegisterTypePass(); FunctionPass *createARMOptimizeBarriersPass(); FunctionPass *createThumb2SizeReductionPass( std::function Ftor = nullptr); @@ -68,6 +69,7 @@ void initializeThumb2ITBlockPass(PassRegistry &); void initializeMVEVPTBlockPass(PassRegistry &); void initializeMVEVPTOptimisationsPass(PassRegistry &); +void initializeARMRegisterTypePass(PassRegistry &); void initializeARMLowOverheadLoopsPass(PassRegistry &); void initializeMVETailPredicationPass(PassRegistry &); void initializeMVEGatherScatterLoweringPass(PassRegistry &); Index: llvm/lib/Target/ARM/ARMRegisterInfo.td =================================================================== --- llvm/lib/Target/ARM/ARMRegisterInfo.td +++ llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -359,6 +359,17 @@ let DiagnosticString = "operand must be an even-numbered register"; } +// GPRnolr and friends - Versions of the respective register classes without +// LR. Used in low overhead loops to constrain the register classes to not +// spill and reload LR in the loop, forcing it to not be used for anything +// other than the loop iteration count. See the ARMRegisterTypePass. +def GPRnolr : RegisterClass<"ARM", [i32], 32, (sub GPR, LR)>; +def GPRnopclr : RegisterClass<"ARM", [i32], 32, (sub GPR, PC, LR)>; +def rGPRnolr : RegisterClass<"ARM", [i32], 32, (sub rGPR, LR)>; +def tGPREvennolr : RegisterClass<"ARM", [i32], 32, (sub tGPREven, LR)>; +def GPRwithZRnolr : RegisterClass<"ARM", [i32], 32, (sub GPRwithZR, LR)>; +def GPRwithZRnosplr : RegisterClass<"ARM", [i32], 32, (sub GPRwithZRnosp, LR)>; + // Condition code registers. 
def CCR : RegisterClass<"ARM", [i32], 32, (add CPSR)> { let CopyCost = -1; // Don't allow copying of status registers. Index: llvm/lib/Target/ARM/ARMRegisterTypePass.cpp =================================================================== --- /dev/null +++ llvm/lib/Target/ARM/ARMRegisterTypePass.cpp @@ -0,0 +1,136 @@ +//===-- ARMRegisterTypePass.cpp ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This pass adjusts the register classes around Arm 8.1-M Low Overhead +/// Loops in order to prevent any instructions from using LR, which should +/// ideally be used exclusively as the trip count and not spilled and reloaded. +/// +//===----------------------------------------------------------------------===// + +#include "ARM.h" +#include "ARMRegisterInfo.h" +#include "ARMSubtarget.h" +#include "MCTargetDesc/ARMBaseInfo.h" +#include "Thumb2InstrInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineFunction.h" +#include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineLoopInfo.h" +#include "llvm/InitializePasses.h" +#include "llvm/Support/Debug.h" +#include + +using namespace llvm; + +#define DEBUG_TYPE "arm-reg-type" + +namespace { +class ARMRegisterType : public MachineFunctionPass { +public: + static char ID; + + ARMRegisterType() : MachineFunctionPass(ID) {} + + bool runOnMachineFunction(MachineFunction &MF) override; + + StringRef getPassName() const override { return "ARM Register Type Changer"; } + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.addRequired(); + AU.setPreservesAll(); + MachineFunctionPass::getAnalysisUsage(AU); 
+ } +}; + +char ARMRegisterType::ID = 0; + +} // end anonymous namespace + +INITIALIZE_PASS_BEGIN(ARMRegisterType, DEBUG_TYPE, "ARM Register Type Changer", + false, false) +INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo) +INITIALIZE_PASS_END(ARMRegisterType, DEBUG_TYPE, "ARM Register Type Changer", + false, false) + +static bool ModifyLoopsRegisters(MachineLoop *L, MachineRegisterInfo *MRI) { + const TargetRegisterInfo *TRI = MRI->getTargetRegisterInfo(); + + bool Modified = false; + for (MachineLoop *LL : *L) + Modified |= ModifyLoopsRegisters(LL, MRI); + + // If this is a Low overhead loop, for each reg def/use in the loop constrain + // the register class to not include LR. + if (!L->getLoopLatch() || !any_of(*L->getLoopLatch(), [](MachineInstr &MI) { + return MI.getOpcode() == ARM::t2LoopEnd; + })) + return Modified; + + for (auto *MBB : L->getBlocks()) { + for (MachineInstr &MI : *MBB) { + for (MachineOperand &MO : MI.operands()) { + if (!MO.isReg() || !MO.getReg().isVirtual()) + continue; + + Register Reg = MO.getReg(); + const TargetRegisterClass *RC = MRI->getRegClass(Reg); + + auto TryConstrain = [&](const TargetRegisterClass *OrigRC, + const TargetRegisterClass *ConstrainedRC) { + if (RC == OrigRC) { + LLVM_DEBUG(dbgs() << " Constraining Reg " << Reg.virtRegIndex() + << " to regclass " + << TRI->getRegClassName(ConstrainedRC) << "\n"); + MRI->constrainRegClass(Reg, ConstrainedRC); + RC = ConstrainedRC; + Modified = true; + } + }; + + TryConstrain(&ARM::GPRRegClass, &ARM::GPRnolrRegClass); + TryConstrain(&ARM::GPRnopcRegClass, &ARM::GPRnopclrRegClass); + TryConstrain(&ARM::rGPRRegClass, &ARM::rGPRnolrRegClass); + TryConstrain(&ARM::tGPREvenRegClass, &ARM::tGPREvennolrRegClass); + TryConstrain(&ARM::GPRwithZRRegClass, &ARM::GPRwithZRnolrRegClass); + TryConstrain(&ARM::GPRwithZRnospRegClass, + &ARM::GPRwithZRnosplrRegClass); + +#ifndef NDEBUG + if (RC != &ARM::GPRlrRegClass && RC->contains(ARM::LR)) + LLVM_DEBUG(dbgs() << " Unhandled regclass of " << MI << 
"\n"); +#endif + } + } + } + + return Modified; +} + +bool ARMRegisterType::runOnMachineFunction(MachineFunction &MF) { + const ARMSubtarget &STI = + static_cast(MF.getSubtarget()); + + if (!STI.hasLOB()) + return false; + + MachineRegisterInfo *MRI = &MF.getRegInfo(); + MachineLoopInfo *Loops = &getAnalysis(); + + bool Modified = false; + + for (MachineLoop *L : *Loops) + Modified |= ModifyLoopsRegisters(L, MRI); + + return Modified; +} + +FunctionPass *llvm::createARMRegisterTypePass() { + return new ARMRegisterType(); +} Index: llvm/lib/Target/ARM/ARMTargetMachine.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetMachine.cpp +++ llvm/lib/Target/ARM/ARMTargetMachine.cpp @@ -97,6 +97,7 @@ initializeThumb2SizeReducePass(Registry); initializeMVEVPTBlockPass(Registry); initializeMVEVPTOptimisationsPass(Registry); + initializeARMRegisterTypePass(Registry); initializeMVETailPredicationPass(Registry); initializeARMLowOverheadLoopsPass(Registry); initializeMVEGatherScatterLoweringPass(Registry); @@ -364,6 +365,7 @@ void addPreSched2() override; void addPreEmitPass() override; void addPreEmitPass2() override; + bool addRegAssignmentOptimized() override; std::unique_ptr getCSEConfig() const override; }; @@ -500,6 +502,12 @@ } } +bool ARMPassConfig::addRegAssignmentOptimized() { + if (getOptLevel() != CodeGenOpt::None) + addPass(createARMRegisterTypePass()); + return TargetPassConfig::addRegAssignmentOptimized(); +} + void ARMPassConfig::addPreSched2() { if (getOptLevel() != CodeGenOpt::None) { if (EnableARMLoadStoreOpt) Index: llvm/lib/Target/ARM/CMakeLists.txt =================================================================== --- llvm/lib/Target/ARM/CMakeLists.txt +++ llvm/lib/Target/ARM/CMakeLists.txt @@ -55,6 +55,7 @@ MVETailPredication.cpp MVEVPTBlockPass.cpp MVEVPTOptimisationsPass.cpp + ARMRegisterTypePass.cpp Thumb1FrameLowering.cpp Thumb1InstrInfo.cpp ThumbRegisterInfo.cpp Index: 
llvm/test/CodeGen/ARM/O3-pipeline.ll =================================================================== --- llvm/test/CodeGen/ARM/O3-pipeline.ll +++ llvm/test/CodeGen/ARM/O3-pipeline.ll @@ -112,6 +112,7 @@ ; CHECK-NEXT: Simple Register Coalescing ; CHECK-NEXT: Rename Disconnected Subregister Components ; CHECK-NEXT: Machine Instruction Scheduler +; CHECK-NEXT: ARM Register Type Changer ; CHECK-NEXT: Machine Block Frequency Analysis ; CHECK-NEXT: Debug Variable Analysis ; CHECK-NEXT: Live Stack Slot Analysis Index: llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll =================================================================== --- llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll +++ llvm/test/CodeGen/Thumb2/LowOverheadLoops/regalloc.ll @@ -15,91 +15,87 @@ ; CHECK-NEXT: cmp r7, #3 ; CHECK-NEXT: bhs .LBB0_4 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: bne .LBB0_7 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_3: -; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: movs r4, #0 ; CHECK-NEXT: b .LBB0_10 ; CHECK-NEXT: .LBB0_4: @ %while.body.preheader.new ; CHECK-NEXT: bic r7, r9, #3 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: adds r4, r0, #2 -; CHECK-NEXT: add.w r7, r6, r7, lsr #2 -; CHECK-NEXT: movs r6, #0 -; CHECK-NEXT: mov r10, r7 +; CHECK-NEXT: movs r4, #0 +; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: adds r6, r0, #2 ; CHECK-NEXT: movs r7, #0 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB0_5: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrsb lr, [r4, #-1] +; CHECK-NEXT: ldrsb r10, [r6, #-1] ; CHECK-NEXT: sxtb.w r5, r12 -; CHECK-NEXT: cmp r5, lr +; CHECK-NEXT: cmp r5, r10 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r12, lr -; CHECK-NEXT: ldrsb.w r5, [r4] -; CHECK-NEXT: sxtb.w lr, r12 -; CHECK-NEXT: csinc r7, r7, r6, ge -; CHECK-NEXT: cmp lr, r5 +; CHECK-NEXT: movlt r12, r10 +; CHECK-NEXT: ldrsb.w r5, [r6] +; CHECK-NEXT: sxtb.w r10, r12 
+; CHECK-NEXT: csinc r4, r4, r7, ge +; CHECK-NEXT: cmp r10, r5 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r12, r5 -; CHECK-NEXT: ldrsb.w r5, [r4, #1] -; CHECK-NEXT: sxtb.w lr, r12 +; CHECK-NEXT: ldrsb.w r5, [r6, #1] ; CHECK-NEXT: it lt -; CHECK-NEXT: addlt r7, r6, #2 -; CHECK-NEXT: cmp lr, r5 +; CHECK-NEXT: addlt r4, r7, #2 +; CHECK-NEXT: sxtb.w r10, r12 +; CHECK-NEXT: cmp r10, r5 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r12, r5 -; CHECK-NEXT: ldrsb.w r5, [r4, #2] -; CHECK-NEXT: sxtb.w lr, r12 +; CHECK-NEXT: ldrsb.w r5, [r6, #2] ; CHECK-NEXT: it lt -; CHECK-NEXT: addlt r7, r6, #3 +; CHECK-NEXT: addlt r4, r7, #3 +; CHECK-NEXT: adds r7, #4 ; CHECK-NEXT: adds r6, #4 -; CHECK-NEXT: cmp lr, r5 -; CHECK-NEXT: mov lr, r10 +; CHECK-NEXT: sxtb.w r10, r12 +; CHECK-NEXT: cmp r10, r5 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r12, r5 -; CHECK-NEXT: movlt r7, r6 -; CHECK-NEXT: adds r4, #4 -; CHECK-NEXT: subs.w lr, lr, #1 -; CHECK-NEXT: mov r10, lr -; CHECK-NEXT: bne .LBB0_5 -; CHECK-NEXT: b .LBB0_6 -; CHECK-NEXT: .LBB0_6: @ %while.end.loopexit.unr-lcssa.loopexit -; CHECK-NEXT: add r0, r6 -; CHECK-NEXT: sub.w r9, r9, r6 +; CHECK-NEXT: movlt r4, r7 +; CHECK-NEXT: le lr, .LBB0_5 +; CHECK-NEXT: @ %bb.6: @ %while.end.loopexit.unr-lcssa.loopexit +; CHECK-NEXT: add r0, r7 +; CHECK-NEXT: sub.w r9, r9, r7 ; CHECK-NEXT: cmp.w r8, #0 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: .LBB0_7: @ %while.body.epil -; CHECK-NEXT: ldrsb.w r4, [r0, #1] -; CHECK-NEXT: sxtb.w r5, r12 +; CHECK-NEXT: ldrsb.w r7, [r0, #1] +; CHECK-NEXT: sxtb.w r6, r12 ; CHECK-NEXT: sub.w r1, r1, r9 -; CHECK-NEXT: cmp r5, r4 +; CHECK-NEXT: cmp r6, r7 ; CHECK-NEXT: itt lt -; CHECK-NEXT: movlt r12, r4 -; CHECK-NEXT: movlt r7, r1 +; CHECK-NEXT: movlt r12, r7 +; CHECK-NEXT: movlt r4, r1 ; CHECK-NEXT: cmp.w r8, #1 ; CHECK-NEXT: sxtb.w r12, r12 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.8: @ %while.body.epil.1 -; CHECK-NEXT: ldrsb.w r4, [r0, #2] -; CHECK-NEXT: cmp r12, r4 +; CHECK-NEXT: ldrsb.w r7, [r0, #2] +; CHECK-NEXT: cmp r12, r7 
; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r12, r4 -; CHECK-NEXT: csinc r7, r7, r1, ge +; CHECK-NEXT: movlt r12, r7 +; CHECK-NEXT: csinc r4, r4, r1, ge ; CHECK-NEXT: cmp.w r8, #2 ; CHECK-NEXT: beq .LBB0_10 ; CHECK-NEXT: @ %bb.9: @ %while.body.epil.2 ; CHECK-NEXT: ldrsb.w r0, [r0, #3] -; CHECK-NEXT: sxtb.w r4, r12 -; CHECK-NEXT: cmp r4, r0 +; CHECK-NEXT: sxtb.w r7, r12 +; CHECK-NEXT: cmp r7, r0 ; CHECK-NEXT: itt lt ; CHECK-NEXT: movlt r12, r0 -; CHECK-NEXT: addlt r7, r1, #2 +; CHECK-NEXT: addlt r4, r1, #2 ; CHECK-NEXT: .LBB0_10: @ %while.end ; CHECK-NEXT: strb.w r12, [r2] -; CHECK-NEXT: str r7, [r3] +; CHECK-NEXT: str r4, [r3] ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} entry: %0 = load i8, i8* %pSrc, align 1 Index: llvm/test/CodeGen/Thumb2/high-reg-spill.mir =================================================================== --- llvm/test/CodeGen/Thumb2/high-reg-spill.mir +++ llvm/test/CodeGen/Thumb2/high-reg-spill.mir @@ -41,11 +41,11 @@ ; CHECK: renamable $r12 = COPY killed renamable $r0 ; CHECK: t2STRi12 killed $r12, %stack.1, 0, 14 /* CC::al */, $noreg :: (store 4 into %stack.1) ; CHECK: $r8 = t2LDRi12 %stack.1, 0, 14 /* CC::al */, $noreg :: (load 4 from %stack.1) - ; CHECK: INLINEASM &"@ $0", 1 /* sideeffect attdialect */, 589833 /* reguse:GPRnopc */, renamable $r8, 12 /* clobber */, implicit-def early-clobber $r12 + ; CHECK: INLINEASM &"@ $0", 1 /* sideeffect attdialect */, 655369 /* reguse:GPRnopc */, renamable $r8, 12 /* clobber */, implicit-def early-clobber $r12 ; CHECK: tBX_RET 14 /* CC::al */, $noreg %1:tgpr = tLDRspi %stack.0.i, 0, 14, $noreg :: (dereferenceable load 4 from %ir.i) %0:hgpr = COPY %1 - INLINEASM &"@ $0", 1, 589833, %0, 12, implicit-def early-clobber $r12 + INLINEASM &"@ $0", 1, 655369, %0, 12, implicit-def early-clobber $r12 tBX_RET 14, $noreg ... 
Index: llvm/test/CodeGen/Thumb2/mve-gather-increment.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -574,13 +574,13 @@ ; CHECK-NEXT: vmov.i16 q1, #0x8 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r6, r3, lsr #3 -; CHECK-NEXT: adr r3, .LCPI11_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: adr r6, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 ; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r8 ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 @@ -604,7 +604,7 @@ ; CHECK-NEXT: vmov r7, s16 ; CHECK-NEXT: vmov.32 q3[2], r5 ; CHECK-NEXT: vmov.u16 r5, q2[3] -; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov r3, s17 ; CHECK-NEXT: vmov.32 q3[3], r5 ; CHECK-NEXT: vadd.i16 q2, q2, q1 ; CHECK-NEXT: vmovlb.s16 q3, q3 @@ -615,7 +615,7 @@ ; CHECK-NEXT: vmov r12, s13 ; CHECK-NEXT: ldrh.w r11, [r7] ; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh.w r9, [r5] ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: ldrh.w r10, [r6] @@ -628,11 +628,11 @@ ; CHECK-NEXT: vmov.16 q3[3], r9 ; CHECK-NEXT: vmov.16 q3[4], r11 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: vmov.16 q3[5], r3 ; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmov.16 q3[6], r5 ; CHECK-NEXT: vmov.16 q3[7], r6 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vstrb.8 q3, [r4], #16 ; CHECK-NEXT: le lr, .LBB11_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 @@ -722,22 +722,22 @@ ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 ; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: ldr r3, [sp, #60] @ 
4-byte Reload +; CHECK-NEXT: ldr r4, [sp, #60] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov.u16 r7, q7[4] -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u16 r4, q5[1] -; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov.u16 r4, q5[2] -; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vmov.u16 r4, q5[3] -; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r3 ; CHECK-NEXT: vmov.u16 r12, q6[0] ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmov.32 q1[0], r12 @@ -745,7 +745,7 @@ ; CHECK-NEXT: vmov.u16 r1, q6[1] ; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: vmov.32 q1[1], r1 -; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov r3, s10 ; CHECK-NEXT: vmov.u16 r1, q6[2] ; CHECK-NEXT: vmov.32 q1[2], r1 ; CHECK-NEXT: vmov.u16 r1, q6[3] @@ -755,26 +755,26 @@ ; CHECK-NEXT: vmov r6, s11 ; CHECK-NEXT: vshl.i32 q1, q1, #1 ; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: ldrh.w r9, [r4] -; CHECK-NEXT: vmov.u16 r4, q5[4] -; CHECK-NEXT: vmov.32 q0[0], r4 -; CHECK-NEXT: vmov.u16 r4, q5[5] -; CHECK-NEXT: vmov.32 q0[1], r4 -; CHECK-NEXT: vmov.u16 r4, q5[6] -; CHECK-NEXT: vmov.32 q0[2], r4 -; CHECK-NEXT: vmov.u16 r4, q5[7] -; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: ldrh.w r9, [r3] +; CHECK-NEXT: vmov.u16 r3, q5[4] +; CHECK-NEXT: vmov.32 q0[0], r3 +; CHECK-NEXT: vmov.u16 r3, q5[5] +; CHECK-NEXT: vmov.32 q0[1], r3 +; CHECK-NEXT: vmov.u16 r3, q5[6] +; CHECK-NEXT: vmov.32 q0[2], r3 +; CHECK-NEXT: vmov.u16 r3, q5[7] +; CHECK-NEXT: 
vmov.32 q0[3], r3 ; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vshl.i32 q0, q0, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: ldrh.w r10, [r4] -; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: vmov r3, s1 ; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh.w r11, [r4] -; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: ldrh.w r11, [r3] +; CHECK-NEXT: vmov r3, s2 ; CHECK-NEXT: vmov.32 q0[0], r7 ; CHECK-NEXT: vmov.u16 r7, q7[5] ; CHECK-NEXT: vmov.32 q0[1], r7 @@ -809,7 +809,7 @@ ; CHECK-NEXT: vshl.i32 q3, q3, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q1[0], r1 @@ -821,7 +821,7 @@ ; CHECK-NEXT: vmov.16 q1[3], r6 ; CHECK-NEXT: vmov.16 q1[4], r10 ; CHECK-NEXT: vmov.16 q1[5], r11 -; CHECK-NEXT: vmov.16 q1[6], r4 +; CHECK-NEXT: vmov.16 q1[6], r3 ; CHECK-NEXT: vmov.16 q1[7], r5 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q2[0], r1 @@ -875,7 +875,7 @@ ; CHECK-NEXT: vmov.16 q0[7], r1 ; CHECK-NEXT: vadd.i16 q0, q0, q2 ; CHECK-NEXT: vadd.i16 q0, q0, q1 -; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: vstrb.8 q0, [r4], #16 ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 Index: llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll +++ llvm/test/CodeGen/Thumb2/mve-postinc-lsr.ll @@ -1059,78 +1059,84 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #56 -; CHECK-NEXT: sub sp, #56 +; CHECK-NEXT: .pad #72 +; CHECK-NEXT: sub sp, #72 ; CHECK-NEXT: cmp r2, #8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill 
-; CHECK-NEXT: vstr s0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: vstr s0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: mov r1, r2 -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB7_9 ; CHECK-NEXT: @ %bb.1: -; CHECK-NEXT: ldr r2, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: mov.w r10, #0 -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: lsrs r1, r2, #2 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: lsrs r2, r1, #2 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: str r1, [sp, #48] @ 4-byte Spill ; CHECK-NEXT: b .LBB7_3 ; CHECK-NEXT: .LBB7_2: @ in Loop: Header=BB7_3 Depth=1 -; CHECK-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r10, #1 +; CHECK-NEXT: ldr r7, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: lsls r3, r3, #2 -; CHECK-NEXT: cmp r2, #7 -; CHECK-NEXT: asr.w r1, r2, #2 +; CHECK-NEXT: ldr r1, [sp, #48] @ 4-byte Reload +; CHECK-NEXT: cmp r7, #7 +; CHECK-NEXT: asr.w r2, r7, #2 +; CHECK-NEXT: add.w r1, r1, #1 +; CHECK-NEXT: str r1, [sp, #48] @ 4-byte Spill ; CHECK-NEXT: ble .LBB7_9 ; CHECK-NEXT: .LBB7_3: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_6 Depth 2 ; CHECK-NEXT: @ Child Loop BB7_7 Depth 3 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: cmp r3, #1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: lsr.w r2, r1, #2 -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: ldr r2, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: lsr.w r1, r2, #2 +; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill ; CHECK-NEXT: blt .LBB7_2 ; CHECK-NEXT: @ %bb.4: @ in Loop: Header=BB7_3 Depth=1 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: cmp.w r2, r1, lsr #3 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: cmp.w r1, r2, lsr #3 ; CHECK-NEXT: beq .LBB7_2 ; CHECK-NEXT: @ %bb.5: @ %.preheader ; 
CHECK-NEXT: @ in Loop: Header=BB7_3 Depth=1 -; CHECK-NEXT: lsrs r2, r1, #3 +; CHECK-NEXT: lsrs r1, r2, #3 +; CHECK-NEXT: mov.w r11, #0 +; CHECK-NEXT: str r1, [sp, #40] @ 4-byte Spill +; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: str r3, [sp, #44] @ 4-byte Spill ; CHECK-NEXT: lsls r1, r1, #1 -; CHECK-NEXT: str r2, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: lsl.w r11, r2, #1 +; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-NEXT: lsls r1, r2, #1 +; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill ; CHECK-NEXT: .LBB7_6: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB7_7 Depth 3 -; CHECK-NEXT: add.w r12, r0, #16 -; CHECK-NEXT: ldr r4, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: ldr.w lr, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldm.w r12, {r1, r2, r3, r12} -; CHECK-NEXT: muls r4, r5, r4 -; CHECK-NEXT: ldr.w r2, [r2, r10, lsl #2] -; CHECK-NEXT: ldr.w r1, [r1, r10, lsl #2] -; CHECK-NEXT: ldrd r6, r7, [r0, #32] -; CHECK-NEXT: ldr.w r3, [r3, r10, lsl #2] +; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: ldrd r6, r12, [r0, #24] +; CHECK-NEXT: ldr r5, [sp, #48] @ 4-byte Reload +; CHECK-NEXT: ldrd lr, r3, [r0, #16] +; CHECK-NEXT: mul r1, r1, r11 +; CHECK-NEXT: ldrd r4, r7, [r0, #32] +; CHECK-NEXT: ldr.w r6, [r6, r5, lsl #2] +; CHECK-NEXT: ldr.w r2, [r3, r5, lsl #2] +; CHECK-NEXT: ldr.w r5, [lr, r5, lsl #2] +; CHECK-NEXT: ldr.w lr, [sp, #40] @ 4-byte Reload +; CHECK-NEXT: add.w r3, r7, r6, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: add.w r6, r6, r2, lsl #2 -; CHECK-NEXT: add.w r12, r12, r1, lsl #2 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r2, r1, r4, lsl #2 -; CHECK-NEXT: add.w r3, r7, r3, lsl #2 -; CHECK-NEXT: add.w r1, r2, r11, lsl #2 -; CHECK-NEXT: add.w r8, r1, r11, lsl #2 -; CHECK-NEXT: 
add.w r9, r8, r11, lsl #2 +; CHECK-NEXT: add.w r6, r4, r2, lsl #2 +; CHECK-NEXT: ldr r2, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: add.w r12, r12, r5, lsl #2 +; CHECK-NEXT: add.w r2, r2, r1, lsl #2 +; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: add.w r8, r2, r1, lsl #2 +; CHECK-NEXT: add.w r9, r8, r1, lsl #2 +; CHECK-NEXT: add.w r10, r9, r1, lsl #2 ; CHECK-NEXT: .LBB7_7: @ Parent Loop BB7_3 Depth=1 ; CHECK-NEXT: @ Parent Loop BB7_6 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vldrw.u32 q3, [r9] -; CHECK-NEXT: vldrw.u32 q4, [r1] -; CHECK-NEXT: vldrw.u32 q6, [r8] +; CHECK-NEXT: vldrw.u32 q3, [r10] +; CHECK-NEXT: vldrw.u32 q4, [r8] +; CHECK-NEXT: vldrw.u32 q6, [r9] ; CHECK-NEXT: vldrw.u32 q7, [r2] ; CHECK-NEXT: vsub.f32 q5, q4, q3 ; CHECK-NEXT: vsub.f32 q0, q7, q6 @@ -1144,34 +1150,34 @@ ; CHECK-NEXT: vldrw.u32 q0, [r6], #16 ; CHECK-NEXT: vcmul.f32 q3, q0, q4, #0 ; CHECK-NEXT: vcmla.f32 q3, q0, q4, #90 -; CHECK-NEXT: vstrb.8 q3, [r1], #16 +; CHECK-NEXT: vstrb.8 q3, [r8], #16 ; CHECK-NEXT: vldrw.u32 q0, [r12], #16 ; CHECK-NEXT: vcmul.f32 q3, q0, q2, #0 ; CHECK-NEXT: vcmla.f32 q3, q0, q2, #90 -; CHECK-NEXT: vstrb.8 q3, [r8], #16 +; CHECK-NEXT: vstrb.8 q3, [r9], #16 ; CHECK-NEXT: vldrw.u32 q0, [r3], #16 ; CHECK-NEXT: vcmul.f32 q2, q0, q1, #0 ; CHECK-NEXT: vcmla.f32 q2, q0, q1, #90 -; CHECK-NEXT: vstrb.8 q2, [r9], #16 +; CHECK-NEXT: vstrb.8 q2, [r10], #16 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: @ %bb.8: @ in Loop: Header=BB7_6 Depth=2 -; CHECK-NEXT: ldr r3, [sp, #32] @ 4-byte Reload -; CHECK-NEXT: adds r5, #1 -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-NEXT: add.w r11, r11, #1 +; CHECK-NEXT: cmp r11, r3 ; CHECK-NEXT: bne .LBB7_6 ; CHECK-NEXT: b .LBB7_2 ; CHECK-NEXT: .LBB7_9: ; CHECK-NEXT: adr r0, .LCPI7_0 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload ; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: 
vldrw.u32 q2, [q1, #64]! -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: lsr.w lr, r0, #3 ; CHECK-NEXT: wls lr, lr, .LBB7_12 ; CHECK-NEXT: @ %bb.10: ; CHECK-NEXT: vldrw.u32 q3, [q1, #16] -; CHECK-NEXT: vldr s0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vldr s0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov r0, s0 ; CHECK-NEXT: .LBB7_11: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q0, [q1, #24] @@ -1182,14 +1188,14 @@ ; CHECK-NEXT: vsub.f32 q0, q4, q0 ; CHECK-NEXT: vsub.f32 q7, q6, q5 ; CHECK-NEXT: vcadd.f32 q4, q2, q0, #270 -; CHECK-NEXT: vstrw.32 q7, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q7, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vcadd.f32 q7, q2, q0, #90 ; CHECK-NEXT: vadd.f32 q0, q6, q5 ; CHECK-NEXT: vldrw.u32 q2, [q1, #64]! ; CHECK-NEXT: vmul.f32 q0, q0, r0 ; CHECK-NEXT: vldrw.u32 q3, [q1, #16] ; CHECK-NEXT: vstrw.32 q0, [q1, #-64] -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vmul.f32 q0, q4, r0 ; CHECK-NEXT: vmul.f32 q4, q7, r0 ; CHECK-NEXT: vmul.f32 q5, q5, r0 @@ -1198,7 +1204,7 @@ ; CHECK-NEXT: vstrw.32 q0, [q1, #-40] ; CHECK-NEXT: le lr, .LBB7_11 ; CHECK-NEXT: .LBB7_12: -; CHECK-NEXT: add sp, #56 +; CHECK-NEXT: add sp, #72 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} Index: llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll +++ llvm/test/CodeGen/Thumb2/mve-satmul-loops.ll @@ -27,44 +27,44 @@ ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: bic r3, r3, #1 ; CHECK-NEXT: subs r7, r3, #2 -; CHECK-NEXT: adr r4, .LCPI0_0 +; CHECK-NEXT: adr r5, .LCPI0_0 ; CHECK-NEXT: movs r6, #1 ; CHECK-NEXT: add.w r11, r2, r3, lsl #2 ; CHECK-NEXT: add.w lr, r6, r7, lsr #1 ; CHECK-NEXT: add.w r9, 
r1, r3, lsl #2 ; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: vldrw.u32 q0, [r5] ; CHECK-NEXT: vmvn.i32 q1, #0x80000000 ; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: str r3, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB0_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r5, [r0] +; CHECK-NEXT: ldrd r5, r4, [r0] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldrd r7, r6, [r1] +; CHECK-NEXT: ldrd r6, r3, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: smull r8, r5, r6, r5 -; CHECK-NEXT: smull r4, r7, r7, r4 -; CHECK-NEXT: asrl r8, r5, #31 -; CHECK-NEXT: asrl r4, r7, #31 -; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 -; CHECK-NEXT: vmov.32 q4[0], r4 -; CHECK-NEXT: sbcs.w r3, r10, r7 +; CHECK-NEXT: smull r6, r7, r6, r5 +; CHECK-NEXT: asrl r6, r7, #31 +; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 +; CHECK-NEXT: vmov.32 q4[0], r6 +; CHECK-NEXT: sbcs.w r5, r10, r7 ; CHECK-NEXT: vmov.32 q4[1], r7 -; CHECK-NEXT: mov.w r3, #0 -; CHECK-NEXT: vmov.32 q4[2], r8 +; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r3, #1 -; CHECK-NEXT: cmp r3, #0 -; CHECK-NEXT: csetm r3, ne -; CHECK-NEXT: vmov.32 q4[3], r5 -; CHECK-NEXT: vmov.32 q2[0], r3 -; CHECK-NEXT: vmov.32 q2[1], r3 +; CHECK-NEXT: movlt r5, #1 +; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: csetm r5, ne +; CHECK-NEXT: vmov.32 q2[0], r5 +; CHECK-NEXT: vmov.32 q2[1], r5 +; CHECK-NEXT: smull r8, r5, r3, r4 +; CHECK-NEXT: asrl r8, r5, #31 ; CHECK-NEXT: rsbs.w r3, r8, #-2147483648 +; CHECK-NEXT: vmov.32 q4[2], r8 ; CHECK-NEXT: sbcs.w r3, r10, r5 -; CHECK-NEXT: mvn r5, #-2147483648 +; CHECK-NEXT: vmov.32 q4[3], r5 ; CHECK-NEXT: mov.w r3, #0 +; CHECK-NEXT: mvn r5, #-2147483648 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r3, #1 ; CHECK-NEXT: cmp r3, #0 @@ -117,22 +117,22 @@ ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldr r3, [r12], #4 ; CHECK-NEXT: ldr r4, [r9], #4 -; CHECK-NEXT: smull r4, r3, r4, r3 -; CHECK-NEXT: asrl r4, 
r3, #31 -; CHECK-NEXT: subs r5, r1, r4 -; CHECK-NEXT: sbcs.w r5, r0, r3 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: smull r6, r3, r4, r3 +; CHECK-NEXT: asrl r6, r3, #31 +; CHECK-NEXT: subs r4, r1, r6 +; CHECK-NEXT: sbcs.w r4, r0, r3 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r3, r0 -; CHECK-NEXT: moveq r4, r1 -; CHECK-NEXT: subs r5, r4, r2 +; CHECK-NEXT: moveq r6, r1 +; CHECK-NEXT: subs r4, r6, r2 ; CHECK-NEXT: sbcs r3, r3, #0 ; CHECK-NEXT: it ge -; CHECK-NEXT: movge r4, r2 -; CHECK-NEXT: str r4, [r11], #4 +; CHECK-NEXT: movge r6, r2 +; CHECK-NEXT: str r6, [r11], #4 ; CHECK-NEXT: le lr, .LBB0_7 ; CHECK-NEXT: .LBB0_8: @ %for.cond.cleanup ; CHECK-NEXT: add sp, #8 @@ -236,37 +236,35 @@ ; CHECK-NEXT: cmp r3, #0 ; CHECK-NEXT: beq.w .LBB1_8 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: mov r8, r2 -; CHECK-NEXT: movs r2, #0 +; CHECK-NEXT: mov r9, r1 +; CHECK-NEXT: movs r7, #0 ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB1_3 ; CHECK-NEXT: @ %bb.2: ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r10, r1 -; CHECK-NEXT: mov r11, r8 +; CHECK-NEXT: mov r1, r9 +; CHECK-NEXT: mov r11, r2 ; CHECK-NEXT: b .LBB1_6 ; CHECK-NEXT: .LBB1_3: @ %vector.ph +; CHECK-NEXT: bic r7, r3, #3 +; CHECK-NEXT: adr r4, .LCPI1_0 +; CHECK-NEXT: subs r1, r7, #4 +; CHECK-NEXT: vldrw.u32 q0, [r4] +; CHECK-NEXT: adr r4, .LCPI1_1 ; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: subs r2, r3, #4 -; CHECK-NEXT: movs r7, #1 -; CHECK-NEXT: add.w r11, r8, r3, lsl #2 -; CHECK-NEXT: add.w r10, r1, r3, lsl #2 -; CHECK-NEXT: add.w lr, r7, r2, lsr #2 -; CHECK-NEXT: adr r7, .LCPI1_0 -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: adr r7, .LCPI1_1 -; CHECK-NEXT: add.w r12, r0, r3, lsl #2 -; CHECK-NEXT: vldrw.u32 q1, [r7] +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: add.w lr, r3, r1, lsr #2 +; CHECK-NEXT: add.w r11, 
r2, r7, lsl #2 +; CHECK-NEXT: add.w r1, r9, r7, lsl #2 +; CHECK-NEXT: add.w r12, r0, r7, lsl #2 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: mov.w r10, #-1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: str r3, [sp] @ 4-byte Spill -; CHECK-NEXT: mov.w r3, #-1 -; CHECK-NEXT: mvn r9, #-2147483648 +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vldrw.u32 q3, [r1], #16 -; CHECK-NEXT: mov r2, lr +; CHECK-NEXT: vldrw.u32 q3, [r9], #16 ; CHECK-NEXT: vmov.f32 s16, s10 ; CHECK-NEXT: vmov.f32 s20, s14 ; CHECK-NEXT: vmov.f32 s18, s11 @@ -276,151 +274,151 @@ ; CHECK-NEXT: vmov r7, s25 ; CHECK-NEXT: vmov r6, s24 ; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: vmov lr, s26 -; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 -; CHECK-NEXT: vmov.f32 s14, s13 -; CHECK-NEXT: sbcs.w r5, r3, r7 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q4[0], r5 -; CHECK-NEXT: vmov.32 q4[1], r5 ; CHECK-NEXT: vmov r5, s27 -; CHECK-NEXT: asrl lr, r5, #31 +; CHECK-NEXT: rsbs.w r4, r6, #-2147483648 +; CHECK-NEXT: vmov r8, s26 +; CHECK-NEXT: sbcs.w r4, r10, r7 +; CHECK-NEXT: asrl r8, r5, #31 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: vmov.32 q6[0], r6 -; CHECK-NEXT: rsbs.w r4, lr, #-2147483648 +; CHECK-NEXT: it lt +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne ; CHECK-NEXT: vmov.32 q6[1], r7 -; CHECK-NEXT: sbcs.w r4, r3, r5 -; CHECK-NEXT: vmov.32 q6[2], lr -; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: vmov.32 q4[0], r4 +; CHECK-NEXT: vmov.32 q6[2], r8 +; CHECK-NEXT: vmov.32 q4[1], r4 +; CHECK-NEXT: rsbs.w r4, r8, #-2147483648 +; CHECK-NEXT: sbcs.w r4, r10, r5 ; CHECK-NEXT: vmov.32 q6[3], r5 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: mvn r8, #-2147483648 ; CHECK-NEXT: it lt ; CHECK-NEXT: movlt r4, #1 ; CHECK-NEXT: cmp r4, #0 ; CHECK-NEXT: csetm 
r4, ne -; CHECK-NEXT: mov lr, r2 +; CHECK-NEXT: vmov.f32 s14, s13 ; CHECK-NEXT: vmov.32 q4[2], r4 ; CHECK-NEXT: vmov.32 q4[3], r4 -; CHECK-NEXT: vmov r4, s14 ; CHECK-NEXT: vbic q5, q0, q4 ; CHECK-NEXT: vand q4, q6, q4 ; CHECK-NEXT: vorr q4, q4, q5 -; CHECK-NEXT: vmov r6, s16 -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: subs.w r6, r6, r9 -; CHECK-NEXT: sbcs r7, r7, #0 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: vmov r5, s16 +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q5[0], r7 -; CHECK-NEXT: vmov.32 q5[1], r7 -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: subs.w r6, r6, r9 -; CHECK-NEXT: vmov r6, s12 -; CHECK-NEXT: sbcs r7, r7, #0 -; CHECK-NEXT: mov.w r7, #0 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s19 +; CHECK-NEXT: subs.w r5, r5, r8 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: sbcs r4, r4, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r7, #1 -; CHECK-NEXT: cmp r7, #0 -; CHECK-NEXT: csetm r7, ne -; CHECK-NEXT: vmov.32 q5[2], r7 -; CHECK-NEXT: vmov r7, s8 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[2], r4 +; CHECK-NEXT: vmov r4, s8 ; CHECK-NEXT: vbic q6, q1, q5 ; CHECK-NEXT: vand q4, q4, q5 ; CHECK-NEXT: vorr q4, q4, q6 -; CHECK-NEXT: smull r6, r7, r6, r7 +; CHECK-NEXT: smull r6, r7, r5, r4 +; CHECK-NEXT: vmov r5, s14 ; CHECK-NEXT: asrl r6, r7, #31 -; CHECK-NEXT: rsbs.w r5, r6, #-2147483648 +; CHECK-NEXT: rsbs.w r4, r6, #-2147483648 ; CHECK-NEXT: vmov.32 q3[0], r6 -; CHECK-NEXT: sbcs.w r5, r3, r7 +; CHECK-NEXT: sbcs.w r4, r10, r7 ; CHECK-NEXT: vmov.32 q3[1], r7 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: 
mov.w r4, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q5[0], r5 -; CHECK-NEXT: vmov.32 q5[1], r5 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: smull r4, r5, r4, r5 +; CHECK-NEXT: movlt r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q5[0], r4 +; CHECK-NEXT: vmov.32 q5[1], r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: smull r4, r5, r5, r4 ; CHECK-NEXT: asrl r4, r5, #31 -; CHECK-NEXT: rsbs.w r2, r4, #-2147483648 +; CHECK-NEXT: rsbs.w r3, r4, #-2147483648 ; CHECK-NEXT: vmov.32 q3[2], r4 -; CHECK-NEXT: sbcs.w r2, r3, r5 +; CHECK-NEXT: sbcs.w r3, r10, r5 ; CHECK-NEXT: vmov.32 q3[3], r5 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q5[2], r2 -; CHECK-NEXT: vmov.32 q5[3], r2 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q5[2], r3 +; CHECK-NEXT: vmov.32 q5[3], r3 ; CHECK-NEXT: vbic q2, q0, q5 ; CHECK-NEXT: vand q3, q3, q5 ; CHECK-NEXT: vorr q2, q3, q2 -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov r2, s9 -; CHECK-NEXT: subs.w r7, r7, r9 -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q3[0], r2 -; CHECK-NEXT: vmov.32 q3[1], r2 -; CHECK-NEXT: vmov r2, s11 -; CHECK-NEXT: subs.w r7, r7, r9 -; CHECK-NEXT: sbcs r2, r2, #0 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[0], r3 +; CHECK-NEXT: vmov.32 q3[1], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: 
subs.w r4, r4, r8 +; CHECK-NEXT: sbcs r3, r3, #0 +; CHECK-NEXT: mov.w r3, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 -; CHECK-NEXT: csetm r2, ne -; CHECK-NEXT: vmov.32 q3[2], r2 +; CHECK-NEXT: movlt r3, #1 +; CHECK-NEXT: cmp r3, #0 +; CHECK-NEXT: csetm r3, ne +; CHECK-NEXT: vmov.32 q3[2], r3 ; CHECK-NEXT: vbic q5, q1, q3 ; CHECK-NEXT: vand q2, q2, q3 ; CHECK-NEXT: vorr q2, q2, q5 ; CHECK-NEXT: vmov.f32 s9, s10 ; CHECK-NEXT: vmov.f32 s10, s16 ; CHECK-NEXT: vmov.f32 s11, s18 -; CHECK-NEXT: vstrb.8 q2, [r8], #16 +; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: ldrd r2, r3, [sp] @ 8-byte Folded Reload -; CHECK-NEXT: cmp r2, r3 +; CHECK-NEXT: ldrd r7, r3, [sp] @ 8-byte Folded Reload +; CHECK-NEXT: cmp r7, r3 ; CHECK-NEXT: beq .LBB1_8 ; CHECK-NEXT: .LBB1_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r2 +; CHECK-NEXT: sub.w lr, r3, r7 ; CHECK-NEXT: mov.w r0, #-1 -; CHECK-NEXT: mov.w r1, #-2147483648 -; CHECK-NEXT: mvn r3, #-2147483648 +; CHECK-NEXT: mov.w r3, #-2147483648 +; CHECK-NEXT: mvn r2, #-2147483648 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldr r2, [r12], #4 -; CHECK-NEXT: ldr r4, [r10], #4 -; CHECK-NEXT: smull r4, r5, r4, r2 +; CHECK-NEXT: ldr r4, [r12], #4 +; CHECK-NEXT: ldr r5, [r1], #4 +; CHECK-NEXT: smull r4, r5, r5, r4 ; CHECK-NEXT: asrl r4, r5, #31 -; CHECK-NEXT: subs r2, r1, r4 -; CHECK-NEXT: sbcs.w r2, r0, r5 -; CHECK-NEXT: mov.w r2, #0 +; CHECK-NEXT: subs r6, r3, r4 +; CHECK-NEXT: sbcs.w r6, r0, r5 +; CHECK-NEXT: mov.w r6, #0 ; CHECK-NEXT: it lt -; CHECK-NEXT: movlt r2, #1 -; CHECK-NEXT: cmp r2, #0 +; CHECK-NEXT: movlt r6, #1 +; CHECK-NEXT: cmp r6, #0 ; CHECK-NEXT: itt eq ; CHECK-NEXT: moveq r5, r0 -; CHECK-NEXT: moveq r4, r1 -; CHECK-NEXT: subs r2, r4, r3 -; CHECK-NEXT: sbcs r2, r5, #0 +; CHECK-NEXT: moveq r4, r3 +; CHECK-NEXT: subs r6, r4, r2 +; CHECK-NEXT: 
sbcs r5, r5, #0 ; CHECK-NEXT: it ge -; CHECK-NEXT: movge r4, r3 +; CHECK-NEXT: movge r4, r2 ; CHECK-NEXT: str r4, [r11], #4 ; CHECK-NEXT: le lr, .LBB1_7 ; CHECK-NEXT: .LBB1_8: @ %for.cond.cleanup @@ -770,38 +768,38 @@ ; CHECK-NEXT: mov r8, r2 ; CHECK-NEXT: b .LBB3_6 ; CHECK-NEXT: .LBB3_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #1 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: subs r7, r5, #2 -; CHECK-NEXT: add.w r8, r2, r5, lsl #2 -; CHECK-NEXT: add.w r11, r1, r5, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #1 -; CHECK-NEXT: add.w r12, r0, r5, lsl #2 +; CHECK-NEXT: bic r7, r3, #1 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: subs r6, r7, #2 +; CHECK-NEXT: add.w r8, r2, r7, lsl #2 +; CHECK-NEXT: add.w r11, r1, r7, lsl #2 +; CHECK-NEXT: add.w lr, r5, r6, lsr #1 +; CHECK-NEXT: add.w r12, r0, r7, lsl #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: str r5, [sp] @ 4-byte Spill +; CHECK-NEXT: str r7, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB3_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: ldrd r4, r9, [r0] +; CHECK-NEXT: ldrd r6, r9, [r0] ; CHECK-NEXT: adds r0, #8 -; CHECK-NEXT: ldrd r5, r10, [r1] +; CHECK-NEXT: ldrd r7, r10, [r1] ; CHECK-NEXT: adds r1, #8 -; CHECK-NEXT: umull r4, r5, r5, r4 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: umull r6, r7, r7, r6 +; CHECK-NEXT: lsrl r6, r7, #31 +; CHECK-NEXT: subs.w r4, r6, #-1 +; CHECK-NEXT: vmov.32 q1[0], r6 +; CHECK-NEXT: sbcs r4, r7, #0 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q0[0], r5 -; CHECK-NEXT: vmov.32 q0[1], r5 -; CHECK-NEXT: umull r6, r5, r10, r9 -; CHECK-NEXT: lsrl r6, r5, #31 -; CHECK-NEXT: subs.w r7, r6, #-1 -; CHECK-NEXT: vmov.32 q1[2], r6 -; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: movlo r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; 
CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: umull r4, r7, r10, r9 +; CHECK-NEXT: lsrl r4, r7, #31 +; CHECK-NEXT: subs.w r5, r4, #-1 +; CHECK-NEXT: vmov.32 q1[2], r4 +; CHECK-NEXT: sbcs r5, r7, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 @@ -929,11 +927,11 @@ ; CHECK-NEXT: b .LBB4_6 ; CHECK-NEXT: .LBB4_3: @ %vector.ph ; CHECK-NEXT: bic r8, r3, #3 -; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: sub.w r7, r8, #4 +; CHECK-NEXT: movs r5, #1 +; CHECK-NEXT: sub.w r6, r8, #4 ; CHECK-NEXT: add.w r10, r2, r8, lsl #2 ; CHECK-NEXT: add.w r9, r1, r8, lsl #2 -; CHECK-NEXT: add.w lr, r6, r7, lsr #2 +; CHECK-NEXT: add.w lr, r5, r6, lsr #2 ; CHECK-NEXT: add.w r12, r0, r8, lsl #2 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB4_4: @ %vector.body @@ -946,26 +944,26 @@ ; CHECK-NEXT: vmov.f32 s14, s11 ; CHECK-NEXT: vmullb.u32 q4, q3, q1 ; CHECK-NEXT: vmov.f32 s2, s1 -; CHECK-NEXT: vmov r5, s17 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: vmov r7, s17 +; CHECK-NEXT: vmov r6, s16 +; CHECK-NEXT: lsrl r6, r7, #31 ; CHECK-NEXT: vmov.f32 s10, s9 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: mov.w r5, #0 +; CHECK-NEXT: subs.w r4, r6, #-1 +; CHECK-NEXT: vmov.32 q3[0], r6 +; CHECK-NEXT: sbcs r4, r7, #0 +; CHECK-NEXT: vmov r7, s19 +; CHECK-NEXT: mov.w r4, #0 ; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q1[0], r5 -; CHECK-NEXT: vmov.32 q1[1], r5 -; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: lsrl r6, r5, #31 -; CHECK-NEXT: subs.w r7, r6, #-1 -; CHECK-NEXT: vmov.32 q3[2], r6 -; CHECK-NEXT: sbcs r5, r5, #0 +; CHECK-NEXT: movlo r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q1[0], r4 +; CHECK-NEXT: vmov.32 q1[1], r4 +; CHECK-NEXT: vmov r4, s18 +; CHECK-NEXT: lsrl r4, r7, #31 +; CHECK-NEXT: subs.w r5, r4, #-1 
+; CHECK-NEXT: vmov.32 q3[2], r4 +; CHECK-NEXT: sbcs r5, r7, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo ; CHECK-NEXT: movlo r5, #1 @@ -976,23 +974,23 @@ ; CHECK-NEXT: vorn q1, q3, q1 ; CHECK-NEXT: vmullb.u32 q3, q2, q0 ; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: lsrl r4, r5, #31 -; CHECK-NEXT: subs.w r6, r4, #-1 -; CHECK-NEXT: vmov.32 q2[0], r4 -; CHECK-NEXT: sbcs r5, r5, #0 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: mov.w r5, #0 -; CHECK-NEXT: it lo -; CHECK-NEXT: movlo r5, #1 -; CHECK-NEXT: cmp r5, #0 -; CHECK-NEXT: csetm r5, ne -; CHECK-NEXT: vmov.32 q0[0], r5 -; CHECK-NEXT: vmov.32 q0[1], r5 -; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: vmov r6, s12 ; CHECK-NEXT: lsrl r6, r5, #31 -; CHECK-NEXT: subs.w r7, r6, #-1 -; CHECK-NEXT: vmov.32 q2[2], r6 +; CHECK-NEXT: subs.w r4, r6, #-1 +; CHECK-NEXT: vmov.32 q2[0], r6 +; CHECK-NEXT: sbcs r4, r5, #0 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: mov.w r4, #0 +; CHECK-NEXT: it lo +; CHECK-NEXT: movlo r4, #1 +; CHECK-NEXT: cmp r4, #0 +; CHECK-NEXT: csetm r4, ne +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: lsrl r4, r5, #31 +; CHECK-NEXT: subs.w r7, r4, #-1 +; CHECK-NEXT: vmov.32 q2[2], r4 ; CHECK-NEXT: sbcs r5, r5, #0 ; CHECK-NEXT: mov.w r5, #0 ; CHECK-NEXT: it lo @@ -1116,21 +1114,21 @@ ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB5_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB5_6 ; CHECK-NEXT: .LBB5_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #3 +; CHECK-NEXT: bic r6, r3, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: subs r6, r5, #4 +; CHECK-NEXT: subs r5, r6, #4 ; CHECK-NEXT: vmvn.i32 q0, #0x7fff -; CHECK-NEXT: add.w r12, r0, r5, lsl #1 +; CHECK-NEXT: add.w r12, r0, r6, lsl #1 ; CHECK-NEXT: vmov.i32 q1, #0x7fff -; CHECK-NEXT: add.w lr, r4, r6, lsr #2 
-; CHECK-NEXT: add.w r4, r2, r5, lsl #1 -; CHECK-NEXT: add.w r6, r1, r5, lsl #1 +; CHECK-NEXT: add.w lr, r4, r5, lsr #2 +; CHECK-NEXT: add.w r5, r2, r6, lsl #1 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -1143,19 +1141,19 @@ ; CHECK-NEXT: vstrh.32 q2, [r2], #8 ; CHECK-NEXT: le lr, .LBB5_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB5_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: movt r0, #65535 ; CHECK-NEXT: movw r1, #32767 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r12], #2 -; CHECK-NEXT: ldrsh r3, [r6], #2 +; CHECK-NEXT: ldrsh r3, [r4], #2 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: cmp.w r0, r2, asr #15 @@ -1164,7 +1162,7 @@ ; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r3, r1 -; CHECK-NEXT: strh r3, [r4], #2 +; CHECK-NEXT: strh r3, [r5], #2 ; CHECK-NEXT: le lr, .LBB5_7 ; CHECK-NEXT: .LBB5_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -1257,21 +1255,21 @@ ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB6_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB6_6 ; CHECK-NEXT: .LBB6_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #7 +; CHECK-NEXT: bic r6, r3, #7 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #8 +; CHECK-NEXT: sub.w r5, r6, #8 ; CHECK-NEXT: vmvn.i32 q0, #0x7fff -; CHECK-NEXT: add.w r12, r0, r5, lsl #1 +; CHECK-NEXT: add.w r12, r0, r6, lsl #1 ; CHECK-NEXT: vmov.i32 q1, #0x7fff -; CHECK-NEXT: add.w lr, r4, r6, 
lsr #3 -; CHECK-NEXT: add.w r4, r2, r5, lsl #1 -; CHECK-NEXT: add.w r6, r1, r5, lsl #1 +; CHECK-NEXT: add.w lr, r4, r5, lsr #3 +; CHECK-NEXT: add.w r5, r2, r6, lsl #1 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -1291,19 +1289,19 @@ ; CHECK-NEXT: vstrh.32 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB6_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB6_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: movt r0, #65535 ; CHECK-NEXT: movw r1, #32767 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB6_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r12], #2 -; CHECK-NEXT: ldrsh r3, [r6], #2 +; CHECK-NEXT: ldrsh r3, [r4], #2 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: cmp.w r0, r2, asr #15 @@ -1312,7 +1310,7 @@ ; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r3, r1 -; CHECK-NEXT: strh r3, [r4], #2 +; CHECK-NEXT: strh r3, [r5], #2 ; CHECK-NEXT: le lr, .LBB6_7 ; CHECK-NEXT: .LBB6_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -1407,21 +1405,21 @@ ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB7_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB7_6 ; CHECK-NEXT: .LBB7_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #7 +; CHECK-NEXT: bic r6, r3, #7 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #8 +; CHECK-NEXT: sub.w r5, r6, #8 ; CHECK-NEXT: vmvn.i32 q0, #0x7fff -; CHECK-NEXT: add.w r12, r0, r5, lsl #1 +; CHECK-NEXT: add.w r12, r0, r6, lsl #1 ; CHECK-NEXT: vmov.i32 q1, #0x7fff -; CHECK-NEXT: add.w lr, 
r4, r6, lsr #3 -; CHECK-NEXT: add.w r4, r2, r5, lsl #1 -; CHECK-NEXT: add.w r6, r1, r5, lsl #1 +; CHECK-NEXT: add.w lr, r4, r5, lsr #3 +; CHECK-NEXT: add.w r5, r2, r6, lsl #1 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -1439,18 +1437,18 @@ ; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB7_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: beq .LBB7_8 ; CHECK-NEXT: .LBB7_6: @ %for.body.preheader21 ; CHECK-NEXT: movw r0, #32768 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: movt r0, #65535 ; CHECK-NEXT: movw r1, #32767 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsh r2, [r12], #2 -; CHECK-NEXT: ldrsh r3, [r6], #2 +; CHECK-NEXT: ldrsh r3, [r4], #2 ; CHECK-NEXT: muls r2, r3, r2 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: cmp.w r0, r2, asr #15 @@ -1459,7 +1457,7 @@ ; CHECK-NEXT: cmp r3, r1 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r3, r1 -; CHECK-NEXT: strh r3, [r4], #2 +; CHECK-NEXT: strh r3, [r5], #2 ; CHECK-NEXT: le lr, .LBB7_7 ; CHECK-NEXT: .LBB7_8: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} @@ -1988,20 +1986,20 @@ ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB11_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB11_6 ; CHECK-NEXT: .LBB11_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #3 +; CHECK-NEXT: bic r6, r3, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: subs r6, r5, #4 +; CHECK-NEXT: subs r5, r6, #4 ; CHECK-NEXT: vmov.i32 q0, #0xffff -; CHECK-NEXT: add.w r12, r0, r5, lsl #1 -; CHECK-NEXT: add.w lr, r4, r6, lsr #2 -; CHECK-NEXT: add.w r4, r2, r5, lsl #1 -; CHECK-NEXT: add.w r6, r1, r5, lsl #1 +; 
CHECK-NEXT: add.w r12, r0, r6, lsl #1 +; CHECK-NEXT: add.w lr, r4, r5, lsr #2 +; CHECK-NEXT: add.w r5, r2, r6, lsl #1 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2013,24 +2011,24 @@ ; CHECK-NEXT: vstrh.32 q1, [r2], #8 ; CHECK-NEXT: le lr, .LBB11_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB11_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: movw r0, #65535 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB11_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r1, [r12], #2 -; CHECK-NEXT: ldrh r2, [r6], #2 +; CHECK-NEXT: ldrh r2, [r4], #2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: lsrs r2, r1, #15 ; CHECK-NEXT: cmp r2, r0 ; CHECK-NEXT: movw r2, #65535 ; CHECK-NEXT: it lo ; CHECK-NEXT: lsrlo r2, r1, #15 -; CHECK-NEXT: strh r2, [r4], #2 +; CHECK-NEXT: strh r2, [r5], #2 ; CHECK-NEXT: le lr, .LBB11_7 ; CHECK-NEXT: .LBB11_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2119,20 +2117,20 @@ ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB12_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB12_6 ; CHECK-NEXT: .LBB12_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #7 +; CHECK-NEXT: bic r6, r3, #7 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #8 +; CHECK-NEXT: sub.w r5, r6, #8 ; CHECK-NEXT: vmov.i32 q0, #0xffff -; CHECK-NEXT: add.w r12, r0, r5, lsl #1 -; CHECK-NEXT: add.w lr, r4, r6, lsr #3 -; CHECK-NEXT: add.w r4, r2, r5, lsl #1 -; CHECK-NEXT: add.w r6, r1, r5, lsl #1 +; CHECK-NEXT: add.w r12, r0, r6, lsl #1 +; CHECK-NEXT: add.w lr, r4, r5, lsr #3 +; CHECK-NEXT: add.w 
r5, r2, r6, lsl #1 +; CHECK-NEXT: add.w r4, r1, r6, lsl #1 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB12_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2150,24 +2148,24 @@ ; CHECK-NEXT: vstrh.32 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB12_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: movw r0, #65535 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB12_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrh r1, [r12], #2 -; CHECK-NEXT: ldrh r2, [r6], #2 +; CHECK-NEXT: ldrh r2, [r4], #2 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: lsrs r2, r1, #15 ; CHECK-NEXT: cmp r2, r0 ; CHECK-NEXT: movw r2, #65535 ; CHECK-NEXT: it lo ; CHECK-NEXT: lsrlo r2, r1, #15 -; CHECK-NEXT: strh r2, [r4], #2 +; CHECK-NEXT: strh r2, [r5], #2 ; CHECK-NEXT: le lr, .LBB12_7 ; CHECK-NEXT: .LBB12_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2259,21 +2257,21 @@ ; CHECK-NEXT: cmp r3, #3 ; CHECK-NEXT: bhi .LBB13_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB13_6 ; CHECK-NEXT: .LBB13_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #3 +; CHECK-NEXT: bic r6, r3, #3 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: subs r6, r5, #4 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: subs r5, r6, #4 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmvn.i32 q0, #0x7f ; CHECK-NEXT: vmov.i32 q1, #0x7f -; CHECK-NEXT: add.w lr, r4, r6, lsr #2 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #2 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB13_4: @ %vector.body 
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2286,16 +2284,16 @@ ; CHECK-NEXT: vstrb.32 q2, [r2], #4 ; CHECK-NEXT: le lr, .LBB13_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB13_6: @ %for.body.preheader21 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB13_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r0, [r12], #1 -; CHECK-NEXT: ldrsb r1, [r6], #1 +; CHECK-NEXT: ldrsb r1, [r4], #1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: asrs r1, r0, #7 ; CHECK-NEXT: cmn.w r1, #128 @@ -2305,7 +2303,7 @@ ; CHECK-NEXT: cmp r1, #127 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r1, #127 -; CHECK-NEXT: strb r1, [r4], #1 +; CHECK-NEXT: strb r1, [r5], #1 ; CHECK-NEXT: le lr, .LBB13_7 ; CHECK-NEXT: .LBB13_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2398,21 +2396,21 @@ ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB14_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB14_6 ; CHECK-NEXT: .LBB14_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #7 +; CHECK-NEXT: bic r6, r3, #7 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #8 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: sub.w r5, r6, #8 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmvn.i16 q0, #0x7f ; CHECK-NEXT: vmov.i16 q1, #0x7f -; CHECK-NEXT: add.w lr, r4, r6, lsr #3 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #3 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2425,17 +2423,17 @@ ; CHECK-NEXT: vstrb.16 q2, [r2], #8 ; CHECK-NEXT: le 
lr, .LBB14_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB14_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: mvn r0, #127 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB14_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r1, [r12], #1 -; CHECK-NEXT: ldrsb r2, [r6], #1 +; CHECK-NEXT: ldrsb r2, [r4], #1 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: mvn r2, #127 ; CHECK-NEXT: cmp.w r0, r1, asr #7 @@ -2444,7 +2442,7 @@ ; CHECK-NEXT: cmp r2, #127 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r2, #127 -; CHECK-NEXT: strb r2, [r4], #1 +; CHECK-NEXT: strb r2, [r5], #1 ; CHECK-NEXT: le lr, .LBB14_7 ; CHECK-NEXT: .LBB14_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2537,21 +2535,21 @@ ; CHECK-NEXT: cmp r3, #15 ; CHECK-NEXT: bhi .LBB15_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB15_6 ; CHECK-NEXT: .LBB15_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #15 +; CHECK-NEXT: bic r6, r3, #15 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #16 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: sub.w r5, r6, #16 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmvn.i16 q0, #0x7f ; CHECK-NEXT: vmov.i16 q1, #0x7f -; CHECK-NEXT: add.w lr, r4, r6, lsr #4 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #4 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2571,17 +2569,17 @@ ; CHECK-NEXT: vstrb.16 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: 
cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB15_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: mvn r0, #127 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB15_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r1, [r12], #1 -; CHECK-NEXT: ldrsb r2, [r6], #1 +; CHECK-NEXT: ldrsb r2, [r4], #1 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: mvn r2, #127 ; CHECK-NEXT: cmp.w r0, r1, asr #7 @@ -2590,7 +2588,7 @@ ; CHECK-NEXT: cmp r2, #127 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r2, #127 -; CHECK-NEXT: strb r2, [r4], #1 +; CHECK-NEXT: strb r2, [r5], #1 ; CHECK-NEXT: le lr, .LBB15_7 ; CHECK-NEXT: .LBB15_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -2685,21 +2683,21 @@ ; CHECK-NEXT: cmp r3, #15 ; CHECK-NEXT: bhi .LBB16_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB16_6 ; CHECK-NEXT: .LBB16_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #15 +; CHECK-NEXT: bic r6, r3, #15 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #16 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: sub.w r5, r6, #16 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmvn.i16 q0, #0x7f ; CHECK-NEXT: vmov.i16 q1, #0x7f -; CHECK-NEXT: add.w lr, r4, r6, lsr #4 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #4 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -2717,16 +2715,16 @@ ; CHECK-NEXT: vstrb.8 q2, [r2], #16 ; CHECK-NEXT: le lr, .LBB16_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: beq .LBB16_8 ; CHECK-NEXT: .LBB16_6: @ %for.body.preheader23 -; 
CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: mvn r0, #127 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB16_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrsb r1, [r12], #1 -; CHECK-NEXT: ldrsb r2, [r6], #1 +; CHECK-NEXT: ldrsb r2, [r4], #1 ; CHECK-NEXT: muls r1, r2, r1 ; CHECK-NEXT: mvn r2, #127 ; CHECK-NEXT: cmp.w r0, r1, asr #7 @@ -2735,7 +2733,7 @@ ; CHECK-NEXT: cmp r2, #127 ; CHECK-NEXT: it ge ; CHECK-NEXT: movge r2, #127 -; CHECK-NEXT: strb r2, [r4], #1 +; CHECK-NEXT: strb r2, [r5], #1 ; CHECK-NEXT: le lr, .LBB16_7 ; CHECK-NEXT: .LBB16_8: @ %for.cond.cleanup ; CHECK-NEXT: vpop {d8, d9} @@ -3508,20 +3506,20 @@ ; CHECK-NEXT: cmp r3, #7 ; CHECK-NEXT: bhi .LBB20_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB20_6 ; CHECK-NEXT: .LBB20_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #7 +; CHECK-NEXT: bic r6, r3, #7 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #8 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: sub.w r5, r6, #8 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmov.i16 q0, #0xff -; CHECK-NEXT: add.w lr, r4, r6, lsr #3 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #3 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB20_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -3533,23 +3531,23 @@ ; CHECK-NEXT: vstrb.16 q1, [r2], #8 ; CHECK-NEXT: le lr, .LBB20_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB20_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB20_7: @ %for.body ; CHECK-NEXT: @ 
=>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r0, [r12], #1 -; CHECK-NEXT: ldrb r1, [r6], #1 +; CHECK-NEXT: ldrb r1, [r4], #1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: lsrs r1, r0, #7 ; CHECK-NEXT: cmp r1, #255 ; CHECK-NEXT: mov.w r1, #255 ; CHECK-NEXT: it lo ; CHECK-NEXT: lsrlo r1, r0, #7 -; CHECK-NEXT: strb r1, [r4], #1 +; CHECK-NEXT: strb r1, [r5], #1 ; CHECK-NEXT: le lr, .LBB20_7 ; CHECK-NEXT: .LBB20_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc} @@ -3638,20 +3636,20 @@ ; CHECK-NEXT: cmp r3, #15 ; CHECK-NEXT: bhi .LBB21_3 ; CHECK-NEXT: @ %bb.2: -; CHECK-NEXT: movs r5, #0 +; CHECK-NEXT: movs r6, #0 ; CHECK-NEXT: mov r12, r0 -; CHECK-NEXT: mov r6, r1 -; CHECK-NEXT: mov r4, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: mov r5, r2 ; CHECK-NEXT: b .LBB21_6 ; CHECK-NEXT: .LBB21_3: @ %vector.ph -; CHECK-NEXT: bic r5, r3, #15 +; CHECK-NEXT: bic r6, r3, #15 ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r6, r5, #16 -; CHECK-NEXT: add.w r12, r0, r5 +; CHECK-NEXT: sub.w r5, r6, #16 +; CHECK-NEXT: add.w r12, r0, r6 ; CHECK-NEXT: vmov.i16 q0, #0xff -; CHECK-NEXT: add.w lr, r4, r6, lsr #4 -; CHECK-NEXT: adds r4, r2, r5 -; CHECK-NEXT: adds r6, r1, r5 +; CHECK-NEXT: add.w lr, r4, r5, lsr #4 +; CHECK-NEXT: adds r5, r2, r6 +; CHECK-NEXT: adds r4, r1, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB21_4: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 @@ -3669,23 +3667,23 @@ ; CHECK-NEXT: vstrb.16 q1, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_4 ; CHECK-NEXT: @ %bb.5: @ %middle.block -; CHECK-NEXT: cmp r5, r3 +; CHECK-NEXT: cmp r6, r3 ; CHECK-NEXT: it eq ; CHECK-NEXT: popeq {r4, r5, r6, pc} ; CHECK-NEXT: .LBB21_6: @ %for.body.preheader23 -; CHECK-NEXT: sub.w lr, r3, r5 +; CHECK-NEXT: sub.w lr, r3, r6 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB21_7: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldrb r0, [r12], #1 -; CHECK-NEXT: ldrb r1, [r6], #1 +; CHECK-NEXT: ldrb r1, [r4], #1 ; CHECK-NEXT: muls r0, r1, r0 ; CHECK-NEXT: 
lsrs r1, r0, #7 ; CHECK-NEXT: cmp r1, #255 ; CHECK-NEXT: mov.w r1, #255 ; CHECK-NEXT: it lo ; CHECK-NEXT: lsrlo r1, r0, #7 -; CHECK-NEXT: strb r1, [r4], #1 +; CHECK-NEXT: strb r1, [r5], #1 ; CHECK-NEXT: le lr, .LBB21_7 ; CHECK-NEXT: .LBB21_8: @ %for.cond.cleanup ; CHECK-NEXT: pop {r4, r5, r6, pc}