Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -21,6 +21,8 @@
 #include "llvm/CodeGen/MachineInstrBuilder.h"
 #include "llvm/CodeGen/MachineOperand.h"
 #include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
 #include <array>
 #include <cstdint>
 
@@ -831,6 +833,27 @@
   }
 }
 
+// Return true if the given intrinsic is a gather or scatter
+inline bool isGatherScatter(IntrinsicInst *IntInst) {
+  if (IntInst == nullptr)
+    return false;
+  unsigned IntrinsicID = IntInst->getIntrinsicID();
+  return (IntrinsicID == Intrinsic::masked_gather ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
+          IntrinsicID == Intrinsic::masked_scatter ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
+          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
+}
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -20,22 +20,22 @@
 #include "llvm/CodeGen/TargetLowering.h"
 #include "llvm/CodeGen/TargetPassConfig.h"
 #include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/InitializePasses.h"
 #include "llvm/IR/BasicBlock.h"
 #include "llvm/IR/Constant.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DerivedTypes.h"
 #include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/InstrTypes.h"
 #include "llvm/IR/Instruction.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Intrinsics.h"
 #include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/PatternMatch.h"
 #include "llvm/IR/Type.h"
 #include "llvm/IR/Value.h"
-#include "llvm/InitializePasses.h"
 #include "llvm/Pass.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Transforms/Utils/Local.h"
@@ -74,6 +74,8 @@
   }
 
 private:
+  LoopInfo *LI = nullptr;
+
   // Check this is a valid gather with correct alignment
   bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
                                unsigned Alignment);
@@ -82,9 +84,17 @@
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> &Builder);
+  Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+                  IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
+  // If the value is a constant, or derived from constants via additions
+  // and multiplications, return its numeric value
+  Optional<int64_t> getIfConst(const Value *V);
+  // If Inst is an add instruction, check whether one summand is a
+  // constant. If so, scale this constant and return it together with
+  // the other summand.
+  std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale);
 
   Value *lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
@@ -92,7 +102,22 @@
                                     Instruction *&Root, IRBuilder<> &Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
-                                   IRBuilder<> &Builder);
+                                   IRBuilder<> &Builder,
+                                   unsigned Increment = 0);
+  // Create a gather from a vector of pointers
+  Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
+                                     IRBuilder<> &Builder,
+                                     unsigned Increment = 0);
+  // QI gathers can increment their offsets on their own if the increment is
+  // a constant value (digit)
+  Value *tryCreateIncrementingGather(IntrinsicInst *I, Value *BasePtr,
+                                     Value *Ptr, GetElementPtrInst *GEP,
+                                     IRBuilder<> &Builder);
+  // QI gathers can increment their offsets on their own if the increment is
+  // a constant value (digit) - this creates a writeback QI gather
+  Value *tryCreateIncrementingWBGather(IntrinsicInst *I, Value *BasePtr,
+                                       Value *Ptr, unsigned TypeScale,
+                                       IRBuilder<> &Builder);
 
   Value *lowerScatter(IntrinsicInst *I);
   // Create a scatter to a base + vector of offsets
@@ -137,9 +162,9 @@
   return false;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+                                          GetElementPtrInst *GEP,
                                           IRBuilder<> &Builder) {
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP) {
     LLVM_DEBUG(
         dbgs() << "masked gathers/scatters: no getelementpointer found\n");
@@ -217,6 +242,56 @@
   return -1;
 }
 
+Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
+  const Constant *C = dyn_cast<Constant>(V);
+  if (C != nullptr)
+    return Optional<int64_t>{C->getUniqueInteger().getSExtValue()};
+  if (!isa<Instruction>(V))
+    return Optional<int64_t>{};
+
+  const Instruction *I = cast<Instruction>(V);
+  if (I->getOpcode() == Instruction::Add ||
+      I->getOpcode() == Instruction::Mul) {
+    Optional<int64_t> Op0 = getIfConst(I->getOperand(0));
+    Optional<int64_t> Op1 = getIfConst(I->getOperand(1));
+    if (!Op0 || !Op1)
+      return Optional<int64_t>{};
+    if (I->getOpcode() == Instruction::Add)
+      return Optional<int64_t>{Op0.getValue() + Op1.getValue()};
+    if (I->getOpcode() == Instruction::Mul)
+      return Optional<int64_t>{Op0.getValue() * Op1.getValue()};
+  }
+  return Optional<int64_t>{};
+}
+
+std::pair<Value *, int64_t>
+MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) {
+  std::pair<Value *, int64_t> ReturnFalse =
+      std::pair<Value *, int64_t>(nullptr, 0);
+  // At this point, the instruction we're looking at must be an add or we
+  // bail out
+  Instruction *Add = dyn_cast<Instruction>(Inst);
+  if (Add == nullptr || Add->getOpcode() != Instruction::Add)
+    return ReturnFalse;
+
+  Value *Summand;
+  Optional<int64_t> Const;
+  // Find out which operand is the incremented (non-constant) value
+  if ((Const = getIfConst(Add->getOperand(0))))
+    Summand = Add->getOperand(1);
+  else if ((Const = getIfConst(Add->getOperand(1))))
+    Summand = Add->getOperand(0);
+  else
+    return ReturnFalse;
+
+  // Check that the constant is small enough for an incrementing gather
+  int64_t Immediate = Const.getValue() << TypeScale;
+  if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0)
+    return ReturnFalse;
+
+  return std::pair<Value *, int64_t>(Summand, Immediate);
+}
+
 Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
@@ -266,7 +341,8 @@
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
                                                            Value *Ptr,
-                                                           IRBuilder<> &Builder) {
+                                                           IRBuilder<> &Builder,
+                                                           unsigned Increment) {
   using namespace PatternMatch;
   auto *Ty = cast<VectorType>(I->getType());
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -277,12 +353,34 @@
   if (match(Mask, m_One()))
     return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
                                    {Ty, Ptr->getType()},
-                                   {Ptr, Builder.getInt32(0)});
+                                   {Ptr, Builder.getInt32(Increment)});
   else
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_base_predicated,
         {Ty, Ptr->getType(), Mask->getType()},
-        {Ptr, Builder.getInt32(0), Mask});
+        {Ptr, Builder.getInt32(Increment), Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, unsigned Increment) {
+  using namespace PatternMatch;
+  auto *Ty = cast<VectorType>(I->getType());
+  LLVM_DEBUG(
+      dbgs()
+      << "masked gathers: loading from vector of pointers with writeback\n");
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    // Can't build an intrinsic for this
+    return nullptr;
+  Value *Mask = I->getArgOperand(2);
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb,
+                                   {Ty, Ptr->getType()},
+                                   {Ptr, Builder.getInt32(Increment)});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vldr_gather_base_wb_predicated,
+        {Ty, Ptr->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Mask});
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
@@ -321,10 +419,17 @@
     }
   }
 
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder);
+  Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder);
   if (!BasePtr)
     return nullptr;
+  // Check whether the offset is a constant increment that could be merged into
+  // a QI gather
+  Value *Load =
+      tryCreateIncrementingGather(I, BasePtr, Offsets, GEP, Builder);
+  if (Load)
+    return Load;
 
   int Scale = computeScale(
       BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
@@ -348,6 +453,148 @@
        Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
+Value *MVEGatherScatterLowering::tryCreateIncrementingGather(
+    IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
+    IRBuilder<> &Builder) {
+  auto *Ty = cast<VectorType>(I->getType());
+  // Incrementing gathers only exist for v4i32
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    return nullptr;
+  Loop *L = LI->getLoopFor(I->getParent());
+  if (L == nullptr)
+    // Incrementing gathers are not beneficial outside of a loop
+    return nullptr;
+  LLVM_DEBUG(
+      dbgs() << "masked gathers: trying to build incrementing wb gather\n");
+
+  // The gep was in charge of making sure the offsets are scaled correctly
+  // - calculate that factor so it can be applied by hand
+  DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout();
+  int TypeScale =
+      computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()),
+                   DT.getTypeSizeInBits(GEP->getType()) /
+                       cast<VectorType>(GEP->getType())->getNumElements());
+  if (TypeScale == -1)
+    return nullptr;
+
+  if (GEP->hasOneUse()) {
+    // Only in this case do we want to build a wb gather, because the wb
+    // changes the phi, which affects other users of the gep (they would
+    // still be using the phi in the old way)
+    Value *Load =
+        tryCreateIncrementingWBGather(I, BasePtr, Offsets, TypeScale, Builder);
+    if (Load != nullptr)
+      return Load;
+  }
+  LLVM_DEBUG(
+      dbgs() << "masked gathers: trying to build incrementing non-wb gather\n");
+
+  std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+  if (Add.first == nullptr)
+    return nullptr;
+  Value *OffsetsIncoming = Add.first;
+  int64_t Immediate = Add.second;
+
+  // Make sure the offsets are scaled correctly
+  Instruction *ScaledOffsets = BinaryOperator::Create(
+      Instruction::Shl, OffsetsIncoming,
+      Builder.CreateVectorSplat(Ty->getNumElements(),
+                                Builder.getInt32(TypeScale)),
+      "ScaledIndex", I);
+  // Add the base to the offsets
+  OffsetsIncoming = BinaryOperator::Create(
+      Instruction::Add, ScaledOffsets,
+      Builder.CreateVectorSplat(
+          Ty->getNumElements(),
+          Builder.CreatePtrToInt(
+              BasePtr,
+              cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+      "StartIndex", I);
+
+  // Merge the add into the gather, if it is the only user
+  if (cast<Instruction>(Offsets)->getNumUses() == 1)
+    return cast<IntrinsicInst>(
+        tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+  else
+    // TODO: In other cases, it might be possible to create a wb gather, but
+    // that depends on the order of instructions in the code
+    return nullptr;
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingWBGather(
+    IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
+    IRBuilder<> &Builder) {
+  // Check whether this gather's offset is incremented by a constant - if so,
+  // and the load is of the right type, we can merge this into a QI gather
+  Loop *L = LI->getLoopFor(I->getParent());
+  // Offsets that are worth merging into this instruction will be incremented
+  // by a constant, thus we're looking for an add of a phi and a constant
+  PHINode *Phi = dyn_cast<PHINode>(Offsets);
+  if (Phi == nullptr || Phi->getNumIncomingValues() != 2 ||
+      Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2)
+    // No phi means no IV to write back to; if there is a phi, we expect it
+    // to have exactly two incoming values; the only phis we are interested in
+    // will be loop IVs and have exactly two uses, one in their increment and
+    // one in the gather's gep
+    return nullptr;
+
+  unsigned IncrementIndex =
+      Phi->getIncomingBlock(0) == L->getLoopLatch() ? 0 : 1;
+  // Look through the phi to the phi increment
+  Offsets = Phi->getIncomingValue(IncrementIndex);
+
+  std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+  if (Add.first == nullptr)
+    return nullptr;
+  Value *OffsetsIncoming = Add.first;
+  int64_t Immediate = Add.second;
+  if (OffsetsIncoming != Phi)
+    // Then the increment we are looking at is not an increment of the
+    // induction variable, and we don't want to do a writeback
+    return nullptr;
+
+  Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back());
+  unsigned NumElems =
+      cast<VectorType>(OffsetsIncoming->getType())->getNumElements();
+
+  // Make sure the offsets are scaled correctly
+  Instruction *ScaledOffsets = BinaryOperator::Create(
+      Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex),
+      Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)),
+      "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+  // Add the base to the offsets
+  OffsetsIncoming = BinaryOperator::Create(
+      Instruction::Add, ScaledOffsets,
+      Builder.CreateVectorSplat(
+          NumElems,
+          Builder.CreatePtrToInt(
+              BasePtr,
+              cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+      "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+  // The gather is pre-incrementing
+  OffsetsIncoming = BinaryOperator::Create(
+      Instruction::Sub, OffsetsIncoming,
+      Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)),
+      "PreIncrementStartIndex",
+      &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+  Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming);
+
+  Builder.SetInsertPoint(I);
+
+  // Build the incrementing gather
+  Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
+
+  // One value to be handed to whoever uses the gather, one is the loop
+  // increment
+  Value *ExtractedLoad = Builder.CreateExtractValue(Load, 0, "Gather");
+  Value *Inc = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  Instruction *AddInst = cast<Instruction>(Offsets);
+  AddInst->replaceAllUsesWith(Inc);
+  AddInst->eraseFromParent();
+  Phi->setIncomingValue(IncrementIndex, Inc);
+
+  return ExtractedLoad;
+}
+
 Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
@@ -433,8 +680,9 @@
     return nullptr;
   }
 
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, InputTy, Ptr, Builder);
+  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
   if (!BasePtr)
     return nullptr;
   int Scale = computeScale(
@@ -514,27 +762,6 @@
   return;
 }
 
-// Return true if the given intrinsic is a gather or scatter
-static bool isGatherScatter(IntrinsicInst *IntInst) {
-  if (IntInst == nullptr)
-    return false;
-  unsigned IntrinsicID = IntInst->getIntrinsicID();
-  return (IntrinsicID == Intrinsic::masked_gather ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
-          IntrinsicID == Intrinsic::masked_scatter ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
-          IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
-}
-
 // Check whether all usages of this instruction are as offsets of
 // gathers/scatters or simple arithmetic only used by gathers/scatters
 static bool hasAllGatScatUsers(Instruction *I) {
@@ -717,29 +944,36 @@
   auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
   if (!ST->hasMVEIntegerOps())
     return false;
+  LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   SmallVector<IntrinsicInst *, 4> Gathers;
   SmallVector<IntrinsicInst *, 4> Scatters;
-  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
 
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-      if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+      if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
         Gathers.push_back(II);
-      else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter)
+        if (isa<GetElementPtrInst>(II->getArgOperand(0)))
+          optimiseOffsets(
+              cast<GetElementPtrInst>(II->getArgOperand(0))->getOperand(1),
+              II->getParent(), LI);
+      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
         Scatters.push_back(II);
+        if (isa<GetElementPtrInst>(II->getArgOperand(1)))
+          optimiseOffsets(
+              cast<GetElementPtrInst>(II->getArgOperand(1))->getOperand(1),
+              II->getParent(), LI);
+      }
     }
   }
 
   bool Changed = false;
   for (unsigned i = 0; i < Gathers.size(); i++) {
     IntrinsicInst *I = Gathers[i];
-    if (isa<GetElementPtrInst>(I->getArgOperand(0)))
-      optimiseOffsets(cast<GetElementPtrInst>(I->getArgOperand(0))->getOperand(1),
-                      I->getParent(), &LI);
     Value *L = lowerGather(I);
     if (L == nullptr)
       continue;
+
     // Get rid of any now dead instructions
     SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
     Changed = true;
@@ -747,12 +981,10 @@
 
   for (unsigned i = 0; i < Scatters.size(); i++) {
     IntrinsicInst *I = Scatters[i];
-    if (isa<GetElementPtrInst>(I->getArgOperand(1)))
-      optimiseOffsets(cast<GetElementPtrInst>(I->getArgOperand(1))->getOperand(1),
-                      I->getParent(), &LI);
     Value *S = lowerScatter(I);
     if (S == nullptr)
      continue;
+
    // Get rid of any now dead instructions
     SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
     Changed = true;
Index: llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -225,33 +225,24 @@
 define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
 ; CHECK-LABEL: gather_pre_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11}
 ; CHECK-NEXT:    adr r3, .LCPI6_0
-; CHECK-NEXT:    vmov.i32 q1, #0x8
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    vmov.i32 q2, #0x6
-; CHECK-NEXT:    vmov.i32 q3, #0x3
-; CHECK-NEXT:    .p2align 2
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vmul.i32 q4, q0, q3
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2] -; CHECK-NEXT: vstrb.8 q5, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 ; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 6 @ 0x6 vector.ph: ; preds = %for.body.preheader %ind.end = shl i32 %n.vec, 1 br label %vector.body @@ -278,31 +269,24 @@ define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec43) { ; CHECK-LABEL: gather_post_inc: ; CHECK: @ %bb.0: @ %vector.ph41 -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r3, .LCPI7_0 -; CHECK-NEXT: vmov.i32 q1, #0x8 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vmov.i32 q2, #0x3 -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB7_1: @ %vector.body39 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmul.i32 q3, q0, q2 +; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vldrw.u32 q4, [r0, q3, uxtw #2] -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrb.8 q4, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 vector.ph41: ; preds = %for.body6.preheader %ind.end47 = shl i32 %n.vec43, 1 br label %vector.body39 @@ -334,26 +318,23 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: bic r12, r2, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r3, r12, #4 -; CHECK-NEXT: vmov.i32 q1, #0x4 -; CHECK-NEXT: add.w r4, r4, r3, lsr #2 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: add.w r4, r3, lr, lsr #2 ; CHECK-NEXT: adr r3, .LCPI8_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB8_1: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB8_2 Depth 2 -; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB8_2: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB8_1 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q3, [r0, q2, uxtw #2] -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! 
+; CHECK-NEXT: vstrb.8 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB8_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB8_1 Depth=1 @@ -364,10 +345,10 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.5: ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .long 4294967292 @ 0xfffffffc entry: %cmp22 = icmp sgt i32 %n, 0 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup @@ -400,67 +381,68 @@ define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) { ; CHECK-LABEL: gather_inc_v4i32_complex: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, lr} -; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB9_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: bic r12, r2, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r3, r12, #4 -; CHECK-NEXT: vmov.i32 q2, #0x3 -; CHECK-NEXT: vmov.i32 q1, #0x4 -; CHECK-NEXT: vmov.i32 q3, #0x2 -; CHECK-NEXT: add.w r4, r4, r3, lsr #2 -; CHECK-NEXT: adr r3, .LCPI9_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vmov.i32 q4, #0x1 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: adr r4, .LCPI9_1 +; CHECK-NEXT: adr r5, .LCPI9_2 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: add.w r3, r3, lr, lsr #2 +; CHECK-NEXT: adr.w lr, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 2 -; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: dls lr, r3 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmul.i32 q6, q5, q2 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vadd.i32 q0, q6, q4 -; CHECK-NEXT: vldrw.u32 q7, [r0, q6, uxtw #2] -; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] -; CHECK-NEXT: vadd.i32 q0, q1, q7 -; CHECK-NEXT: vadd.i32 q1, q6, q3 -; CHECK-NEXT: vldrw.u32 q6, [r0, q1, uxtw #2] -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q1 -; CHECK-NEXT: vadd.i32 q0, q0, q6 -; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: vldrw.u32 q6, [q5, #48]! +; CHECK-NEXT: vldrw.u32 q7, [q3, #48]! +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [q4, #48]! 
+; CHECK-NEXT: vadd.i32 q6, q6, q7 +; CHECK-NEXT: vstrb.8 q6, [r0], #16 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1 ; CHECK-NEXT: cmp r12, r2 ; CHECK-NEXT: bne .LBB9_2 ; CHECK-NEXT: .LBB9_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #40 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: pop {r4, r5, r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967260 @ 0xffffffdc +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 +; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .long 4294967252 @ 0xffffffd4 +; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 +; CHECK-NEXT: .long 4294967276 @ 0xffffffec +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .LCPI9_2: +; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 +; CHECK-NEXT: .long 4294967268 @ 0xffffffe4 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .long 4294967292 @ 0xfffffffc entry: %cmp22 = icmp sgt i32 %n, 0 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup @@ -508,26 +490,23 @@ ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r4, pc} ; CHECK-NEXT: bic r12, r2, #3 -; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: sub.w r3, r12, #4 -; CHECK-NEXT: vmov.i32 q1, #0x7f -; CHECK-NEXT: add.w r4, r4, r3, lsr #2 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: add.w r4, r3, lr, lsr #2 ; CHECK-NEXT: adr r3, .LCPI10_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB10_1: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB10_2 Depth 2 -; CHECK-NEXT: mov r3, r1 -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 ; CHECK-NEXT: dls lr, r4 -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB10_2: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB10_1 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q3, [r0, q2, uxtw #2] -; CHECK-NEXT: vadd.i32 q2, q2, q1 -; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: vldrw.u32 q2, [q1, #508]! 
+; CHECK-NEXT: vstrb.8 q2, [r0], #16 ; CHECK-NEXT: le lr, .LBB10_2 ; CHECK-NEXT: @ %bb.3: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB10_1 Depth=1 @@ -538,10 +517,10 @@ ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.5: ; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 4294966788 @ 0xfffffe04 +; CHECK-NEXT: .long 4294966792 @ 0xfffffe08 +; CHECK-NEXT: .long 4294966796 @ 0xfffffe0c +; CHECK-NEXT: .long 4294966800 @ 0xfffffe10 entry: %cmp22 = icmp sgt i32 %n, 0 br i1 %cmp22, label %vector.ph, label %for.cond.cleanup @@ -597,14 +576,12 @@ ; CHECK-NEXT: add.w r8, r6, r3, lsr #3 ; CHECK-NEXT: adr r3, .LCPI11_0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB11_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB11_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -719,134 +696,158 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 +; CHECK-NEXT: .pad #104 +; CHECK-NEXT: sub sp, #104 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB12_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader ; CHECK-NEXT: bic r1, r2, #7 ; CHECK-NEXT: movs r7, #1 ; CHECK-NEXT: sub.w r3, r1, #8 -; CHECK-NEXT: vmov.i16 q2, #0x1 -; CHECK-NEXT: vmov.i16 q1, #0x8 -; CHECK-NEXT: vmov.i16 q4, #0x2 +; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: vmov.i16 q3, #0x18 +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill ; CHECK-NEXT: add.w r8, r7, r3, lsr #3 +; CHECK-NEXT: adr r7, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: adr r3, .LCPI12_0 +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0x3 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB12_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 ; CHECK-NEXT: dls lr, r8 -; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: ldr r3, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload ; CHECK-NEXT: .LBB12_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vmul.i16 q7, q5, q0 -; CHECK-NEXT: vadd.i16 q0, q7, q4 -; CHECK-NEXT: vmov.u16 r12, q7[0] -; CHECK-NEXT: vmov.u16 r4, q0[4] -; CHECK-NEXT: vmov.u16 r5, q0[0] -; CHECK-NEXT: vmov.32 q1[0], r4 -; CHECK-NEXT: vmov.u16 r4, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r4 -; CHECK-NEXT: vmov.u16 r4, q0[6] -; 
CHECK-NEXT: vmov.32 q1[2], r4 -; CHECK-NEXT: vmov.u16 r4, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r4 -; CHECK-NEXT: vmov.32 q3[0], r12 +; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov.u16 r7, q7[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.u16 r4, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.u16 r12, q6[0] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.u16 r1, q6[1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.u16 r1, q6[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[3] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u16 r1, q6[4] ; CHECK-NEXT: vmovlb.s16 q1, q1 -; CHECK-NEXT: vmov.u16 r1, q7[1] +; CHECK-NEXT: vmov r6, s11 ; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q4, q1, r0 +; CHECK-NEXT: ldrh.w r9, [r4] +; CHECK-NEXT: vmov.u16 r4, q5[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q5[5] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q5[6] +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.u16 r4, q5[7] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: ldrh.w r10, [r4] +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh.w r11, [r4] +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.32 q0[0], r7 +; CHECK-NEXT: vmov.u16 r7, q7[5] +; CHECK-NEXT: vmov.32 q0[1], r7 +; CHECK-NEXT: vmov.u16 r7, q7[6] +; CHECK-NEXT: vmov.32 q0[2], r7 +; CHECK-NEXT: vmov.u16 r7, q7[7] +; CHECK-NEXT: vmov.32 q0[3], r7 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u16 r1, q6[5] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u16 r1, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u16 r1, q7[0] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q7[1] ; CHECK-NEXT: vmov.32 q3[1], r1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 ; CHECK-NEXT: vmov.u16 r1, q7[2] -; CHECK-NEXT: vmov r4, s6 ; CHECK-NEXT: vmov.32 q3[2], r1 ; CHECK-NEXT: vmov.u16 r1, q7[3] ; CHECK-NEXT: vmov.32 q3[3], r1 -; CHECK-NEXT: vmov.u16 r1, q7[4] +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmovlb.s16 q0, q0 ; CHECK-NEXT: vmovlb.s16 q3, q3 -; CHECK-NEXT: vshl.i32 q3, q3, #1 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: ldrh.w r9, [r4] -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w r10, [r4] -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: ldrh.w r11, [r4] -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vmov.32 q1[0], r5 -; CHECK-NEXT: vmov.u16 r5, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r5 -; CHECK-NEXT: vmov.u16 r5, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r5 -; CHECK-NEXT: vmov.u16 r5, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r5 -; CHECK-NEXT: vmovlb.s16 q0, q1 -; CHECK-NEXT: vadd.i16 q1, q7, q2 -; CHECK-NEXT: vmov.u16 r7, q1[4] ; CHECK-NEXT: vshl.i32 q0, q0, #1 -; CHECK-NEXT: vmov.32 q2[0], r7 -; CHECK-NEXT: vmov.u16 r7, q1[5] -; CHECK-NEXT: vmov.32 q2[1], r7 -; CHECK-NEXT: vmov.u16 r7, q1[6] -; CHECK-NEXT: vmov.32 q2[2], r7 -; 
CHECK-NEXT: vmov.u16 r7, q1[7] -; CHECK-NEXT: vmov.32 q2[3], r7 +; CHECK-NEXT: vshl.i32 q3, q3, #1 ; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vmov r5, s2 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: vadd.i32 q6, q2, r0 -; CHECK-NEXT: vmov.32 q2[0], r1 -; CHECK-NEXT: vmov.u16 r1, q7[5] -; CHECK-NEXT: vmov r7, s26 -; CHECK-NEXT: vmov.32 q2[1], r1 -; CHECK-NEXT: vmov.u16 r1, q7[6] -; CHECK-NEXT: vmov.32 q2[2], r1 -; CHECK-NEXT: vmov.u16 r1, q7[7] -; CHECK-NEXT: vmov.32 q2[3], r1 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: vmov.32 q7[0], r1 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: vmov.32 q7[1], r1 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: vmov.32 q7[2], r1 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: vmov.32 q7[3], r1 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vmovlb.s16 q1, q7 -; CHECK-NEXT: vmovlb.s16 q2, q2 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 ; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r6, [r6] ; CHECK-NEXT: ldrh r7, [r7] ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q7[0], r1 +; CHECK-NEXT: vmov.16 q1[0], r1 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov.16 q1[2], r9 +; CHECK-NEXT: vmov.16 q1[3], r6 +; CHECK-NEXT: vmov.16 q1[4], r10 +; CHECK-NEXT: vmov.16 q1[5], r11 +; CHECK-NEXT: vmov.16 q1[6], r4 +; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[4], r1 ; CHECK-NEXT: vmov r1, s1 ; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q7[1], r1 +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[7], r1 ; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: vmov.16 q7[2], r5 -; CHECK-NEXT: vmov.16 q7[3], r6 -; CHECK-NEXT: vmov.16 q7[4], r11 -; CHECK-NEXT: vmov.16 q7[5], r4 -; CHECK-NEXT: vmov.16 q7[6], r9 -; CHECK-NEXT: vmov.16 q7[7], r10 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[0], r1 ; CHECK-NEXT: vmov r1, s13 @@ -856,57 +857,33 @@ ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[2], r1 ; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q6, q6, q3 +; CHECK-NEXT: vadd.i16 q5, q5, q3 +; CHECK-NEXT: vadd.i16 q7, q7, q3 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[3], r1 -; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmov r1, s16 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[4], r1 -; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: vmov r1, s17 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q0[6], r1 -; CHECK-NEXT: vmov r1, s11 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov.16 q0[6], r7 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: vmov.16 q0[7], r1 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: ldrh r1, 
[r1] -; CHECK-NEXT: vmov.16 q2[0], r1 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[1], r1 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[2], r1 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q5, q5, q1 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[3], r1 -; CHECK-NEXT: vmov r1, s24 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[4], r1 -; CHECK-NEXT: vmov r1, s25 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[5], r1 -; CHECK-NEXT: vmov r1, s27 -; CHECK-NEXT: vmov.16 q2[6], r7 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: vmov.16 q2[7], r1 -; CHECK-NEXT: vadd.i16 q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vadd.i16 q0, q0, q7 +; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q1 ; CHECK-NEXT: vstrb.8 q0, [r3], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload ; CHECK-NEXT: le lr, .LBB12_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload ; CHECK-NEXT: cmp r1, r2 ; CHECK-NEXT: bne.w .LBB12_2 ; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #104 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -914,13 +891,31 @@ ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI12_0: ; CHECK-NEXT: .short 0 @ 0x0 -; CHECK-NEXT: .short 1 @ 0x1 -; CHECK-NEXT: .short 2 @ 0x2 ; CHECK-NEXT: .short 3 @ 0x3 -; CHECK-NEXT: .short 4 @ 0x4 -; CHECK-NEXT: .short 5 @ 0x5 ; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 @ 0x4 ; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 entry: @@ -971,266 +966,347 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #200 -; CHECK-NEXT: sub sp, #200 +; CHECK-NEXT: .pad #328 +; CHECK-NEXT: sub sp, #328 ; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, #120] @ 4-byte Spill +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: str r2, [sp, #124] @ 4-byte Spill ; CHECK-NEXT: blt.w .LBB13_5 ; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader -; CHECK-NEXT: adr r5, .LCPI13_3 -; CHECK-NEXT: adr r7, .LCPI13_1 -; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: adr r6, .LCPI13_2 -; CHECK-NEXT: adr r3, .LCPI13_0 -; CHECK-NEXT: bic r12, r2, #7 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: adr.w r6, .LCPI13_8 +; CHECK-NEXT: adr.w r7, .LCPI13_7 +; CHECK-NEXT: adr.w r3, .LCPI13_6 +; CHECK-NEXT: bic r11, r1, #7 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_1 +; CHECK-NEXT: vmov.i32 q5, #0x30 +; 
CHECK-NEXT: str.w r11, [sp, #116] @ 4-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr.w r6, .LCPI13_9 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vmov.i32 q1, #0x2 -; CHECK-NEXT: vmov.i32 q2, #0x10 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: vstrw.32 q1, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q0, #0x3 -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q0, #0x1 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB13_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: mov lr, r1 -; CHECK-NEXT: vstrw.32 q3, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: mov r8, r12 -; CHECK-NEXT: vstrw.32 q3, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: .p2align 2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: adr r1, .LCPI13_3 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI13_4 +; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: adr r1, .LCPI13_2 +; CHECK-NEXT: vstrw.32 q2, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_10 +; CHECK-NEXT: vstrw.32 q2, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_11 +; CHECK-NEXT: ldr.w r9, [sp, #120] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill ; CHECK-NEXT: .LBB13_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vmlas.u32 q6, q2, r0 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q1, q6, q1 -; CHECK-NEXT: vmlas.u32 q0, q4, r0 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vldrw.u32 q4, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q0, q4 -; CHECK-NEXT: subs.w r8, r8, #16 -; 
CHECK-NEXT: vmov r4, s19 -; CHECK-NEXT: vadd.i32 q2, q0, q7 -; CHECK-NEXT: ldrb.w r10, [r3] -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: ldrb.w r9, [r4] -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: ldrb r5, [r3] -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r6, [r3] -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: ldrb r7, [r3] -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.8 q4[0], r4 -; CHECK-NEXT: vmov r4, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vmlas.u32 q1, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q1, q5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q4[1], r4 -; CHECK-NEXT: vmov r4, s22 -; CHECK-NEXT: vmov.8 q4[2], r10 -; CHECK-NEXT: vmov.8 q4[3], r5 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: vmov.8 q4[4], r6 -; CHECK-NEXT: vmov r6, s23 -; CHECK-NEXT: vmov.8 q4[5], r7 -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov.8 q4[6], r3 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vmov.8 q4[7], r9 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb.w r10, [r3] -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: vmov.8 q5[0], r5 -; CHECK-NEXT: vmov.8 q5[1], r7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[8], r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov.8 q4[9], r10 -; CHECK-NEXT: vmov.8 q4[10], r4 -; CHECK-NEXT: vmov.8 q4[11], r6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[2], r3 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vadd.i32 q6, q6, q7 -; CHECK-NEXT: vmov r4, s25 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[3], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[4], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[5], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[6], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[7], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[8], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[9], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[10], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[11], r3 -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[0], r3 -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: vmov.8 q3[1], r4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[2], r3 -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vldrw.u32 q6, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[3], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[4], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[5], r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[6], r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vadd.i32 q2, q1, q7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: vmlas.u32 q1, q6, r0 -; CHECK-NEXT: vldrw.u32 q6, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q6, q1, q6 -; CHECK-NEXT: vmov r5, s26 -; CHECK-NEXT: vmov r4, s27 -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: 
vmov r6, s24 -; CHECK-NEXT: vadd.i32 q6, q1, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, q7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[7], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb.w r10, [r5] -; CHECK-NEXT: vmov r5, s11 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vmov.8 q4[12], r6 -; CHECK-NEXT: vmov.8 q4[13], r7 -; CHECK-NEXT: vmov.8 q4[14], r10 -; CHECK-NEXT: vmov.8 q4[15], r4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[8], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[9], r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q3[10], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.8 q3[11], r5 -; CHECK-NEXT: vmov r5, s24 -; CHECK-NEXT: ldrb.w r9, [r3] -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q3[12], r5 -; CHECK-NEXT: vmov r5, s25 -; CHECK-NEXT: ldrb.w r11, [r3] -; CHECK-NEXT: vmov r3, s27 +; CHECK-NEXT: vstrw.32 q3, [sp, #240] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q6, r0 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q7, r0 +; CHECK-NEXT: vstrw.32 q6, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vldrw.u32 q2, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q6, r0 +; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: subs.w r11, r11, #16 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s5 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q3[13], r5 -; CHECK-NEXT: vmov r5, s26 -; CHECK-NEXT: vldrw.u32 q6, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q6, q6, q7 +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb.w r10, [r1] +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrb r4, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb.w r8, [r1] +; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[1], r1 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[2], r1 +; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[3], r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov.8 q7[4], r6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[0], r1 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[1], r1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vadd.i32 q3, q4, r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[2], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.8 q6[3], r12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vldrw.u32 q1, [sp, #304] @ 16-byte Reload ; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vstrw.32 q1, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[4], r1 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: vmov.8 q6[5], lr +; CHECK-NEXT: vmov.8 q6[6], r8 +; CHECK-NEXT: vmov.8 q6[7], r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vldrw.u32 q0, 
[sp, #288] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #208] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q3[14], r5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov.8 q3[15], r3 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q5[12], r5 -; CHECK-NEXT: vmov r5, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q7 -; CHECK-NEXT: vstrw.32 q1, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #160] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q7 -; CHECK-NEXT: vstrw.32 q1, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vmov.8 q7[5], r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov.8 q7[6], r10 +; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.8 q7[8], r2 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[9], r7 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[10], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.8 q7[11], r3 +; CHECK-NEXT: vmov.8 q7[12], r6 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.8 q5[13], r5 -; CHECK-NEXT: vmov.8 q5[14], r9 -; CHECK-NEXT: vmov.8 q5[15], r11 -; CHECK-NEXT: vadd.i8 q1, q3, q5 -; CHECK-NEXT: vadd.i8 q1, q1, q4 -; CHECK-NEXT: vstrb.8 q1, [lr], #16 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q7[13], r5 +; CHECK-NEXT: vmov.8 q7[14], r4 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[8], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[9], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[10], r1 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[11], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[12], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[13], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[14], r1 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[15], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vstrw.32 q4, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i8 q6, q7, q6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.8 q7[1], r2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[2], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vldrw.u32 q0, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vstrw.32 q4, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vstrw.32 q4, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[3], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; 
CHECK-NEXT: vmov.8 q7[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vadd.i32 q3, q3, q5 +; CHECK-NEXT: vstrw.32 q3, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[7], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[8], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[9], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[10], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q5 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q5 +; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[11], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[13], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vadd.i8 q0, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q5 +; CHECK-NEXT: vadd.i32 q6, q6, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q5 ; CHECK-NEXT: bne.w .LBB13_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 -; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: ldr.w r11, [sp, #116] @ 4-byte Reload +; CHECK-NEXT: cmp r11, r1 ; CHECK-NEXT: bne.w .LBB13_2 ; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #200 +; CHECK-NEXT: add sp, #328 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.6: ; CHECK-NEXT: .LCPI13_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 38 @ 0x26 +; CHECK-NEXT: .long 41 @ 0x29 +; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 47 @ 0x2f ; CHECK-NEXT: .LCPI13_1: -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 23 @ 0x17 ; CHECK-NEXT: .LCPI13_2: +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 +; CHECK-NEXT: .LCPI13_3: +; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .LCPI13_3: -; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI13_4: +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 39 @ 0x27 +; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI13_5: +; 
+; CHECK-NEXT: .long 25 @ 0x19
+; CHECK-NEXT: .long 28 @ 0x1c
+; CHECK-NEXT: .long 31 @ 0x1f
+; CHECK-NEXT: .long 34 @ 0x22
+; CHECK-NEXT: .LCPI13_6:
; CHECK-NEXT: .long 13 @ 0xd
-; CHECK-NEXT: .long 14 @ 0xe
+; CHECK-NEXT: .long 16 @ 0x10
+; CHECK-NEXT: .long 19 @ 0x13
+; CHECK-NEXT: .long 22 @ 0x16
+; CHECK-NEXT: .LCPI13_7:
+; CHECK-NEXT: .long 2 @ 0x2
+; CHECK-NEXT: .long 5 @ 0x5
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 11 @ 0xb
+; CHECK-NEXT: .LCPI13_8:
+; CHECK-NEXT: .long 26 @ 0x1a
+; CHECK-NEXT: .long 29 @ 0x1d
+; CHECK-NEXT: .long 32 @ 0x20
+; CHECK-NEXT: .long 35 @ 0x23
+; CHECK-NEXT: .LCPI13_9:
+; CHECK-NEXT: .long 37 @ 0x25
+; CHECK-NEXT: .long 40 @ 0x28
+; CHECK-NEXT: .long 43 @ 0x2b
+; CHECK-NEXT: .long 46 @ 0x2e
+; CHECK-NEXT: .LCPI13_10:
+; CHECK-NEXT: .long 12 @ 0xc
; CHECK-NEXT: .long 15 @ 0xf
+; CHECK-NEXT: .long 18 @ 0x12
+; CHECK-NEXT: .long 21 @ 0x15
+; CHECK-NEXT: .LCPI13_11:
+; CHECK-NEXT: .long 0 @ 0x0
+; CHECK-NEXT: .long 3 @ 0x3
+; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 9 @ 0x9
entry:
@@ -1299,7 +1375,6 @@
; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
@@ -1309,7 +1384,6 @@
; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: mov lr, r1
; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB14_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
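
Note (reviewer comment, not part of the patch): the regenerated .LBB13_3 body above is still a scalarised byte gather — each lane is an individual ldrb whose result is re-inserted with vmov.8 — and only the offset bookkeeping around it changes; the .LCPI13_* pools suggest stride-3 byte offsets. A minimal standalone C++ model of what one such 16-lane gather computes, with the stride-3 offsets as a hypothetical example:

    #include <cstdint>

    // Lane-by-lane model of a scalarised i8 gather: one ldrb + vmov.8 per lane.
    static void gather_i8(const uint8_t *Base, const uint32_t Offsets[16],
                          uint8_t Out[16]) {
      for (int Lane = 0; Lane < 16; ++Lane)
        Out[Lane] = Base[Offsets[Lane]];
    }

    int main() {
      uint8_t Data[64];
      for (int I = 0; I < 64; ++I)
        Data[I] = (uint8_t)I;
      uint32_t Offsets[16];
      for (int Lane = 0; Lane < 16; ++Lane)
        Offsets[Lane] = 3 * Lane; // hypothetical stride-3 pattern, cf. .LCPI13_11
      uint8_t Out[16];
      gather_i8(Data, Offsets, Out);
      return Out[15] == 45 ? 0 : 1;
    }
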
Index: llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-optimisation-deep.ll
@@ -7,25 +7,24 @@
; CHECK-LABEL: @push_out_add_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
+; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> ,
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
-; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[VEC_IND]],
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]],
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP6]], label [[END]], label [[VECTOR_BODY]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
@@ -63,26 +62,26 @@
; CHECK-LABEL: @push_out_mul_sub_block(
; CHECK-NEXT: vector.ph:
; CHECK-NEXT: [[IND_END:%.*]] = shl i32 [[N_VEC:%.*]], 1
+; CHECK-NEXT: [[PUSHEDOUTMUL:%.*]] = mul <4 x i32> ,
+; CHECK-NEXT: [[PRODUCT:%.*]] = mul <4 x i32> ,
+; CHECK-NEXT: [[PUSHEDOUTADD:%.*]] = add <4 x i32> [[PUSHEDOUTMUL]],
+; CHECK-NEXT: br label [[VECTOR_BODY:%.*]]
; CHECK: vector.body:
; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY_END:%.*]] ]
-; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY_END]] ]
+; CHECK-NEXT: [[VEC_IND:%.*]] = phi <4 x i32> [ [[PUSHEDOUTADD]], [[VECTOR_PH]] ], [ [[INCREMENTPUSHEDOUTMUL:%.*]], [[VECTOR_BODY_END]] ]
; CHECK-NEXT: [[TMP0:%.*]] = icmp eq i32 [[INDEX]], 50
; CHECK-NEXT: br i1 [[TMP0]], label [[LOWER_BLOCK:%.*]], label [[END:%.*]]
; CHECK: lower.block:
-; CHECK-NEXT: [[TMP1:%.*]] = mul <4 x i32> [[VEC_IND]],
-; CHECK-NEXT: [[TMP2:%.*]] = add <4 x i32> [[TMP1]],
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA]], <4 x i32> [[TMP2]], i32 32, i32 2, i32 1)
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP6:%.*]] = bitcast i32* [[TMP5]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP4]], <4 x i32>* [[TMP6]], align 4
+; CHECK-NEXT: [[TMP1:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[VEC_IND]], i32 32, i32 2, i32 1)
+; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast i32* [[TMP2]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP1]], <4 x i32>* [[TMP3]], align 4
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]],
; CHECK-NEXT: br label [[VECTOR_BODY_END]]
; CHECK: vector.body.end:
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[END]], label [[VECTOR_BODY]]
+; CHECK-NEXT: [[INCREMENTPUSHEDOUTMUL]] = add <4 x i32> [[VEC_IND]], [[PRODUCT]]
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP4]], label [[END]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
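
Note (reviewer comment, not part of the patch): the two hunks above show the core rewrite at the IR level: the loop-invariant add/mul on the gather offsets is computed once in vector.ph (PUSHEDOUTADD/PUSHEDOUTMUL), and the loop then steps the offsets by the invariant PRODUCT instead of recomputing them. A minimal standalone C++ check of that equivalence; the concrete constants are hypothetical stand-ins for the vector literals elided in the CHECK lines (the sibling asm tests use a <0,2,4,6> induction, step 8, multiplier 3, addend 6):

    #include <cassert>
    #include <cstdint>

    int main() {
      const int64_t Start[4] = {0, 2, 4, 6}; // assumed initial induction vector
      const int64_t Step = 8, Mul = 3, Add = 6; // assumed invariant constants
      for (int64_t N = 0; N < 100; ++N)
        for (int Lane = 0; Lane < 4; ++Lane) {
          // Original lower.block: offset recomputed from the induction each time.
          int64_t InLoop = (Start[Lane] + N * Step) * Mul + Add;
          // After the rewrite: PUSHEDOUTADD = Start*Mul + Add computed once,
          // then stepped by the invariant PRODUCT = Step * Mul.
          int64_t Hoisted = (Start[Lane] * Mul + Add) + N * (Step * Mul);
          assert(InLoop == Hoisted);
        }
      return 0;
    }
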
@@ -130,24 +129,22 @@
; CHECK: vector.2.ph:
; CHECK-NEXT: br label [[VECTOR_2_BODY:%.*]]
; CHECK: vector.2.body:
-; CHECK-NEXT: [[INDEX_2:%.*]] = phi i32 [ 0, [[VECTOR_2_PH]] ], [ [[INDEX_2_NEXT:%.*]], [[VECTOR_2_BODY_END:%.*]] ]
; CHECK-NEXT: [[TMP0:%.*]] = mul <4 x i32> [[VEC_IND]],
; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[TMP0]],
-; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i32, i32* [[DATA:%.*]], <4 x i32> [[TMP1]]
-; CHECK-NEXT: [[TMP3:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
-; CHECK-NEXT: [[TMP5:%.*]] = bitcast i32* [[TMP4]] to <4 x i32>*
-; CHECK-NEXT: store <4 x i32> [[TMP3]], <4 x i32>* [[TMP5]], align 4
-; CHECK-NEXT: br label [[VECTOR_2_BODY_END]]
+; CHECK-NEXT: [[TMP2:%.*]] = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i32.v4i32(i32* [[DATA:%.*]], <4 x i32> [[TMP1]], i32 32, i32 2, i32 1)
+; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, i32* [[DST:%.*]], i32 [[INDEX]]
+; CHECK-NEXT: [[TMP4:%.*]] = bitcast i32* [[TMP3]] to <4 x i32>*
+; CHECK-NEXT: store <4 x i32> [[TMP2]], <4 x i32>* [[TMP4]], align 4
+; CHECK-NEXT: br label [[VECTOR_2_BODY_END:%.*]]
; CHECK: vector.2.body.end:
-; CHECK-NEXT: [[INDEX_2_NEXT]] = add i32 [[INDEX]], 4
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
-; CHECK-NEXT: br i1 [[TMP6]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
+; CHECK-NEXT: [[INDEX_2_NEXT:%.*]] = add i32 [[INDEX]], 4
+; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i32 [[INDEX_2_NEXT]], 15
+; CHECK-NEXT: br i1 [[TMP5]], label [[VECTOR_BODY_END]], label [[VECTOR_2_BODY]]
; CHECK: vector.body.end:
; CHECK-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], 4
; CHECK-NEXT: [[VEC_IND_NEXT]] = add <4 x i32> [[VEC_IND]],
-; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEXT: br i1 [[TMP7]], label [[END:%.*]], label [[VECTOR_BODY]]
+; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEXT: br i1 [[TMP6]], label [[END:%.*]], label [[VECTOR_BODY]]
; CHECK: end:
; CHECK-NEXT: ret void
;
Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -741,7 +741,6 @@
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB22_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
@@ -787,7 +786,6 @@
; CHECK-NEXT: movs r3, #1
; CHECK-NEXT: add.w lr, r3, r2, lsr #2
; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB23_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -19,31 +19,24 @@ define arm_aapcs_vfpcc void @push_out_mul_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_gather:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI0_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vmov.i32 q2, #0x3
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB0_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmul.i32 q3, q0, q2
+; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrw.u32 q4, [r0, q3, uxtw #2]
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vstrb.8 q4, [r1], #16
+; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB0_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI0_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 4294967200 @ 0xffffffa0
+; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
vector.ph: ; preds = %for.body.preheader
  %ind.end = shl i32 %n.vec, 1
@@ -70,31 +63,24 @@ define arm_aapcs_vfpcc void @push_out_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI1_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vmov.i32 q2, #0x6
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB1_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q3, q0, q2
+; CHECK-NEXT: vldrw.u32 q1, [q0, #32]!
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrw.u32 q4, [r0, q3, uxtw #2]
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vstrb.8 q4, [r1], #16
+; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB1_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI1_0:
+; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 16 @ 0x10
vector.ph: ; preds = %for.body.preheader
  %ind.end = shl i32 %n.vec, 1
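
Note (reviewer comment, not part of the patch): the two gathers above now use the pre-increment (writeback) form "vldrw.u32 q1, [q0, #N]!", so the constant pools hold the first-iteration byte offsets biased one increment low: the writeback happens before the first load and lands the lanes exactly on iteration 0. A minimal standalone C++ check of that bias for push_out_add_gather:

    #include <cassert>
    #include <cstdint>

    int main() {
      // push_out_add_gather: element offsets {6,8,10,12} on iteration 0
      // (the {0,2,4,6} induction plus 6), i32 elements, and a per-iteration
      // advance of 8 elements, i.e. the "#32" writeback increment.
      const int64_t Offsets[4] = {6, 8, 10, 12};
      const int64_t ElemBytes = 4;
      const int64_t IncrementBytes = 8 * ElemBytes;
      uint32_t Pool[4];
      for (int Lane = 0; Lane < 4; ++Lane)
        Pool[Lane] = (uint32_t)(Offsets[Lane] * ElemBytes - IncrementBytes);
      // Matches .LCPI1_0 above.
      assert(Pool[0] == 0xfffffff8u && Pool[1] == 0x0u);
      assert(Pool[2] == 0x8u && Pool[3] == 0x10u);
      return 0;
    }

The same arithmetic explains .LCPI0_0 in push_out_mul_gather: byte offsets {0,24,48,72} minus the 96-byte increment give {-96,-72,-48,-24}, i.e. 0xffffffa0 onwards.
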
@@ -121,33 +107,24 @@ define arm_aapcs_vfpcc void @push_out_mul_add_gather(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_mul_add_gather:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
; CHECK-NEXT: adr r3, .LCPI2_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vmov.i32 q2, #0x6
-; CHECK-NEXT: vmov.i32 q3, #0x3
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB2_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmul.i32 q4, q0, q3
+; CHECK-NEXT: vldrw.u32 q1, [q0, #96]!
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2]
-; CHECK-NEXT: vstrb.8 q5, [r1], #16
+; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB2_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI2_0:
+; CHECK-NEXT: .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
vector.ph: ; preds = %for.body.preheader
  %ind.end = shl i32 %n.vec, 1
@@ -175,30 +152,24 @@ define arm_aapcs_vfpcc void @push_out_mul_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_scatter:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r1, .LCPI3_0
-; CHECK-NEXT: vmov.i32 q2, #0x8
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov.i32 q3, #0x3
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vmov.i32 q1, #0x18
+; CHECK-NEXT: vldrw.u32 q2, [r1]
; CHECK-NEXT: .LBB3_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vmul.i32 q4, q1, q3
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q0, [r0, q4, uxtw #2]
+; CHECK-NEXT: vstrw.32 q0, [r0, q2, uxtw #2]
+; CHECK-NEXT: vadd.i32 q2, q2, q1
; CHECK-NEXT: bne .LBB3_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI3_0:
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 12 @ 0xc
+; CHECK-NEXT: .long 18 @ 0x12
i32* noalias nocapture %dst,
i32 %n.vec, <4 x i32> %to.store) {
@@ -224,30 +195,24 @@ define arm_aapcs_vfpcc void @push_out_add_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_add_scatter:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r1, .LCPI4_0
; CHECK-NEXT: vmov.i32 q2, #0x8
; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov.i32 q3, #0x6
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB4_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.i32 q4, q1, q3
+; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2]
; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q0, [r0, q4, uxtw #2]
; CHECK-NEXT: bne .LBB4_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI4_0:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 10 @ 0xa
+; CHECK-NEXT: .long 12 @ 0xc
i32* noalias nocapture %dst,
i32 %n.vec, <4 x i32> %to.store) {
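
Note (reviewer comment, not part of the patch): the scatter tests above do not get a writeback form; instead the offset vector is kept live and stepped by the invariant increment (vadd.i32 of #0x18 or #0x8) rather than being recomputed with a vmul/vadd each iteration. A minimal standalone C++ check for push_out_mul_scatter:

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hoisted offsets {0,6,12,18} = 3 * {0,2,4,6}; invariant step
      // 0x18 = 24 = 3 * 8 elements (the uxtw #2 in the store scales to bytes).
      int64_t Offs[4] = {0, 6, 12, 18};
      for (int64_t N = 0; N < 100; ++N) {
        for (int Lane = 0; Lane < 4; ++Lane)
          assert(Offs[Lane] == 3 * (2 * Lane + 8 * N)); // what the old vmul computed
        for (int Lane = 0; Lane < 4; ++Lane)
          Offs[Lane] += 24; // vadd.i32 q2, q2, q1 with q1 = #0x18
      }
      return 0;
    }
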
@@ -273,31 +238,26 @@ define arm_aapcs_vfpcc void @push_out_mul_gather_scatter(i32* noalias nocapture readonly %data,
; CHECK-LABEL: push_out_mul_gather_scatter:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r1, .LCPI5_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q2, #0x3
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vmov.i32 q0, #0x18
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: .LBB5_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmul.i32 q3, q0, q2
+; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vadd.i32 q3, q1, q0
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrw.u32 q4, [r0, q3, uxtw #2]
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vstrw.32 q4, [r0, q3, uxtw #2]
+; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2]
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: bne .LBB5_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI5_0:
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 12 @ 0xc
+; CHECK-NEXT: .long 18 @ 0x12
i32* noalias nocapture %dst,
i32 %n.vec) {
vector.ph: ; preds = %for.body.preheader
@@ -323,31 +283,24 @@ define arm_aapcs_vfpcc void @push_out_add_sub_block(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: push_out_add_sub_block:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9}
-; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI6_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vmov.i32 q2, #0x6
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vadd.i32 q0, q0, r0
; CHECK-NEXT: .LBB6_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vadd.i32 q3, q0, q2
+; CHECK-NEXT: vldrw.u32 q1, [q0, #32]!
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vldrw.u32 q4, [r0, q3, uxtw #2]
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vstrb.8 q4, [r1], #16
+; CHECK-NEXT: vstrb.8 q1, [r1], #16
; CHECK-NEXT: bne .LBB6_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
; CHECK-NEXT: .LCPI6_0:
+; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 4 @ 0x4
-; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .long 8 @ 0x8
+; CHECK-NEXT: .long 16 @ 0x10
vector.ph: ; preds = %for.body.preheader
  %ind.end = shl i32 %n.vec, 1
@@ -380,25 +333,23 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) {
; CHECK-LABEL: non_gatscat_use1:
; CHECK: @ %bb.0: @ %vector.ph
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
; CHECK-NEXT: adr r3, .LCPI7_0
-; CHECK-NEXT: vmov.i32 q1, #0x8
-; CHECK-NEXT: vldrw.u32 q0, [r3]
-; CHECK-NEXT: vmov.i32 q2, #0x6
-; CHECK-NEXT: vmov.i32 q3, #0x3
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vmov.i32 q0, #0x8
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vmov.i32 q1, #0xc
; CHECK-NEXT: .LBB7_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmul.i32 q4, q0, q3
+; CHECK-NEXT: vadd.i32 q3, q2, q0
+; CHECK-NEXT: vmlas.u32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q4, [q2, #24]
; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vadd.i32 q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2]
-; CHECK-NEXT: vstrb.8 q5, [r1], #16
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vstrb.8 q4, [r1], #16
; CHECK-NEXT: bne .LBB7_1
; CHECK-NEXT: @ %bb.2: @ %end
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: bx lr
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.3:
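
Note (reviewer comment, not part of the patch): in non_gatscat_use1 the induction vector has a second, non-gather use, so the offsets cannot simply disappear into a writeback gather; instead the address vector is rebuilt each iteration with vmlas.u32 (per lane: q2 = q2 * q1 + r0, with q1 = #0xc folding the *3 element stride and the *4 byte scale together) and the remaining +6 element offset becomes the gather's #24 immediate. A minimal standalone C++ check of that address arithmetic; the base value is a hypothetical stand-in for the data pointer in r0:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint32_t Data = 0x1000; // hypothetical value of the base pointer r0
      for (uint32_t Lane = 0; Lane < 4; ++Lane) {
        uint32_t Ind = 2 * Lane;             // initial induction vector {0,2,4,6}
        uint32_t VmlasAddr = Ind * 12 + Data; // vmlas.u32 q2, q1, r0 with q1 = #0xc
        // The gather immediate #24 supplies the remaining +6 element offset.
        assert(VmlasAddr + 24 == Data + 4 * (Ind * 3 + 6)); // &data[ind*3 + 6]
      }
      return 0;
    }
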
@@ -442,7 +393,6 @@
; CHECK-NEXT: vldrw.u32 q0, [r3]
; CHECK-NEXT: vmov.i32 q2, #0x6
; CHECK-NEXT: vmov.i32 q3, #0x3
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB8_1: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: vmul.i32 q4, q0, q3
@@ -490,59 +440,67 @@ define dso_local void @arm_mat_mult_q31(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) local_unnamed_addr #0 {
; CHECK-LABEL: arm_mat_mult_q31:
; CHECK: @ %bb.0: @ %for.cond8.preheader.us.us.preheader.preheader
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: ldrd r6, r12, [sp, #80]
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
+; CHECK-NEXT: .pad #4
+; CHECK-NEXT: sub sp, #4
+; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: ldrd r9, r12, [sp, #128]
; CHECK-NEXT: sub.w r7, r12, #1
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: mov.w r8, #0
-; CHECK-NEXT: add.w r7, r5, r7, lsr #1
-; CHECK-NEXT: vmov.i32 q1, #0x8
+; CHECK-NEXT: movs r6, #1
+; CHECK-NEXT: adr r5, .LCPI9_0
+; CHECK-NEXT: add.w r7, r6, r7, lsr #1
+; CHECK-NEXT: vdup.32 q1, r9
; CHECK-NEXT: bic r7, r7, #3
+; CHECK-NEXT: vldrw.u32 q2, [r5]
; CHECK-NEXT: subs r7, #4
-; CHECK-NEXT: add.w r10, r5, r7, lsr #2
-; CHECK-NEXT: adr r7, .LCPI9_0
-; CHECK-NEXT: vldrw.u32 q0, [r7]
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: mov.w r8, #0
+; CHECK-NEXT: vshl.i32 q3, q1, #3
+; CHECK-NEXT: add.w r7, r6, r7, lsr #2
+; CHECK-NEXT: adr r6, .LCPI9_1
+; CHECK-NEXT: vldrw.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB9_2 Depth 2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT: mul r9, r8, r6
+; CHECK-NEXT: mul r10, r8, r9
; CHECK-NEXT: movs r5, #0
-; CHECK-NEXT: mul r7, r8, r12
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: mul r11, r8, r12
; CHECK-NEXT: .LBB9_2: @ %vector.ph
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB9_3 Depth 3
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: dls lr, r10
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: vdup.32 q5, r11
+; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vshl.i32 q5, q5, #2
+; CHECK-NEXT: vmov q6, q1
+; CHECK-NEXT: vadd.i32 q5, q5, r0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vadd.i32 q5, q5, q0
+; CHECK-NEXT: vmlas.u32 q6, q2, r5
+; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB9_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1
; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vadd.i32 q5, q3, r7
-; CHECK-NEXT: vadd.i32 q4, q3, q1
-; CHECK-NEXT: vldrw.u32 q6, [r0, q5, uxtw #2]
-; CHECK-NEXT: vdup.32 q5, r5
-; CHECK-NEXT: vmla.u32 q5, q3, r6
-; CHECK-NEXT: vldrw.u32 q3, [r1, q5, uxtw #2]
-; CHECK-NEXT: vmul.i32 q3, q3, q6
-; CHECK-NEXT: vadd.i32 q2, q3, q2
-; CHECK-NEXT: vmov q3, q4
+; CHECK-NEXT: vadd.i32 q7, q6, q3
+; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q6, [q5, #32]!
+; CHECK-NEXT: vmul.i32 q0, q0, q6
+; CHECK-NEXT: vmov q6, q7
+; CHECK-NEXT: vadd.i32 q4, q0, q4
; CHECK-NEXT: le lr, .LBB9_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=2
-; CHECK-NEXT: add.w lr, r5, r9
+; CHECK-NEXT: add.w r6, r5, r10
; CHECK-NEXT: adds r5, #1
-; CHECK-NEXT: vaddv.u32 r4, q2
-; CHECK-NEXT: cmp r5, r6
-; CHECK-NEXT: str.w r4, [r2, lr, lsl #2]
+; CHECK-NEXT: vaddv.u32 r4, q4
+; CHECK-NEXT: cmp r5, r9
+; CHECK-NEXT: str.w r4, [r2, r6, lsl #2]
; CHECK-NEXT: bne .LBB9_2
; CHECK-NEXT: @ %bb.5: @ %for.cond4.for.cond.cleanup6_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB9_1 Depth=1
@@ -550,8 +508,10 @@
; CHECK-NEXT: cmp r8, r3
; CHECK-NEXT: bne .LBB9_1
; CHECK-NEXT: @ %bb.6: @ %for.end25
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
+; CHECK-NEXT: add sp, #4
+; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .p2align 4
; CHECK-NEXT: @ %bb.7:
; CHECK-NEXT: .LCPI9_0:
@@ -559,6 +519,11 @@
; CHECK-NEXT: .long 2 @ 0x2
; CHECK-NEXT: .long 4 @ 0x4
; CHECK-NEXT: .long 6 @ 0x6
+; CHECK-NEXT: .LCPI9_1:
+; CHECK-NEXT: .long 4294967264 @ 0xffffffe0
+; CHECK-NEXT: .long 4294967272 @ 0xffffffe8
+; CHECK-NEXT: .long 4294967280 @ 0xfffffff0
+; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry
  %0 = add i32 %l, -1
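
Note (reviewer comment, not part of the patch): in arm_mat_mult_q31 the A-side gather becomes a writeback gather over byte addresses, so its new constant pool .LCPI9_1 is biased one 32-byte increment low, while the B-side offsets are rebuilt per column with vmlas and stepped by the invariant 8 * dim (the vshl.i32 q3, q1, #3). A minimal standalone C++ check of the .LCPI9_1 bias:

    #include <cassert>
    #include <cstdint>

    int main() {
      // .LCPI9_1 above, read as signed values: one 32-byte writeback
      // increment below the first-iteration addresses.
      const int32_t Pool[4] = {-32, -24, -16, -8};
      for (int Lane = 0; Lane < 4; ++Lane)
        // After the pre-increment "[q5, #32]!", the lanes address the i32
        // elements {0,2,4,6} of the current row, i.e. the unbiased .LCPI9_0.
        assert(Pool[Lane] + 32 == 4 * (2 * Lane));
      return 0;
    }
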
@@ -636,11 +601,11 @@
; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: strd r0, r2, [sp, #24] @ 8-byte Folded Spill
; CHECK-NEXT: cmp r3, #0
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: mov r0, r3
; CHECK-NEXT: itt ne
-; CHECK-NEXT: ldrne.w r11, [sp, #104]
-; CHECK-NEXT: cmpne.w r11, #0
+; CHECK-NEXT: ldrne.w lr, [sp, #120]
+; CHECK-NEXT: cmpne.w lr, #0
; CHECK-NEXT: bne .LBB10_2
; CHECK-NEXT: .LBB10_1: @ %for.cond.cleanup
; CHECK-NEXT: add sp, #32
@@ -648,42 +613,41 @@
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
; CHECK-NEXT: .LBB10_2: @ %for.cond1.preheader.us.preheader
-; CHECK-NEXT: ldr.w r10, [sp, #108]
-; CHECK-NEXT: mov r8, r1
+; CHECK-NEXT: ldr.w r11, [sp, #124]
+; CHECK-NEXT: mov r6, r1
; CHECK-NEXT: movs r1, #1
-; CHECK-NEXT: lsl.w r4, r11, #1
-; CHECK-NEXT: bic r0, r10, #3
+; CHECK-NEXT: vdup.32 q4, lr
+; CHECK-NEXT: bic r0, r11, #3
; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: lsl.w r4, lr, #1
; CHECK-NEXT: mov.w r9, #0
-; CHECK-NEXT: vmov.i32 q5, #0x4
-; CHECK-NEXT: add.w r0, r1, r0, lsr #2
-; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: lsl.w r0, r10, #1
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
+; CHECK-NEXT: vshl.i32 q6, q4, #2
+; CHECK-NEXT: add.w r8, r1, r0, lsr #2
+; CHECK-NEXT: lsl.w r0, r11, #1
+; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: adr r0, .LCPI10_0
-; CHECK-NEXT: vldrw.u32 q4, [r0]
+; CHECK-NEXT: vldrw.u32 q5, [r0]
; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: movs r1, #0
; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: b .LBB10_5
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB10_3: @ %for.cond5.preheader.us73.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
; CHECK-NEXT: mov r1, r4
; CHECK-NEXT: add.w r0, r0, r12, lsl #1
; CHECK-NEXT: bl __aeabi_memclr
+; CHECK-NEXT: ldr.w lr, [sp, #120]
; CHECK-NEXT: .LBB10_4: @ %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
-; CHECK-NEXT: add r9, r10
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add r9, r11
; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: add r1, r0
; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: adds r1, #1
; CHECK-NEXT: cmp r1, r0
; CHECK-NEXT: beq .LBB10_1
@@ -692,80 +656,75 @@
; CHECK-NEXT: @ Child Loop BB10_8 Depth 2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: mul r12, r1, r11
-; CHECK-NEXT: cmp.w r10, #0
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: mul r12, r1, lr
+; CHECK-NEXT: cmp.w r11, #0
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: beq .LBB10_3
; CHECK-NEXT: @ %bb.6: @ %for.cond5.preheader.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_5 Depth=1
-; CHECK-NEXT: movs r1, #0
+; CHECK-NEXT: mov.w r10, #0
; CHECK-NEXT: b .LBB10_8
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB10_7: @ %for.cond5.for.cond.cleanup7_crit_edge.us.us
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r3, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r1, r12
-; CHECK-NEXT: adds r1, #1
-; CHECK-NEXT: cmp r1, r11
-; CHECK-NEXT: strh.w r2, [r3, r0, lsl #1]
+; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: add.w r3, r10, r12
+; CHECK-NEXT: add.w r10, r10, #1
+; CHECK-NEXT: cmp r10, lr
+; CHECK-NEXT: strh.w r2, [r0, r3, lsl #1]
; CHECK-NEXT: beq .LBB10_4
; CHECK-NEXT: .LBB10_8: @ %for.cond5.preheader.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ => This Loop Header: Depth=2
; CHECK-NEXT: @ Child Loop BB10_11 Depth 3
; CHECK-NEXT: @ Child Loop BB10_14 Depth 3
-; CHECK-NEXT: cmp.w r10, #3
+; CHECK-NEXT: cmp.w r11, #3
; CHECK-NEXT: bhi .LBB10_10
; CHECK-NEXT: @ %bb.9: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: movs r0, #0
+; CHECK-NEXT: movs r7, #0
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: b .LBB10_13
-; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB10_10: @ %vector.ph
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr.w lr, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vmov q1, q4
-; CHECK-NEXT: dls lr, lr
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: ldr r2, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmlas.u32 q1, q5, r10
+; CHECK-NEXT: dls lr, r8
; CHECK-NEXT: .LBB10_11: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: vdup.32 q3, r1
-; CHECK-NEXT: vadd.i32 q2, q1, q5
-; CHECK-NEXT: vmla.u32 q3, q1, r11
-; CHECK-NEXT: vldrh.s32 q1, [r8, q3, uxtw #1]
-; CHECK-NEXT: vldrh.s32 q3, [r0], #8
-; CHECK-NEXT: vmul.i32 q1, q1, q3
+; CHECK-NEXT: vadd.i32 q2, q1, q6
+; CHECK-NEXT: vldrh.s32 q3, [r6, q1, uxtw #1]
+; CHECK-NEXT: vldrh.s32 q1, [r2], #8
+; CHECK-NEXT: vmul.i32 q1, q3, q1
; CHECK-NEXT: vadd.i32 q0, q1, q0
; CHECK-NEXT: vmov q1, q2
; CHECK-NEXT: le lr, .LBB10_11
; CHECK-NEXT: @ %bb.12: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: vaddv.u32 r2, q0
-; CHECK-NEXT: cmp r0, r10
+; CHECK-NEXT: ldr.w lr, [sp, #120]
+; CHECK-NEXT: cmp r7, r11
; CHECK-NEXT: beq .LBB10_7
; CHECK-NEXT: .LBB10_13: @ %for.body8.us.us.preheader
; CHECK-NEXT: @ in Loop: Header=BB10_8 Depth=2
-; CHECK-NEXT: mla r3, r11, r0, r1
-; CHECK-NEXT: sub.w r5, r10, r0
-; CHECK-NEXT: add r0, r9
-; CHECK-NEXT: ldr r7, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r7, r0, lsl #1
-; CHECK-NEXT: add.w r3, r8, r3, lsl #1
-; CHECK-NEXT: .p2align 2
+; CHECK-NEXT: mla r3, lr, r7, r10
+; CHECK-NEXT: sub.w r5, r11, r7
+; CHECK-NEXT: add r7, r9
+; CHECK-NEXT: ldr r0, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r0, r7, lsl #1
+; CHECK-NEXT: add.w r3, r6, r3, lsl #1
; CHECK-NEXT: .LBB10_14: @ %for.body8.us.us
; CHECK-NEXT: @ Parent Loop BB10_5 Depth=1
; CHECK-NEXT: @ Parent Loop BB10_8 Depth=2
; CHECK-NEXT: @ => This Inner Loop Header: Depth=3
-; CHECK-NEXT: ldrsh.w r6, [r3]
+; CHECK-NEXT: ldrsh.w r1, [r3]
; CHECK-NEXT: add r3, r4
-; CHECK-NEXT: ldrsh r7, [r0], #2
+; CHECK-NEXT: ldrsh r0, [r7], #2
; CHECK-NEXT: subs r5, #1
-; CHECK-NEXT: smlabb r2, r6, r7, r2
+; CHECK-NEXT: smlabb r2, r1, r0, r2
; CHECK-NEXT: bne .LBB10_14
; CHECK-NEXT: b .LBB10_7
; CHECK-NEXT: .p2align 4