diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h --- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h +++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h @@ -21,6 +21,8 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsARM.h" #include #include @@ -831,6 +833,27 @@ } } +// Return true if the given intrinsic is a gather or scatter +inline bool isGatherScatter(IntrinsicInst *IntInst) { + if (IntInst == nullptr) + return false; + unsigned IntrinsicID = IntInst->getIntrinsicID(); + return (IntrinsicID == Intrinsic::masked_gather || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset || + IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated || + IntrinsicID == Intrinsic::masked_scatter || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset || + IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated); +} + } // end namespace llvm #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -20,22 +20,22 @@ #include "llvm/CodeGen/TargetLowering.h" #include "llvm/CodeGen/TargetPassConfig.h" #include "llvm/CodeGen/TargetSubtargetInfo.h" +#include "llvm/InitializePasses.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/Constant.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DerivedTypes.h" #include "llvm/IR/Function.h" -#include "llvm/IR/IRBuilder.h" #include "llvm/IR/InstrTypes.h" #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" #include "llvm/IR/Intrinsics.h" #include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/IRBuilder.h" #include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/IR/Value.h" -#include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Casting.h" #include "llvm/Transforms/Utils/Local.h" @@ -74,6 +74,8 @@ } private: + LoopInfo *LI = nullptr; + // Check this is a valid gather with correct alignment bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, unsigned Alignment); @@ -82,9 +84,17 @@ // Check for a getelementptr and deduce base and offsets from it, on success // returning the base directly and the offsets indirectly using the Offsets // argument - Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> &Builder); + Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP, + IRBuilder<> &Builder); // Compute the scale of this gather/scatter instruction int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize); + // If the value is a constant, or derived from constants via additions + // and multilications, return its numeric value + Optional getIfConst(const Value *V); + // If Inst is an add instruction, check whether one summand 
is a + // constant. If so, scale this constant and return it together with + // the other summand. + std::pair getVarAndConst(Value *Inst, int TypeScale); Value *lowerGather(IntrinsicInst *I); // Create a gather from a base + vector of offsets @@ -92,7 +102,22 @@ Instruction *&Root, IRBuilder<> &Builder); // Create a gather from a vector of pointers Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, - IRBuilder<> &Builder); + IRBuilder<> &Builder, + unsigned Increment = 0); + // Create a gather from a vector of pointers + Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr, + IRBuilder<> &Builder, + unsigned Increment = 0); + // QI gathers can increment their offsets on their own if the increment is + // a constant value (digit) + Value *tryCreateIncrementingGather(IntrinsicInst *I, Value *BasePtr, + Value *Ptr, GetElementPtrInst *GEP, + IRBuilder<> &Builder); + // QI gathers can increment their offsets on their own if the increment is + // a constant value (digit) - this creates a writeback QI gather + Value *tryCreateIncrementingWBGather(IntrinsicInst *I, Value *BasePtr, + Value *Ptr, unsigned TypeScale, + IRBuilder<> &Builder); Value *lowerScatter(IntrinsicInst *I); // Create a scatter to a base + vector of offsets @@ -137,9 +162,9 @@ return false; } -Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, +Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, + GetElementPtrInst *GEP, IRBuilder<> &Builder) { - GetElementPtrInst *GEP = dyn_cast(Ptr); if (!GEP) { LLVM_DEBUG( dbgs() << "masked gathers/scatters: no getelementpointer found\n"); @@ -217,6 +242,56 @@ return -1; } +Optional MVEGatherScatterLowering::getIfConst(const Value *V) { + const Constant *C = dyn_cast(V); + if (C != nullptr) + return Optional{C->getUniqueInteger().getSExtValue()}; + if (!isa(V)) + return Optional{}; + + const Instruction *I = cast(V); + if (I->getOpcode() == Instruction::Add || + I->getOpcode() == Instruction::Mul) { + Optional Op0 = getIfConst(I->getOperand(0)); + Optional Op1 = getIfConst(I->getOperand(1)); + if (!Op0 || !Op1) + return Optional{}; + if (I->getOpcode() == Instruction::Add) + return Optional{Op0.getValue() + Op1.getValue()}; + if (I->getOpcode() == Instruction::Mul) + return Optional{Op0.getValue() * Op1.getValue()}; + } + return Optional{}; +} + +std::pair +MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) { + std::pair ReturnFalse = + std::pair(nullptr, 0); + // At this point, the instruction we're looking at must be an add or we + // bail out + Instruction *Add = dyn_cast(Inst); + if (Add == nullptr || Add->getOpcode() != Instruction::Add) + return ReturnFalse; + + Value *Summand; + Optional Const; + // Find out which operand the value that is increased is + if ((Const = getIfConst(Add->getOperand(0)))) + Summand = Add->getOperand(1); + else if ((Const = getIfConst(Add->getOperand(1)))) + Summand = Add->getOperand(0); + else + return ReturnFalse; + + // Check that the constant is small enough for an incrementing gather + int64_t Immediate = Const.getValue() << TypeScale; + if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0) + return ReturnFalse; + + return std::pair(Summand, Immediate); +} + Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) { using namespace PatternMatch; LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n"); @@ -266,7 +341,8 @@ Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr, - IRBuilder<> &Builder) { 
+ IRBuilder<> &Builder, + unsigned Increment) { using namespace PatternMatch; auto *Ty = cast(I->getType()); LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n"); @@ -277,12 +353,34 @@ if (match(Mask, m_One())) return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, {Ty, Ptr->getType()}, - {Ptr, Builder.getInt32(0)}); + {Ptr, Builder.getInt32(Increment)}); else return Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_base_predicated, {Ty, Ptr->getType(), Mask->getType()}, - {Ptr, Builder.getInt32(0), Mask}); + {Ptr, Builder.getInt32(Increment), Mask}); +} + +Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB( + IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, unsigned Increment) { + using namespace PatternMatch; + auto *Ty = cast(I->getType()); + LLVM_DEBUG( + dbgs() + << "masked gathers: loading from vector of pointers with writeback\n"); + if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) + // Can't build an intrinsic for this + return nullptr; + Value *Mask = I->getArgOperand(2); + if (match(Mask, m_One())) + return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb, + {Ty, Ptr->getType()}, + {Ptr, Builder.getInt32(Increment)}); + else + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_wb_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(Increment), Mask}); } Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset( @@ -321,10 +419,17 @@ } } + GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder); + Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder); if (!BasePtr) return nullptr; + // Check whether the offset is a constant increment that could be merged into + // a QI gather + Value *Load = + tryCreateIncrementingGather(I, BasePtr, Offsets, GEP, Builder); + if (Load) + return Load; int Scale = computeScale( BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(), @@ -348,6 +453,142 @@ Builder.getInt32(Scale), Builder.getInt32(Unsigned)}); } +Value *MVEGatherScatterLowering::tryCreateIncrementingGather( + IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP, + IRBuilder<> &Builder) { + auto *Ty = cast(I->getType()); + // Incrementing gathers only exist for v4i32 + if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32) + return nullptr; + Loop *L = LI->getLoopFor(I->getParent()); + if (L == nullptr) + // Incrementing gathers are not beneficial outside of a loop + return nullptr; + LLVM_DEBUG( + dbgs() << "masked gathers: trying to build incrementing wb gather\n"); + + // The gep was in charge of making sure the offsets are scaled correctly + // - calculate that factor so it can be applied by hand + DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout(); + int TypeScale = + computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()), + DT.getTypeSizeInBits(GEP->getType()) / + cast(GEP->getType())->getNumElements()); + if (TypeScale == -1) + return nullptr; + + if (GEP->hasOneUse()) { + // Only in this case do we want to build a wb gather, because the wb will + // change the phi which does affect other users of the gep (which will still + // be using the phi in the old way) + Value *Load = + tryCreateIncrementingWBGather(I, BasePtr, Offsets, TypeScale, Builder); + if (Load != nullptr) + return Load; + } + LLVM_DEBUG( + dbgs() << "masked gathers: trying to build incrementing non-wb gather\n"); + + std::pair Add = 
getVarAndConst(Offsets, TypeScale); + if (Add.first == nullptr) + return nullptr; + Value *OffsetsIncoming = Add.first; + int64_t Immediate = Add.second; + + // Make sure the offsets are scaled correctly + Instruction *ScaledOffsets = BinaryOperator::Create( + Instruction::Shl, OffsetsIncoming, + Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)), + "ScaledIndex", I); + // Add the base to the offsets + OffsetsIncoming = BinaryOperator::Create( + Instruction::Add, ScaledOffsets, + Builder.CreateVectorSplat( + Ty->getNumElements(), + Builder.CreatePtrToInt( + BasePtr, + cast(ScaledOffsets->getType())->getElementType())), + "StartIndex", I); + + return cast( + tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate)); +} + +Value *MVEGatherScatterLowering::tryCreateIncrementingWBGather( + IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale, + IRBuilder<> &Builder) { + // Check whether this gather's offset is incremented by a constant - if so, + // and the load is of the right type, we can merge this into a QI gather + Loop *L = LI->getLoopFor(I->getParent()); + // Offsets that are worth merging into this instruction will be incremented + // by a constant, thus we're looking for an add of a phi and a constant + PHINode *Phi = dyn_cast(Offsets); + if (Phi == nullptr || Phi->getNumIncomingValues() != 2 || + Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2) + // No phi means no IV to write back to; if there is a phi, we expect it + // to have exactly two incoming values; the only phis we are interested in + // will be loop IV's and have exactly two uses, one in their increment and + // one in the gather's gep + return nullptr; + + unsigned IncrementIndex = + Phi->getIncomingBlock(0) == L->getLoopLatch() ? 
0 : 1; + // Look through the phi to the phi increment + Offsets = Phi->getIncomingValue(IncrementIndex); + + std::pair Add = getVarAndConst(Offsets, TypeScale); + if (Add.first == nullptr) + return nullptr; + Value *OffsetsIncoming = Add.first; + int64_t Immediate = Add.second; + if (OffsetsIncoming != Phi) + // Then the increment we are looking at is not an increment of the + // induction variable, and we don't want to do a writeback + return nullptr; + + Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back()); + unsigned NumElems = + cast(OffsetsIncoming->getType())->getNumElements(); + + // Make sure the offsets are scaled correctly + Instruction *ScaledOffsets = BinaryOperator::Create( + Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex), + Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)), + "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + // Add the base to the offsets + OffsetsIncoming = BinaryOperator::Create( + Instruction::Add, ScaledOffsets, + Builder.CreateVectorSplat( + NumElems, + Builder.CreatePtrToInt( + BasePtr, + cast(ScaledOffsets->getType())->getElementType())), + "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + // The gather is pre-incrementing + OffsetsIncoming = BinaryOperator::Create( + Instruction::Sub, OffsetsIncoming, + Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)), + "PreIncrementStartIndex", + &Phi->getIncomingBlock(1 - IncrementIndex)->back()); + Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming); + + Builder.SetInsertPoint(I); + + // Build the incrementing gather + Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate); + + // One value to be handed to whoever uses the gather, one is the loop + // increment + Value *ExtractedLoad = Builder.CreateExtractValue(Load, 0, "Gather"); + Value *Inc = Builder.CreateExtractValue(Load, 1, "GatherIncrement"); + Instruction *AddInst = cast(Offsets); + AddInst->replaceAllUsesWith(Inc); + AddInst->eraseFromParent(); + Phi->setIncomingValue(IncrementIndex, Inc); + + return ExtractedLoad; +} + Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) { using namespace PatternMatch; LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n"); @@ -433,8 +674,9 @@ return nullptr; } + GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, InputTy, Ptr, Builder); + Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder); if (!BasePtr) return nullptr; int Scale = computeScale( @@ -514,27 +756,6 @@ return; } -// Return true if the given intrinsic is a gather or scatter -static bool isGatherScatter(IntrinsicInst *IntInst) { - if (IntInst == nullptr) - return false; - unsigned IntrinsicID = IntInst->getIntrinsicID(); - return (IntrinsicID == Intrinsic::masked_gather || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_base || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset || - IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated || - IntrinsicID == Intrinsic::masked_scatter || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated || - IntrinsicID 
== Intrinsic::arm_mve_vstr_scatter_offset || - IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated); -} - // Check whether all usages of this instruction are as offsets of // gathers/scatters or simple arithmetics only used by gathers/scatters static bool hasAllGatScatUsers(Instruction *I) { @@ -717,29 +938,36 @@ auto *ST = &TM.getSubtarget(F); if (!ST->hasMVEIntegerOps()) return false; + LI = &getAnalysis().getLoopInfo(); SmallVector Gathers; SmallVector Scatters; - LoopInfo &LI = getAnalysis().getLoopInfo(); for (BasicBlock &BB : F) { for (Instruction &I : BB) { IntrinsicInst *II = dyn_cast(&I); - if (II && II->getIntrinsicID() == Intrinsic::masked_gather) + if (II && II->getIntrinsicID() == Intrinsic::masked_gather) { Gathers.push_back(II); - else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) + if (isa(II->getArgOperand(0))) + optimiseOffsets( + cast(II->getArgOperand(0))->getOperand(1), + II->getParent(), LI); + } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) { Scatters.push_back(II); + if (isa(II->getArgOperand(1))) + optimiseOffsets( + cast(II->getArgOperand(1))->getOperand(1), + II->getParent(), LI); + } } } bool Changed = false; for (unsigned i = 0; i < Gathers.size(); i++) { IntrinsicInst *I = Gathers[i]; - if (isa(I->getArgOperand(0))) - optimiseOffsets(cast(I->getArgOperand(0))->getOperand(1), - I->getParent(), &LI); Value *L = lowerGather(I); if (L == nullptr) continue; + // Get rid of any now dead instructions SimplifyInstructionsInBlock(cast(L)->getParent()); Changed = true; @@ -747,12 +975,10 @@ for (unsigned i = 0; i < Scatters.size(); i++) { IntrinsicInst *I = Scatters[i]; - if (isa(I->getArgOperand(1))) - optimiseOffsets(cast(I->getArgOperand(1))->getOperand(1), - I->getParent(), &LI); Value *S = lowerScatter(I); if (S == nullptr) continue; + // Get rid of any now dead instructions SimplifyInstructionsInBlock(cast(S)->getParent()); Changed = true; diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -0,0 +1,1530 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=thumbv8.1m.main-arm-none-eabi -mattr=+mve.fp -enable-arm-maskedldst -enable-arm-maskedgatscat %s -o - | FileCheck %s + +define arm_aapcs_vfpcc <4 x i32> @gather_inc_mini_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) { +; CHECK-LABEL: gather_inc_mini_4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q1, #0x4 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr + %1 = add <4 x i32> %offs, + %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %wide.masked.gather +} + +define arm_aapcs_vfpcc <4 x i32> @gather_inc_minipred_4i32(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, <4 x i32> %offs) { +; CHECK-LABEL: gather_inc_minipred_4i32: +; CHECK: @ %bb.0: +; CHECK-NEXT: vmov.i32 q1, #0x4 +; CHECK-NEXT: movw r1, #3855 +; CHECK-NEXT: vadd.i32 q1, q0, q1 +; CHECK-NEXT: vmsr p0, r1 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrwt.u32 q0, [r0, q1, uxtw #2] +; CHECK-NEXT: bx lr + %1 = add <4 x i32> %offs, + %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1 + %wide.masked.gather = call <4 x i32> 
@llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %wide.masked.gather +} + +define arm_aapcs_vfpcc <8 x i16> @gather_inc_mini_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) { +; CHECK-LABEL: gather_inc_mini_8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.i32 q2, #0x10 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh.w lr, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} + %1 = add <8 x i32> %offs, + %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %wide.masked.gather +} + +define arm_aapcs_vfpcc <8 x i16> @gather_inc_minipred_8i16(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, <8 x i32> %offs) { +; CHECK-LABEL: gather_inc_minipred_8i16: +; CHECK: @ %bb.0: +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.i32 q2, #0x10 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[2], r3 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: bx lr + %1 = add <8 x i32> %offs, + %2 = getelementptr inbounds i16, i16* %data, <8 x i32> %1 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %2, i32 4, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %wide.masked.gather +} + +define arm_aapcs_vfpcc <16 x i8> @gather_inc_mini_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) { +; CHECK-LABEL: gather_inc_mini_16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vmov r2, s10 +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: ldrb r0, [r0] +; 
CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: vadd.i32 q3, q0, q4 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r6, s15 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vmov.8 q0[3], r6 +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r2 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r1 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r6, pc} + %1 = add <16 x i32> %offs, + %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %wide.masked.gather +} + +define arm_aapcs_vfpcc <16 x i8> @gather_inc_minipred_16i8(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, <16 x i32> %offs) { +; CHECK-LABEL: gather_inc_minipred_16i8: +; CHECK: @ %bb.0: +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s8 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov r5, s2 +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q1, q3, r0 +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q0[0], r2 +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov.8 q0[4], r12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r1 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q0[8], lr +; CHECK-NEXT: vmov.8 q0[10], r3 +; CHECK-NEXT: vmov.8 q0[12], r0 +; CHECK-NEXT: vmov.8 q0[14], r4 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} + %1 = add <16 x i32> %offs, + %2 = getelementptr inbounds i8, i8* %data, <16 x i32> %1 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %2, i32 2, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %wide.masked.gather +} + +define arm_aapcs_vfpcc void @gather_pre_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { +; CHECK-LABEL: gather_pre_inc: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: adr r3, .LCPI6_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, 
q0, r0 +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .long 0 @ 0x0 +vector.ph: ; preds = %for.body.preheader + %ind.end = shl i32 %n.vec, 1 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = mul <4 x i32> %vec.ind, + %1 = add <4 x i32> %0, + %2 = getelementptr inbounds i32, i32* %data, <4 x i32> %1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %2, i32 4, <4 x i1> , <4 x i32> undef) + %3 = getelementptr inbounds i32, i32* %dst, i32 %index + %4 = bitcast i32* %3 to <4 x i32>* + store <4 x i32> %wide.masked.gather, <4 x i32>* %4, align 4 + %index.next = add i32 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %5 = icmp eq i32 %index.next, %n.vec + br i1 %5, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @gather_post_inc(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec43) { +; CHECK-LABEL: gather_post_inc: +; CHECK: @ %bb.0: @ %vector.ph41 +; CHECK-NEXT: adr r3, .LCPI7_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: .LBB7_1: @ %vector.body39 +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! 
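+; The "[q0, #96]!" addressing checked above is the incrementing writeback
+; gather built by tryCreateIncrementingWBGather: the address vector is set up
+; once in the preheader as base + (offsets << TypeScale) minus the immediate
+; (the gather pre-increments), and each iteration the instruction adds 96 to
+; every lane, writes the result back into the phi, and loads from the updated
+; addresses. A rough sketch of the IR form, with illustrative value names and
+; an assumed type mangling (not copied from this patch):
+;   %wb         = call { <4 x i32>, <4 x i32> }
+;       @llvm.arm.mve.vldr.gather.base.wb.v4i32.v4i32(<4 x i32> %addrs, i32 96)
+;   %data       = extractvalue { <4 x i32>, <4 x i32> } %wb, 0
+;   %addrs.next = extractvalue { <4 x i32>, <4 x i32> } %wb, 1
+; %addrs.next replaces the old vector add and becomes the phi's latch value.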
+; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 +; CHECK-NEXT: bne .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +vector.ph41: ; preds = %for.body6.preheader + %ind.end47 = shl i32 %n.vec43, 1 + br label %vector.body39 + +vector.body39: ; preds = %vector.body39, %vector.ph41 + %index44 = phi i32 [ 0, %vector.ph41 ], [ %index.next45, %vector.body39 ] + %vec.ind50 = phi <4 x i32> [ , %vector.ph41 ], [ %vec.ind.next51, %vector.body39 ] + %0 = mul nuw nsw <4 x i32> %vec.ind50, + %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0 + %wide.masked.gather55 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> , <4 x i32> undef) + %2 = getelementptr inbounds i32, i32* %dst, i32 %index44 + %3 = bitcast i32* %2 to <4 x i32>* + store <4 x i32> %wide.masked.gather55, <4 x i32>* %3, align 4 + %index.next45 = add i32 %index44, 4 + %vec.ind.next51 = add <4 x i32> %vec.ind50, + %4 = icmp eq i32 %index.next45, %n.vec43 + br i1 %4, label %end, label %vector.body39 + +end: + ret void; +} + +define arm_aapcs_vfpcc void @gather_inc_v4i32_simple(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v4i32_simple: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: bic r12, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: add.w r4, r3, lr, lsr #2 +; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: .LBB8_1: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB8_2 Depth 2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: .LBB8_2: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB8_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrw.u32 q2, [q1, #16]! 
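+; The "#16" immediate above is produced by getVarAndConst: the offset vector
+; advances by 4 i32 elements per iteration and, with TypeScale = 2 for 32-bit
+; data, the byte increment is
+;   Immediate = 4 << 2 = 16
+; which is a multiple of 4 within +/-512, so it can be folded into the gather.
+; Because the GEP has a single use, the writeback ("!") form is chosen and the
+; updated address vector is fed straight back into the phi.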
+; CHECK-NEXT: vstrb.8 q2, [r0], #16 +; CHECK-NEXT: le lr, .LBB8_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB8_1 Depth=1 +; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: bne .LBB8_1 +; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .long 4294967292 @ 0xfffffffc +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -4 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> , <4 x i32> undef) + %1 = getelementptr inbounds i32, i32* %dst, i32 %index + %2 = bitcast i32* %1 to <4 x i32>* + store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4 + %index.next = add i32 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + +define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v4i32_complex: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: blt .LBB9_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader +; CHECK-NEXT: bic r12, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: adr r4, .LCPI9_1 +; CHECK-NEXT: adr r5, .LCPI9_2 +; CHECK-NEXT: vldrw.u32 q1, [r4] +; CHECK-NEXT: add.w r3, r3, lr, lsr #2 +; CHECK-NEXT: adr.w lr, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: .LBB9_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB9_3 Depth 2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov q5, q2 +; CHECK-NEXT: dls lr, r3 +; CHECK-NEXT: .LBB9_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB9_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrw.u32 q6, [q5, #48]! +; CHECK-NEXT: vldrw.u32 q7, [q3, #48]! +; CHECK-NEXT: vadd.i32 q6, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [q4, #48]! 
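+; The three gathers above read the interleaved i32 streams data[3*i],
+; data[3*i+1] and data[3*i+2], so each address vector steps by
+;   4 (induction step) * 3 (interleave) * 4 bytes = 48 bytes
+; per iteration. The pools LCPI9_0/1/2 hold the pre-decremented lane start
+; addresses of the three streams, e.g. for the first one:
+;   lane k: 4*(3*k) - 48  ->  -48, -36, -24, -12  (0xffffffd0 ... 0xfffffff4)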
+; CHECK-NEXT: vadd.i32 q6, q6, q7 +; CHECK-NEXT: vstrb.8 q6, [r0], #16 +; CHECK-NEXT: le lr, .LBB9_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB9_2 Depth=1 +; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: bne .LBB9_2 +; CHECK-NEXT: .LBB9_5: @ %for.cond.cleanup +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: pop {r4, r5, r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967260 @ 0xffffffdc +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .long 4294967284 @ 0xfffffff4 +; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .long 4294967252 @ 0xffffffd4 +; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 +; CHECK-NEXT: .long 4294967276 @ 0xffffffec +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .LCPI9_2: +; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 +; CHECK-NEXT: .long 4294967268 @ 0xffffffe4 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .long 4294967292 @ 0xfffffffc +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -4 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <4 x i32> %vec.ind, + %1 = getelementptr inbounds i32, i32* %data, <4 x i32> %0 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> , <4 x i32> undef) + %2 = add nuw nsw <4 x i32> %0, + %3 = getelementptr inbounds i32, i32* %data, <4 x i32> %2 + %wide.masked.gather24 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %3, i32 4, <4 x i1> , <4 x i32> undef) + %4 = add nuw nsw <4 x i32> %0, + %5 = getelementptr inbounds i32, i32* %data, <4 x i32> %4 + %wide.masked.gather25 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %5, i32 4, <4 x i1> , <4 x i32> undef) + %6 = add nsw <4 x i32> %wide.masked.gather24, %wide.masked.gather + %7 = add nsw <4 x i32> %6, %wide.masked.gather25 + %8 = getelementptr inbounds i32, i32* %dst, i32 %index + %9 = bitcast i32* %8 to <4 x i32>* + store <4 x i32> %7, <4 x i32>* %9, align 4 + %index.next = add i32 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + +define arm_aapcs_vfpcc void @gather_inc_v4i32_large(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v4i32_large: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r4, pc} +; CHECK-NEXT: bic r12, r2, #3 +; CHECK-NEXT: movs r3, #1 +; CHECK-NEXT: sub.w lr, r12, #4 +; CHECK-NEXT: add.w r4, r3, lr, lsr #2 +; CHECK-NEXT: adr r3, .LCPI10_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: .LBB10_1: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB10_2 Depth 2 +; CHECK-NEXT: mov r0, r1 +; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: dls lr, r4 +; CHECK-NEXT: .LBB10_2: @ %vector.body +; 
CHECK-NEXT: @ Parent Loop BB10_1 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vldrw.u32 q2, [q1, #508]! +; CHECK-NEXT: vstrb.8 q2, [r0], #16 +; CHECK-NEXT: le lr, .LBB10_2 +; CHECK-NEXT: @ %bb.3: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB10_1 Depth=1 +; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.4: @ %for.cond.cleanup +; CHECK-NEXT: pop {r4, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.5: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .long 4294966788 @ 0xfffffe04 +; CHECK-NEXT: .long 4294966792 @ 0xfffffe08 +; CHECK-NEXT: .long 4294966796 @ 0xfffffe0c +; CHECK-NEXT: .long 4294966800 @ 0xfffffe10 +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -4 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <4 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i32, i32* %data, <4 x i32> %vec.ind + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %0, i32 4, <4 x i1> , <4 x i32> undef) + %1 = getelementptr inbounds i32, i32* %dst, i32 %index + %2 = bitcast i32* %1 to <4 x i32>* + store <4 x i32> %wide.masked.gather, <4 x i32>* %2, align 4 + %index.next = add i32 %index, 4 + %vec.ind.next = add <4 x i32> %vec.ind, + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + +; TODO: uneven - I think it's not possible to create such an example, because vec.ind will always be increased by a vector with 4 elements (=> x*4 = even) + +; TODO: What is sxth? 
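+; None of the i16/i8 tests below become incrementing gathers:
+; tryCreateIncrementingGather only fires for v4i32 results (as the pass notes,
+; incrementing gathers only exist for v4i32), and getVarAndConst additionally
+; requires
+;   Immediate % 4 == 0  and  -512 <= Immediate <= 512,
+; which gather_inc_v4i32_large above sits close to (127 << 2 = 508).
+; As for the sxth question: the vmovlb.s16 instructions in the expanded code
+; below are roughly the vector analogue of a scalar sxth, sign-extending the
+; i16 offsets to i32 before they are shifted by #1 and added to the base.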
+define arm_aapcs_vfpcc void @gather_inc_v8i16_simple(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v8i16_simple: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .pad #8 +; CHECK-NEXT: sub sp, #8 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: blt .LBB11_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader +; CHECK-NEXT: bic r1, r2, #7 +; CHECK-NEXT: movs r6, #1 +; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: vmov.i16 q1, #0x8 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill +; CHECK-NEXT: add.w r8, r6, r3, lsr #3 +; CHECK-NEXT: adr r3, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: .LBB11_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB11_3 Depth 2 +; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: ldr r3, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: .LBB11_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB11_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vmov.u16 r7, q2[4] +; CHECK-NEXT: vmov.u16 r5, q2[0] +; CHECK-NEXT: vmov.32 q4[0], r7 +; CHECK-NEXT: vmov.u16 r7, q2[5] +; CHECK-NEXT: vmov.32 q4[1], r7 +; CHECK-NEXT: vmov.u16 r7, q2[6] +; CHECK-NEXT: vmov.32 q4[2], r7 +; CHECK-NEXT: vmov.u16 r7, q2[7] +; CHECK-NEXT: vmov.32 q4[3], r7 +; CHECK-NEXT: vmov.32 q3[0], r5 +; CHECK-NEXT: vmovlb.s16 q4, q4 +; CHECK-NEXT: vmov.u16 r5, q2[1] +; CHECK-NEXT: vshl.i32 q4, q4, #1 +; CHECK-NEXT: vmov.32 q3[1], r5 +; CHECK-NEXT: vadd.i32 q4, q4, r0 +; CHECK-NEXT: vmov.u16 r5, q2[2] +; CHECK-NEXT: vmov r7, s16 +; CHECK-NEXT: vmov.32 q3[2], r5 +; CHECK-NEXT: vmov.u16 r5, q2[3] +; CHECK-NEXT: vmov r4, s17 +; CHECK-NEXT: vmov.32 q3[3], r5 +; CHECK-NEXT: vadd.i16 q2, q2, q1 +; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: vmov r12, s13 +; CHECK-NEXT: ldrh.w r11, [r7] +; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh.w r9, [r5] +; CHECK-NEXT: vmov r5, s18 +; CHECK-NEXT: ldrh.w r10, [r6] +; CHECK-NEXT: vmov r6, s19 +; CHECK-NEXT: ldrh.w r1, [r12] +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: vmov.16 q3[0], r7 +; CHECK-NEXT: vmov.16 q3[1], r1 +; CHECK-NEXT: vmov.16 q3[2], r10 +; CHECK-NEXT: vmov.16 q3[3], r9 +; CHECK-NEXT: vmov.16 q3[4], r11 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q3[5], r4 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmov.16 q3[6], r5 +; CHECK-NEXT: vmov.16 q3[7], r6 +; CHECK-NEXT: vstrb.8 q3, [r3], #16 +; CHECK-NEXT: le lr, .LBB11_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB11_2 Depth=1 +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: bne .LBB11_2 +; CHECK-NEXT: .LBB11_5: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #8 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 7 @ 0x7 + + +entry: + %cmp22 = 
icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i16> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i16, i16* %data, <8 x i16> %vec.ind + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %0, i32 2, <8 x i1> , <8 x i16> undef) + %1 = getelementptr inbounds i16, i16* %dst, i32 %index + %2 = bitcast i16* %1 to <8 x i16>* + store <8 x i16> %wide.masked.gather, <8 x i16>* %2, align 2 + %index.next = add i32 %index, 8 + %vec.ind.next = add <8 x i16> %vec.ind, + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + +; TODO: This looks absolutely terrifying :( +define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(i16* noalias nocapture readonly %data, i16* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v8i16_complex: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #104 +; CHECK-NEXT: sub sp, #104 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill +; CHECK-NEXT: blt.w .LBB12_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader +; CHECK-NEXT: bic r1, r2, #7 +; CHECK-NEXT: movs r7, #1 +; CHECK-NEXT: sub.w r3, r1, #8 +; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: vmov.i16 q3, #0x18 +; CHECK-NEXT: str r1, [sp, #56] @ 4-byte Spill +; CHECK-NEXT: add.w r8, r7, r3, lsr #3 +; CHECK-NEXT: adr r7, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: adr r3, .LCPI12_0 +; CHECK-NEXT: vstrw.32 q3, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .LBB12_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB12_3 Depth 2 +; CHECK-NEXT: dls lr, r8 +; CHECK-NEXT: ldr r3, [sp, #60] @ 4-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: .LBB12_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB12_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vmov.u16 r4, q5[0] +; CHECK-NEXT: vmov.u16 r7, q7[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q5[2] +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.u16 r4, q5[3] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: vmov.u16 r12, q6[0] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmov.32 q1[0], r12 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov.u16 r1, q6[1] +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov.32 q1[1], r1 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vmov.u16 r1, 
q6[2] +; CHECK-NEXT: vmov.32 q1[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[3] +; CHECK-NEXT: vmov.32 q1[3], r1 +; CHECK-NEXT: vmov.u16 r1, q6[4] +; CHECK-NEXT: vmovlb.s16 q1, q1 +; CHECK-NEXT: vmov r6, s11 +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vadd.i32 q4, q1, r0 +; CHECK-NEXT: ldrh.w r9, [r4] +; CHECK-NEXT: vmov.u16 r4, q5[4] +; CHECK-NEXT: vmov.32 q0[0], r4 +; CHECK-NEXT: vmov.u16 r4, q5[5] +; CHECK-NEXT: vmov.32 q0[1], r4 +; CHECK-NEXT: vmov.u16 r4, q5[6] +; CHECK-NEXT: vmov.32 q0[2], r4 +; CHECK-NEXT: vmov.u16 r4, q5[7] +; CHECK-NEXT: vmov.32 q0[3], r4 +; CHECK-NEXT: ldrh r6, [r6] +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vmov r5, s3 +; CHECK-NEXT: ldrh.w r10, [r4] +; CHECK-NEXT: vmov r4, s1 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh.w r11, [r4] +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.32 q0[0], r7 +; CHECK-NEXT: vmov.u16 r7, q7[5] +; CHECK-NEXT: vmov.32 q0[1], r7 +; CHECK-NEXT: vmov.u16 r7, q7[6] +; CHECK-NEXT: vmov.32 q0[2], r7 +; CHECK-NEXT: vmov.u16 r7, q7[7] +; CHECK-NEXT: vmov.32 q0[3], r7 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r7, s2 +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vmov.32 q0[0], r1 +; CHECK-NEXT: vmov.u16 r1, q6[5] +; CHECK-NEXT: vmov.32 q0[1], r1 +; CHECK-NEXT: vmov.u16 r1, q6[6] +; CHECK-NEXT: vmov.32 q0[2], r1 +; CHECK-NEXT: vmov.u16 r1, q6[7] +; CHECK-NEXT: vmov.32 q0[3], r1 +; CHECK-NEXT: vmov.u16 r1, q7[0] +; CHECK-NEXT: vmov.32 q3[0], r1 +; CHECK-NEXT: vmov.u16 r1, q7[1] +; CHECK-NEXT: vmov.32 q3[1], r1 +; CHECK-NEXT: vmov.u16 r1, q7[2] +; CHECK-NEXT: vmov.32 q3[2], r1 +; CHECK-NEXT: vmov.u16 r1, q7[3] +; CHECK-NEXT: vmov.32 q3[3], r1 +; CHECK-NEXT: vmov r1, s8 +; CHECK-NEXT: vmovlb.s16 q0, q0 +; CHECK-NEXT: vmovlb.s16 q3, q3 +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r7, [r7] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q1[0], r1 +; CHECK-NEXT: vmov r1, s9 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q1[1], r1 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: vmov.16 q1[2], r9 +; CHECK-NEXT: vmov.16 q1[3], r6 +; CHECK-NEXT: vmov.16 q1[4], r10 +; CHECK-NEXT: vmov.16 q1[5], r11 +; CHECK-NEXT: vmov.16 q1[6], r4 +; CHECK-NEXT: vmov.16 q1[7], r5 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[0], r1 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[1], r1 +; CHECK-NEXT: vmov r1, s18 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[2], r1 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vldrw.u32 q4, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[3], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q2[7], r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[0], r1 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[1], r1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[2], r1 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: 
vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vadd.i16 q6, q6, q3 +; CHECK-NEXT: vadd.i16 q5, q5, q3 +; CHECK-NEXT: vadd.i16 q7, q7, q3 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[3], r1 +; CHECK-NEXT: vmov r1, s16 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[4], r1 +; CHECK-NEXT: vmov r1, s17 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov r1, s19 +; CHECK-NEXT: vmov.16 q0[6], r7 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[7], r1 +; CHECK-NEXT: vadd.i16 q0, q0, q2 +; CHECK-NEXT: vadd.i16 q0, q0, q1 +; CHECK-NEXT: vstrb.8 q0, [r3], #16 +; CHECK-NEXT: le lr, .LBB12_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB12_2 Depth=1 +; CHECK-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-NEXT: cmp r1, r2 +; CHECK-NEXT: bne.w .LBB12_2 +; CHECK-NEXT: .LBB12_5: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #104 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 + + +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <8 x i16> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <8 x i16> %vec.ind, + %1 = getelementptr inbounds i16, i16* %data, <8 x i16> %0 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 2, <8 x i1> , <8 x i16> undef) + %2 = add nuw nsw <8 x i16> %0, + %3 = getelementptr inbounds i16, i16* %data, <8 x i16> %2 + %wide.masked.gather24 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %3, i32 2, <8 x i1> , <8 x i16> undef) + %4 = add nuw nsw <8 x i16> %0, + %5 = getelementptr inbounds i16, i16* %data, <8 x i16> %4 + %wide.masked.gather25 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %5, i32 2, <8 x i1> , <8 x i16> undef) + %6 = add nsw <8 x i16> %wide.masked.gather24, %wide.masked.gather + %7 = add nsw <8 x i16> %6, %wide.masked.gather25 + %8 = getelementptr inbounds i16, i16* %dst, i32 %index + %9 = bitcast i16* %8 to <8 x i16>* + store <8 x i16> %7, <8 x i16>* %9, align 2 + %index.next = add i32 %index, 8 + %vec.ind.next = add <8 x i16> %vec.ind, + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + 
+for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + + +define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v16i8_complex: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #328 +; CHECK-NEXT: sub sp, #328 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: str r1, [sp, #120] @ 4-byte Spill +; CHECK-NEXT: mov r1, r2 +; CHECK-NEXT: str r2, [sp, #124] @ 4-byte Spill +; CHECK-NEXT: blt.w .LBB13_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader +; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: adr.w r6, .LCPI13_8 +; CHECK-NEXT: adr.w r7, .LCPI13_7 +; CHECK-NEXT: adr.w r3, .LCPI13_6 +; CHECK-NEXT: bic r11, r1, #7 +; CHECK-NEXT: adr r1, .LCPI13_0 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_1 +; CHECK-NEXT: vmov.i32 q5, #0x30 +; CHECK-NEXT: str.w r11, [sp, #116] @ 4-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_5 +; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr.w r6, .LCPI13_9 +; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .LBB13_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB13_3 Depth 2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: adr r1, .LCPI13_3 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: adr r1, .LCPI13_4 +; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: adr r1, .LCPI13_2 +; CHECK-NEXT: vstrw.32 q2, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_10 +; CHECK-NEXT: vstrw.32 q2, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r1, .LCPI13_11 +; CHECK-NEXT: ldr.w r9, [sp, #120] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [r1] +; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: .LBB13_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB13_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vstrw.32 q3, [sp, #240] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q3, q6, r0 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vstrw.32 q1, [sp, #256] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q7, r0 +; CHECK-NEXT: 
vstrw.32 q6, [sp, #160] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q7, [sp, #144] @ 16-byte Spill +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vldrw.u32 q2, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q6, r0 +; CHECK-NEXT: vstrw.32 q4, [sp, #128] @ 16-byte Spill +; CHECK-NEXT: subs.w r11, r11, #16 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r6, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb.w r10, [r1] +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrb r4, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb.w r8, [r1] +; CHECK-NEXT: vmov r1, s24 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov r1, s25 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[1], r1 +; CHECK-NEXT: vmov r1, s26 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[2], r1 +; CHECK-NEXT: vmov r1, s27 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[3], r1 +; CHECK-NEXT: vmov r1, s12 +; CHECK-NEXT: vmov.8 q7[4], r6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[0], r1 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[1], r1 +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vadd.i32 q3, q4, r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #224] @ 16-byte Reload +; CHECK-NEXT: vmov r2, s12 +; CHECK-NEXT: vmov r3, s15 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[2], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.8 q6[3], r12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: vldrw.u32 q1, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: vstrw.32 q1, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[4], r1 +; CHECK-NEXT: vmov r1, s13 +; CHECK-NEXT: vmov.8 q6[5], lr +; CHECK-NEXT: vmov.8 q6[6], r8 +; CHECK-NEXT: vmov.8 q6[7], r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vldrw.u32 q0, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s0 +; CHECK-NEXT: ldrb r7, [r1] +; CHECK-NEXT: vmov r1, s14 +; CHECK-NEXT: vldrw.u32 q3, [sp, #208] @ 16-byte Reload +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q7[5], r5 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov.8 q7[6], r10 +; CHECK-NEXT: vmov.8 q7[7], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.8 q7[8], r2 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q7[9], r7 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[10], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: vmov.8 q7[11], r3 +; CHECK-NEXT: vmov.8 q7[12], r6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: vmov.8 q7[13], r5 +; CHECK-NEXT: vmov.8 q7[14], r4 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[8], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[9], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[10], r1 +; CHECK-NEXT: vmov r1, s7 +; CHECK-NEXT: vadd.i32 q1, q2, r0 +; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[11], r1 +; CHECK-NEXT: vmov r1, s4 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[12], r1 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[13], r1 +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[14], r1 +; CHECK-NEXT: vmov r1, s7 +; 
CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q6[15], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q4, r0 +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vstrw.32 q4, [sp, #224] @ 16-byte Spill +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vadd.i8 q6, q7, q6 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov.8 q7[1], r2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[2], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vldrw.u32 q0, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vldrw.u32 q4, [sp, #272] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vstrw.32 q4, [sp, #272] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #304] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vstrw.32 q4, [sp, #304] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #128] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[3], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[4], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[5], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vadd.i32 q3, q3, q5 +; CHECK-NEXT: vstrw.32 q3, [sp, #208] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #240] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q3, q3, q5 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[7], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[8], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[9], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[10], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vadd.i32 q0, q2, r0 +; CHECK-NEXT: vadd.i32 q2, q2, q5 +; CHECK-NEXT: vstrw.32 q2, [sp, #192] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #288] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q2, q2, q5 +; CHECK-NEXT: vstrw.32 q2, [sp, #288] @ 16-byte Spill +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[11], r1 +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[13], r1 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vadd.i8 q0, q6, q7 +; CHECK-NEXT: vldrw.u32 q7, [sp, #144] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #160] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q0, [r9], #16 +; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q7, q7, q5 +; CHECK-NEXT: vadd.i32 q6, q6, q5 +; CHECK-NEXT: vadd.i32 q0, q0, q5 +; CHECK-NEXT: bne.w .LBB13_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB13_2 Depth=1 +; CHECK-NEXT: ldr r1, [sp, #124] @ 4-byte Reload +; CHECK-NEXT: ldr.w r11, [sp, #116] @ 4-byte Reload +; CHECK-NEXT: cmp r11, r1 +; CHECK-NEXT: bne.w .LBB13_2 +; CHECK-NEXT: .LBB13_5: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #328 +; CHECK-NEXT: vpop 
{d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI13_0: +; CHECK-NEXT: .long 38 @ 0x26 +; CHECK-NEXT: .long 41 @ 0x29 +; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 47 @ 0x2f +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .long 23 @ 0x17 +; CHECK-NEXT: .LCPI13_2: +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 +; CHECK-NEXT: .LCPI13_3: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI13_4: +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 39 @ 0x27 +; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI13_5: +; CHECK-NEXT: .long 25 @ 0x19 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 31 @ 0x1f +; CHECK-NEXT: .long 34 @ 0x22 +; CHECK-NEXT: .LCPI13_6: +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 19 @ 0x13 +; CHECK-NEXT: .long 22 @ 0x16 +; CHECK-NEXT: .LCPI13_7: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .LCPI13_8: +; CHECK-NEXT: .long 26 @ 0x1a +; CHECK-NEXT: .long 29 @ 0x1d +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 35 @ 0x23 +; CHECK-NEXT: .LCPI13_9: +; CHECK-NEXT: .long 37 @ 0x25 +; CHECK-NEXT: .long 40 @ 0x28 +; CHECK-NEXT: .long 43 @ 0x2b +; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .LCPI13_10: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .long 21 @ 0x15 +; CHECK-NEXT: .LCPI13_11: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 + + +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <16 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = mul nuw nsw <16 x i32> %vec.ind, + %1 = getelementptr inbounds i8, i8* %data, <16 x i32> %0 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 2, <16 x i1> , <16 x i8> undef) + %2 = add nuw nsw <16 x i32> %0, + %3 = getelementptr inbounds i8, i8* %data, <16 x i32> %2 + %wide.masked.gather24 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %3, i32 2, <16 x i1> , <16 x i8> undef) + %4 = add nuw nsw <16 x i32> %0, + %5 = getelementptr inbounds i8, i8* %data, <16 x i32> %4 + %wide.masked.gather25 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %5, i32 2, <16 x i1> , <16 x i8> undef) + %6 = add nsw <16 x i8> %wide.masked.gather24, %wide.masked.gather + %7 = add nsw <16 x i8> %6, %wide.masked.gather25 + %8 = getelementptr inbounds i8, i8* %dst, i32 %index + %9 = bitcast i8* %8 to <16 x i8>* + store <16 x i8> %7, <16 x i8>* %9, align 2 + %index.next = add i32 %index, 16 + %vec.ind.next = add <16 x i32> %vec.ind, + %10 = icmp eq i32 %index.next, %n.vec + br i1 %10, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label 
%vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + +define arm_aapcs_vfpcc void @gather_inc_v16i8_simple(i8* noalias nocapture readonly %data, i8* noalias nocapture %dst, i32 %n) { +; CHECK-LABEL: gather_inc_v16i8_simple: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .pad #72 +; CHECK-NEXT: sub sp, #72 +; CHECK-NEXT: cmp r2, #1 +; CHECK-NEXT: blt.w .LBB14_5 +; CHECK-NEXT: @ %bb.1: @ %vector.ph.preheader +; CHECK-NEXT: adr r5, .LCPI14_3 +; CHECK-NEXT: adr r7, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q0, [r5] +; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: adr r3, .LCPI14_0 +; CHECK-NEXT: bic r12, r2, #7 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r7] +; CHECK-NEXT: vmov.i32 q4, #0x10 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: .LBB14_2: @ %vector.ph +; CHECK-NEXT: @ =>This Loop Header: Depth=1 +; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 +; CHECK-NEXT: vldrw.u32 q5, [sp] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: mov lr, r1 +; CHECK-NEXT: mov r3, r12 +; CHECK-NEXT: .LBB14_3: @ %vector.body +; CHECK-NEXT: @ Parent Loop BB14_2 Depth=1 +; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 +; CHECK-NEXT: vadd.i32 q1, q7, r0 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vadd.i32 q3, q5, r0 +; CHECK-NEXT: vmov r6, s12 +; CHECK-NEXT: subs r3, #16 +; CHECK-NEXT: vmov r5, s8 +; CHECK-NEXT: vadd.i32 q5, q5, q4 +; CHECK-NEXT: vadd.i32 q7, q7, q4 +; CHECK-NEXT: vadd.i32 q0, q0, q4 +; CHECK-NEXT: ldrb.w r8, [r4] +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r10, [r4] +; CHECK-NEXT: vmov r4, s9 +; CHECK-NEXT: ldrb.w r9, [r4] +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: ldrb.w r11, [r4] +; CHECK-NEXT: vmov r4, s11 +; CHECK-NEXT: vmov.8 q2[0], r6 +; CHECK-NEXT: vmov r6, s13 +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q2[1], r6 +; CHECK-NEXT: vmov r6, s14 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: vmov.8 q2[2], r6 +; CHECK-NEXT: vmov r6, s15 +; CHECK-NEXT: vadd.i32 q3, q6, r0 +; CHECK-NEXT: vadd.i32 q6, q6, q4 +; CHECK-NEXT: vmov r7, s12 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q2[3], r6 +; CHECK-NEXT: vmov r6, s5 +; CHECK-NEXT: vmov.8 q2[4], r7 +; CHECK-NEXT: vmov r7, s13 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q2[5], r7 +; CHECK-NEXT: vmov r7, s14 +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q2[6], r7 +; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q2[7], r7 +; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q2[8], r7 +; CHECK-NEXT: vmov.8 q2[9], r6 +; CHECK-NEXT: vmov.8 q2[10], r8 +; CHECK-NEXT: vmov.8 q2[11], r10 +; CHECK-NEXT: vmov.8 q2[12], r5 +; CHECK-NEXT: vmov.8 q2[13], r9 +; CHECK-NEXT: vmov.8 q2[14], r11 +; CHECK-NEXT: vmov.8 q2[15], 
r4 +; CHECK-NEXT: vstrb.8 q2, [lr], #16 +; CHECK-NEXT: bne .LBB14_3 +; CHECK-NEXT: @ %bb.4: @ %middle.block +; CHECK-NEXT: @ in Loop: Header=BB14_2 Depth=1 +; CHECK-NEXT: cmp r12, r2 +; CHECK-NEXT: bne .LBB14_2 +; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup +; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.6: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .LCPI14_2: +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .LCPI14_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 15 @ 0xf + + +entry: + %cmp22 = icmp sgt i32 %n, 0 + br i1 %cmp22, label %vector.ph, label %for.cond.cleanup + +vector.ph: ; preds = %for.body.preheader + %n.vec = and i32 %n, -8 + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.ind = phi <16 x i32> [ , %vector.ph ], [ %vec.ind.next, %vector.body ] + %0 = getelementptr inbounds i8, i8* %data, <16 x i32> %vec.ind + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %0, i32 2, <16 x i1> , <16 x i8> undef) + %1 = getelementptr inbounds i8, i8* %dst, i32 %index + %2 = bitcast i8* %1 to <16 x i8>* + store <16 x i8> %wide.masked.gather, <16 x i8>* %2, align 2 + %index.next = add i32 %index, 16 + %vec.ind.next = add <16 x i32> %vec.ind, + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %cmp.n = icmp eq i32 %n.vec, %n + br i1 %cmp.n, label %for.cond.cleanup, label %vector.ph + +for.cond.cleanup: ; preds = %for.body, %middle.block, %entry + ret void +} + + +declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) +declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>) +declare <16 x i32> @llvm.masked.gather.v16i32.v16p0i32(<16 x i32*>, i32, <16 x i1>, <16 x i32>) +declare <2 x float> @llvm.masked.gather.v2f32.v2p0f32(<2 x float*>, i32, <2 x i1>, <2 x float>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <8 x float> @llvm.masked.gather.v8f32.v8p0f32(<8 x float*>, i32, <8 x i1>, <8 x float>) +declare <2 x i16> @llvm.masked.gather.v2i16.v2p0i16(<2 x i16*>, i32, <2 x i1>, <2 x i16>) +declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <16 x i16> @llvm.masked.gather.v16i16.v16p0i16(<16 x i16*>, i32, <16 x i1>, <16 x i16>) +declare <4 x half> @llvm.masked.gather.v4f16.v4p0f16(<4 x half*>, i32, <4 x i1>, <4 x half>) +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) +declare <16 x half> @llvm.masked.gather.v16f16.v16p0f16(<16 x half*>, i32, <16 x i1>, <16 x half>) +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, 
<4 x i1>, <4 x i8>) +declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) +declare <32 x i8> @llvm.masked.gather.v32i8.v32p0i8(<32 x i8*>, i32, <32 x i1>, <32 x i8>) +declare void @llvm.masked.store.v4i32.p0v4i32(<4 x i32>, <4 x i32>*, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll @@ -20,24 +20,23 @@ ; CHECK-LABEL: push_out_mul_gather: ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: adr r3, .LCPI0_0 -; CHECK-NEXT: vmov.i32 q0, #0x18 -; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB0_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB0_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI0_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 vector.ph: ; preds = %for.body.preheader %ind.end = shl i32 %n.vec, 1 @@ -65,24 +64,23 @@ ; CHECK-LABEL: push_out_add_gather: ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: adr r3, .LCPI1_0 -; CHECK-NEXT: vmov.i32 q1, #0x8 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q1, [q0, #32]! ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 16 @ 0x10 vector.ph: ; preds = %for.body.preheader %ind.end = shl i32 %n.vec, 1 @@ -110,24 +108,23 @@ ; CHECK-LABEL: push_out_mul_add_gather: ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: adr r3, .LCPI2_0 -; CHECK-NEXT: vmov.i32 q0, #0x18 -; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q1, [q0, #96]! 
; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q1, q1, q0 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .long 0 @ 0x0 vector.ph: ; preds = %for.body.preheader %ind.end = shl i32 %n.vec, 1 @@ -287,24 +284,23 @@ ; CHECK-LABEL: push_out_add_sub_block: ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: adr r3, .LCPI6_0 -; CHECK-NEXT: vmov.i32 q1, #0x8 ; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vadd.i32 q0, q0, r0 ; CHECK-NEXT: .LBB6_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vldrw.u32 q1, [q0, #32]! ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vstrb.8 q2, [r1], #16 +; CHECK-NEXT: vstrb.8 q1, [r1], #16 ; CHECK-NEXT: bne .LBB6_1 ; CHECK-NEXT: @ %bb.2: @ %end ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI6_0: -; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 +; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 16 @ 0x10 vector.ph: ; preds = %for.body.preheader %ind.end = shl i32 %n.vec, 1 @@ -337,24 +333,23 @@ define arm_aapcs_vfpcc void @non_gatscat_use1(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: non_gatscat_use1: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r3, .LCPI7_0 -; CHECK-NEXT: vmov.i32 q1, #0x8 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vmov.i32 q2, #0x6 -; CHECK-NEXT: vmov.i32 q3, #0x3 +; CHECK-NEXT: vmov.i32 q0, #0x8 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: vmov.i32 q1, #0xc ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmul.i32 q4, q0, q3 +; CHECK-NEXT: vadd.i32 q3, q2, q0 +; CHECK-NEXT: vmlas.u32 q2, q1, r0 +; CHECK-NEXT: vldrw.u32 q4, [q2, #24] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2] -; CHECK-NEXT: vstrb.8 q5, [r1], #16 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: @@ -391,24 +386,23 @@ define arm_aapcs_vfpcc void @non_gatscat_use2(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n.vec) { ; CHECK-LABEL: non_gatscat_use2: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r3, .LCPI8_0 -; CHECK-NEXT: vmov.i32 q1, #0x8 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vmov.i32 q2, #0x6 -; CHECK-NEXT: vmov.i32 q3, #0x3 +; CHECK-NEXT: vmov.i32 q0, #0x8 +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: vmov.i32 q1, #0xc ; CHECK-NEXT: .LBB8_1: @ 
%vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmul.i32 q4, q0, q3 +; CHECK-NEXT: vadd.i32 q3, q2, q0 +; CHECK-NEXT: vmlas.u32 q2, q1, r0 +; CHECK-NEXT: vldrw.u32 q4, [q2, #24] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q5, [r0, q4, uxtw #2] -; CHECK-NEXT: vstrb.8 q5, [r1], #16 +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vstrb.8 q4, [r1], #16 ; CHECK-NEXT: bne .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: @@ -451,22 +445,22 @@ ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: ldrd r9, r12, [sp, #144] +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: ldrd r9, r12, [sp, #128] ; CHECK-NEXT: sub.w r7, r12, #1 ; CHECK-NEXT: movs r6, #1 -; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: adr r5, .LCPI9_0 ; CHECK-NEXT: add.w r7, r6, r7, lsr #1 -; CHECK-NEXT: vmov.i32 q3, #0x8 +; CHECK-NEXT: vdup.32 q1, r9 ; CHECK-NEXT: bic r7, r7, #3 +; CHECK-NEXT: vldrw.u32 q2, [r5] ; CHECK-NEXT: subs r7, #4 -; CHECK-NEXT: add.w r11, r6, r7, lsr #2 -; CHECK-NEXT: adr r7, .LCPI9_0 -; CHECK-NEXT: vldrw.u32 q0, [r7] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vdup.32 q0, r9 -; CHECK-NEXT: vshl.i32 q2, q0, #3 +; CHECK-NEXT: mov.w r8, #0 +; CHECK-NEXT: vshl.i32 q3, q1, #3 +; CHECK-NEXT: add.w r7, r6, r7, lsr #2 +; CHECK-NEXT: adr r6, .LCPI9_1 +; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB9_1: @ %for.cond8.preheader.us.us.preheader ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -474,28 +468,29 @@ ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 ; CHECK-NEXT: mul r10, r8, r9 ; CHECK-NEXT: movs r5, #0 -; CHECK-NEXT: mul r7, r8, r12 +; CHECK-NEXT: mul r11, r8, r12 ; CHECK-NEXT: .LBB9_2: @ %vector.ph ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ => This Loop Header: Depth=2 ; CHECK-NEXT: @ Child Loop BB9_3 Depth 3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.32 q5, r11 +; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vshl.i32 q5, q5, #2 +; CHECK-NEXT: vmov q6, q1 +; CHECK-NEXT: vadd.i32 q5, q5, r0 ; CHECK-NEXT: vmov.i32 q4, #0x0 -; CHECK-NEXT: dls lr, r11 -; CHECK-NEXT: vadd.i32 q5, q0, r7 -; CHECK-NEXT: vmlas.u32 q6, q0, r5 +; CHECK-NEXT: vadd.i32 q5, q5, q0 +; CHECK-NEXT: vmlas.u32 q6, q2, r5 +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB9_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB9_1 Depth=1 ; CHECK-NEXT: @ Parent Loop BB9_2 Depth=2 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=3 -; CHECK-NEXT: vadd.i32 q1, q5, q3 -; CHECK-NEXT: vldrw.u32 q0, [r0, q5, uxtw #2] -; CHECK-NEXT: vldrw.u32 q5, [r1, q6, uxtw #2] -; CHECK-NEXT: vadd.i32 q7, q6, q2 +; CHECK-NEXT: vadd.i32 q7, q6, q3 +; CHECK-NEXT: vldrw.u32 q0, [r1, q6, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [q5, #32]! 
+; CHECK-NEXT: vmul.i32 q0, q0, q6 ; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vmul.i32 q0, q5, q0 -; CHECK-NEXT: vmov q5, q1 ; CHECK-NEXT: vadd.i32 q4, q0, q4 ; CHECK-NEXT: le lr, .LBB9_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block @@ -512,7 +507,7 @@ ; CHECK-NEXT: cmp r8, r3 ; CHECK-NEXT: bne .LBB9_1 ; CHECK-NEXT: @ %bb.6: @ %for.end25 -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -523,6 +518,11 @@ ; CHECK-NEXT: .long 2 @ 0x2 ; CHECK-NEXT: .long 4 @ 0x4 ; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 for.cond8.preheader.us.us.preheader.preheader: ; preds = %entry %0 = add i32 %l, -1