Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,7 +84,7 @@
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
                   IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
@@ -132,6 +132,11 @@
   Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
                                         Value *Ptr, unsigned TypeScale,
                                         IRBuilder<> &Builder);
+
+  // Optimise the base and offsets of the given address
+  bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
+  // Try to fold consecutive geps together into one
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -167,7 +172,59 @@
   return false;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+// Returns true if the given offsets are known to be usable as the unsigned
+// offsets of an MVE gather/scatter with TargetElemCount lanes.
+bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
+  // Offsets that are not of type <N x i32> are sign extended by the
+  // getelementptr instruction, and MVE gathers/scatters treat the offset as
+  // unsigned. Thus, if the element size is smaller than 32, we can only allow
+  // positive offsets - i.e., the offsets are not allowed to be variables we
+  // can't look into.
+  // Additionally, <N x i32> offsets have to either originate from a zext of a
+  // vector with element types smaller or equal the type of the gather we're
+  // looking at, or consist of constants that we can check are small enough
+  // to fit into the gather type.
+  // Thus we check that 0 <= value < 2^TargetElemSize.
+  unsigned TargetElemSize = 128 / TargetElemCount;
+  unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType())
+                                ->getElementType()
+                                ->getScalarSizeInBits();
+  if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) {
+    Constant *ConstOff = dyn_cast<Constant>(Offsets);
+    if (!ConstOff)
+      return false;
+    // The bound 2^TargetElemSize has to fit into an int64_t (and a shift by 64
+    // bits or more is undefined), so give up on lanes that are too wide.
+    if (TargetElemSize >= 63)
+      return false;
+    int64_t TargetElemMaxSize = (1ULL << TargetElemSize);
+    auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) {
+      // OffsetElem is null if getAggregateElement() could not extract a lane,
+      // and getSExtValue() is only valid for values of at most 64 bits.
+      ConstantInt *OConst = dyn_cast_or_null<ConstantInt>(OffsetElem);
+      if (!OConst || OConst->getBitWidth() > 64)
+        return false;
+      // Use the full 64 bits: narrowing to int could wrap a large offset back
+      // into the accepted range.
+      int64_t SExtValue = OConst->getSExtValue();
+      if (SExtValue >= TargetElemMaxSize || SExtValue < 0)
+        return false;
+      return true;
+    };
+    if (isa<FixedVectorType>(ConstOff->getType())) {
+      for (unsigned i = 0; i < TargetElemCount; i++) {
+        if (!CheckValueSize(ConstOff->getAggregateElement(i)))
+          return false;
+      }
+    } else {
+      if (!CheckValueSize(ConstOff))
+        return false;
+    }
+  }
+  return true;
+}
+
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
                                           GetElementPtrInst *GEP,
                                           IRBuilder<> &Builder) {
   if (!GEP) {
@@ -178,40 +235,43 @@
   LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
                     << " Looking at intrinsic for base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
-  if (GEPPtr->getType()->isVectorTy()) {
+  Offsets = GEP->getOperand(1);
+  if (GEPPtr->getType()->isVectorTy() ||
+      !isa<FixedVectorType>(Offsets->getType()))
     return nullptr;
-  }
+
   if (GEP->getNumOperands() != 2) {
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
                       << " operands. 
Expanding.\n"); return nullptr; } Offsets = GEP->getOperand(1); + unsigned OffsetsElemCount = + cast(Offsets->getType())->getNumElements(); // Paranoid check whether the number of parallel lanes is the same - assert(cast(Ty)->getNumElements() == - cast(Offsets->getType())->getNumElements()); - // Only offsets can be integrated into an arm gather, any smaller - // type would have to be sign extended by the gep - and arm gathers can only - // zero extend. Additionally, the offsets do have to originate from a zext of - // a vector with element types smaller or equal the type of the gather we're - // looking at - if (Offsets->getType()->getScalarSizeInBits() != 32) - return nullptr; - if (ZExtInst *ZextOffs = dyn_cast(Offsets)) + assert(Ty->getNumElements() == OffsetsElemCount); + + ZExtInst *ZextOffs = dyn_cast(Offsets); + if (ZextOffs) Offsets = ZextOffs->getOperand(0); - else if (!(cast(Offsets->getType())->getNumElements() == 4 && - Offsets->getType()->getScalarSizeInBits() == 32)) - return nullptr; + FixedVectorType *OffsetType = cast(Offsets->getType()); + + // If the offsets are already being zext-ed to , that relieves us of + // having to make sure that they won't overflow. + if (!ZextOffs || cast(ZextOffs->getDestTy()) + ->getElementType() + ->getScalarSizeInBits() != 32) + if (!checkOffsetSize(Offsets, OffsetsElemCount)) + return nullptr; + // The offset sizes have been checked; if any truncating or zext-ing is + // required to fix them, do that now if (Ty != Offsets->getType()) { - if ((Ty->getScalarSizeInBits() < - Offsets->getType()->getScalarSizeInBits())) { - LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type." 
-                        << " Can't create intrinsic.\n");
-      return nullptr;
+    if ((Ty->getElementType()->getScalarSizeInBits() <
+         OffsetType->getElementType()->getScalarSizeInBits())) {
+      Offsets = Builder.CreateTrunc(Offsets, Ty);
     } else {
-      Offsets = Builder.CreateZExt(
-          Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
+      Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty));
     }
   }
   // If none of the checks failed, return the gep's base pointer
@@ -426,7 +476,8 @@
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder);
+  Value *BasePtr =
+      checkGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
   // Check whether the offset is a constant increment that could be merged into
@@ -566,7 +617,8 @@
 
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+  Value *BasePtr =
+      checkGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
   if (!BasePtr)
     return nullptr;
   // Check whether the offset is a constant increment that could be merged into
@@ -978,6 +1030,158 @@
   return true;
 }
 
+// Add the two gep offsets X and Y, splatting a scalar summand if necessary.
+// Returns the sum, or nullptr if the summands are incompatible or if the sum
+// cannot be shown to fit the offset range of a gather/scatter of GEP's type.
+Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP,
+                               IRBuilder<> &Builder) {
+
+  // Splat the non-vector value to a vector of the given type - if the value is
+  // a constant (and its value isn't too big), we can even use this opportunity
+  // to scale it to the size of the vector elements
+  auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) {
+    ConstantInt *Const;
+    if ((Const = dyn_cast<ConstantInt>(NonVectorVal)) &&
+        VT->getElementType() != NonVectorVal->getType()) {
+      unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits();
+      // A gep sign extends its indices, so a negative constant must not be
+      // resized as if it were unsigned. The width checks keep getSExtValue()
+      // and the shift below well defined.
+      if (Const->getBitWidth() <= 64 && TargetElemSize >= 1 &&
+          TargetElemSize <= 64) {
+        const uint64_t Limit = 1ULL << (TargetElemSize - 1);
+        int64_t N = Const->getSExtValue();
+        if (N >= 0 && static_cast<uint64_t>(N) < Limit) {
+          NonVectorVal = Builder.CreateVectorSplat(
+              VT->getNumElements(), Builder.getIntN(TargetElemSize, N));
+          return;
+        }
+      }
+    }
+    NonVectorVal =
+        Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal);
+  };
+
+  FixedVectorType *XElType = dyn_cast<FixedVectorType>(X->getType());
+  FixedVectorType *YElType = dyn_cast<FixedVectorType>(Y->getType());
+  // If one of X, Y is not a vector, we have to splat it in order
+  // to add the two of them.
+  if (XElType && !YElType) {
+    FixSummands(XElType, Y);
+    YElType = cast<FixedVectorType>(Y->getType());
+  } else if (YElType && !XElType) {
+    FixSummands(YElType, X);
+    XElType = cast<FixedVectorType>(X->getType());
+  }
+  // Check that the summands are of compatible types. If neither summand is a
+  // fixed vector, both types are still null here and there is nothing to fold.
+  if (!XElType || XElType != YElType) {
+    LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n");
+    return nullptr;
+  }
+
+  if (XElType->getElementType()->getScalarSizeInBits() != 32) {
+    // Check that by adding the vectors we do not accidentally
+    // create an overflow
+    Constant *ConstX = dyn_cast<Constant>(X);
+    Constant *ConstY = dyn_cast<Constant>(Y);
+    if (!ConstX || !ConstY)
+      return nullptr;
+    unsigned TargetElemSize = 128 / XElType->getNumElements();
+    // Compute the limit in 64 bits; a plain int shift overflows for 32-bit
+    // lanes and is undefined outside of the range checked here.
+    if (TargetElemSize < 1 || TargetElemSize > 64)
+      return nullptr;
+    const uint64_t Limit = 1ULL << (TargetElemSize - 1);
+    for (unsigned i = 0; i < XElType->getNumElements(); i++) {
+      // getAggregateElement() may return null, e.g. for constant expressions
+      ConstantInt *ConstXEl =
+          dyn_cast_or_null<ConstantInt>(ConstX->getAggregateElement(i));
+      ConstantInt *ConstYEl =
+          dyn_cast_or_null<ConstantInt>(ConstY->getAggregateElement(i));
+      if (!ConstXEl || !ConstYEl ||
+          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >= Limit)
+        return nullptr;
+    }
+  }
+
+  Value *Add = Builder.CreateAdd(X, Y);
+
+  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
+  if (!checkOffsetSize(Add, GEPType->getNumElements()))
+    return nullptr;
+  return Add;
+}
+
+// Try to merge the chain of geps ending in GEP into a single base pointer
+// plus one set of offsets. On success the base pointer is returned and the
+// (summed) offsets are stored in Offsets; on failure nullptr is returned and
+// Offsets is reset to nullptr.
+Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
+                                         Value *&Offsets,
+                                         IRBuilder<> &Builder) {
+  Offsets = nullptr;
+  // Only geps with exactly one index can be described by a base and a single
+  // offset; anything else has no (or more than one) operand to fold.
+  if (GEP->getNumOperands() != 2)
+    return nullptr;
+  Value *GEPPtr = GEP->getPointerOperand();
+  Value *GEPOffsets = GEP->getOperand(1);
+  // We only merge geps with constant offsets, because only for those
+  // we can make sure that we do not cause an overflow
+  if (!isa<Constant>(GEPOffsets))
+    return nullptr;
+  GetElementPtrInst *BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr);
+  if (!BaseGEP) {
+    Offsets = GEPOffsets;
+    return GEPPtr;
+  }
+  // Merge the two geps into one
+  Value *BaseOffsets = nullptr;
+  Value *BaseBasePtr = foldGEP(BaseGEP, BaseOffsets, Builder);
+  if (!BaseBasePtr)
+    return nullptr;
+  Offsets = CheckAndCreateOffsetAdd(BaseOffsets, GEPOffsets, GEP, Builder);
+  if (!Offsets)
+    return nullptr;
+  return BaseBasePtr;
+}
+
+// Optimise the base and offsets of the given gather/scatter address: merge
+// chains of constant-offset geps into a single gep where possible, then hand
+// the offsets of the resulting gep to optimiseOffsets. Returns true if a gep
+// was merged or optimiseOffsets reported a change.
+bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
+                                               LoopInfo *LI) {
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address);
+  if (!GEP)
+    return false;
+  bool Changed = false;
+  if (GEP->hasOneUse() && isa<GetElementPtrInst>(GEP->getPointerOperand())) {
+    IRBuilder<> Builder(GEP->getContext());
+    Builder.SetInsertPoint(GEP);
+    Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+    Value *Offsets = nullptr;
+    Value *Base = foldGEP(GEP, Offsets, Builder);
+    // We only want to merge the geps if there is a real chance that they can be
+    // used by an MVE gather; thus the offset has to have the correct size
+    // (always i32 if it is not of vector type) and the base has to be a
+    // pointer. A vector-of-pointers base is not a PointerType, so use dyn_cast
+    // rather than asserting on it.
+    PointerType *BaseType =
+        Base ? dyn_cast<PointerType>(Base->getType()) : nullptr;
+    if (Offsets && BaseType && Base != GEP) {
+      GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
+          BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP);
+      GEP->replaceAllUsesWith(NewAddress);
+      GEP = NewAddress;
+      Changed = true;
+    }
+  }
+  Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI);
+  return Changed;
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -995,22 +1199,17 @@
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-      if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
+      if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+          isa<FixedVectorType>(II->getType())) {
         Gathers.push_back(II);
-        if (isa<GetElementPtrInst>(II->getArgOperand(0)))
-          Changed |= optimiseOffsets(
-              cast<GetElementPtrInst>(II->getArgOperand(0))->getOperand(1),
-              II->getParent(), LI);
-      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+        Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI);
+      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+                 
isa(II->getArgOperand(0)->getType())) { Scatters.push_back(II); - if (isa(II->getArgOperand(1))) - Changed |= optimiseOffsets( - cast(II->getArgOperand(1))->getOperand(1), - II->getParent(), LI); + Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI); } } } - for (unsigned i = 0; i < Gathers.size(); i++) { IntrinsicInst *I = Gathers[i]; Value *L = lowerGather(I); Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll @@ -316,51 +316,21 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: scaled_v8i16_i16_2gep2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI14_0 -; CHECK-NEXT: adr.w r12, .LCPI14_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: 
.LCPI14_0: -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 58 @ 0x3a -; CHECK-NEXT: .LCPI14_1: -; CHECK-NEXT: .long 64 @ 0x40 -; CHECK-NEXT: .long 70 @ 0x46 -; CHECK-NEXT: .long 76 @ 0x4c -; CHECK-NEXT: .long 82 @ 0x52 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20 @@ -371,51 +341,21 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI15_0 -; CHECK-NEXT: adr.w r12, .LCPI15_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; 
CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI15_0: -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 58 @ 0x3a -; CHECK-NEXT: .LCPI15_1: -; CHECK-NEXT: .long 64 @ 0x40 -; CHECK-NEXT: .long 70 @ 0x46 -; CHECK-NEXT: .long 76 @ 0x4c -; CHECK-NEXT: .long 82 @ 0x52 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20 @@ -426,51 +366,21 @@ define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep2(i16* %base) { ; CHECK-LABEL: scaled_v8i16_i16_biggep2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI16_0 -; CHECK-NEXT: adr.w r12, .LCPI16_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vldrw.u32 q1, [r12] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrh.w lr, [r1] -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: ldrh r6, [r2] -; CHECK-NEXT: vmov r2, s6 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: ldrh r0, [r0] -; CHECK-NEXT: vmov.16 q0[1], r5 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r6 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: ldrh r1, [r1] -; CHECK-NEXT: ldrh r2, [r2] -; CHECK-NEXT: vmov.16 q0[5], r1 -; CHECK-NEXT: vmov.16 q0[6], r2 -; CHECK-NEXT: vmov.16 q0[7], r4 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr 
; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI16_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .LCPI16_1: -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -316,17 +316,16 @@ ; CHECK-LABEL: scaled_i32_i32_2gep2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI21_0 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI21_0: -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 56 @ 0x38 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe entry: %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ 
llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -444,91 +444,29 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8_2gep2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI8_0 -; CHECK-NEXT: adr r2, .LCPI8_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI8_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI8_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 
q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 29 @ 0x1d -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .LCPI8_1: -; CHECK-NEXT: .long 41 @ 0x29 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 47 @ 0x2f -; CHECK-NEXT: .long 50 @ 0x32 -; CHECK-NEXT: .LCPI8_2: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .LCPI8_3: -; CHECK-NEXT: .long 17 @ 0x11 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: .long 26 @ 0x1a +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 entry: %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 @@ -540,91 +478,29 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI9_0 -; CHECK-NEXT: adr r2, .LCPI9_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI9_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: 
vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI9_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 29 @ 0x1d -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .LCPI9_1: -; CHECK-NEXT: .long 41 @ 0x29 -; CHECK-NEXT: .long 44 @ 0x2c 
-; CHECK-NEXT: .long 47 @ 0x2f -; CHECK-NEXT: .long 50 @ 0x32 -; CHECK-NEXT: .LCPI9_2: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .LCPI9_3: -; CHECK-NEXT: .long 17 @ 0x11 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: .long 26 @ 0x1a +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 entry: %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5 @@ -636,91 +512,29 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) { ; CHECK-LABEL: unscaled_v16i8_i8_biggep2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI10_0 -; CHECK-NEXT: adr r2, .LCPI10_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI10_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI10_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: 
vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI10_0: -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 27 @ 0x1b -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI10_1: -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 39 @ 0x27 -; CHECK-NEXT: .long 42 @ 0x2a -; CHECK-NEXT: .long 45 @ 0x2d -; CHECK-NEXT: .LCPI10_2: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .LCPI10_3: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 15 @ 0xf -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 21 @ 0x15 +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: 
.byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d entry: %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) @@ -1253,91 +1067,29 @@ define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) { ; CHECK-LABEL: unscaled_v16i8_i8_3: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, lr} -; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: adr r1, .LCPI17_0 -; CHECK-NEXT: adr r2, .LCPI17_1 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r6, .LCPI17_2 -; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r1] -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: ldrb r3, [r1] -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vldrw.u32 q0, [r6] -; CHECK-NEXT: adr r6, .LCPI17_3 -; CHECK-NEXT: vldrw.u32 q2, [r6] -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[0], r0 -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[1], r0 -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[2], r0 -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[3], r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[4], r0 -; CHECK-NEXT: vmov r0, s9 -; 
CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[5], r0 -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[6], r0 -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[7], r0 -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.8 q0[8], r5 -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.8 q0[9], r0 -; CHECK-NEXT: vmov.8 q0[10], r12 -; CHECK-NEXT: vmov.8 q0[11], r4 -; CHECK-NEXT: vmov.8 q0[12], lr -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov.8 q0[14], r1 -; CHECK-NEXT: vmov.8 q0[15], r2 -; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI17_0: -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 27 @ 0x1b -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI17_1: -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 39 @ 0x27 -; CHECK-NEXT: .long 42 @ 0x2a -; CHECK-NEXT: .long 45 @ 0x2d -; CHECK-NEXT: .LCPI17_2: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .LCPI17_3: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 15 @ 0xf -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 21 @ 0x15 +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d entry: %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) Index: 
llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -57,8 +57,6 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r1, .LCPI1_0 ; CHECK-NEXT: adr r3, .LCPI1_1 ; CHECK-NEXT: vldrw.u32 q0, [r3] @@ -67,28 +65,25 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB1_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vldrw.u32 q4, [q3] +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vadd.i32 q3, q4, r2 -; CHECK-NEXT: vstrw.32 q3, [q2] ; CHECK-NEXT: le lr, .LBB1_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI1_0: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 60 @ 0x3c +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 ; CHECK-NEXT: .LCPI1_1: -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 68 @ 0x44 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf vector.ph: %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer @@ -118,89 +113,33 @@ define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { ; CHECK-LABEL: 
ptr_iv_v8i16: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adr r3, .LCPI2_0 ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI2_1 -; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB2_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vmov r4, s12 -; CHECK-NEXT: vadd.i32 q4, q0, r1 -; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vmov r5, s13 -; CHECK-NEXT: vmov r6, s14 -; CHECK-NEXT: vmov r7, s15 -; CHECK-NEXT: ldrh.w r8, [r4] -; CHECK-NEXT: vmov r4, s8 -; CHECK-NEXT: ldrh.w r12, [r3] -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r7, [r7] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q3[0], r4 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: vmov.16 q3[1], r3 -; CHECK-NEXT: vadd.i32 q2, q1, r1 +; CHECK-NEXT: vadd.i16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] ; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q3[2], r4 -; CHECK-NEXT: vmov.16 q3[3], r12 -; CHECK-NEXT: vmov.16 q3[4], r8 -; CHECK-NEXT: vmov.16 q3[5], r5 -; CHECK-NEXT: vmov.16 q3[6], r6 -; CHECK-NEXT: vmov.16 q3[7], r7 -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vadd.i16 q3, q3, r2 -; CHECK-NEXT: vmov.u16 r3, q3[0] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: vmov.u16 r3, q3[1] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov.u16 r3, q3[2] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov.u16 r3, q3[3] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, 
s8 -; CHECK-NEXT: vmov.u16 r3, q3[4] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov.u16 r3, q3[5] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: vmov.u16 r3, q3[6] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s11 -; CHECK-NEXT: vmov.u16 r3, q3[7] -; CHECK-NEXT: strh r3, [r7] ; CHECK-NEXT: le lr, .LBB2_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI2_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .LCPI2_1: -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 48 @ 0x30 -; CHECK-NEXT: .long 56 @ 0x38 +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c vector.ph: %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -229,103 +168,44 @@ define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { ; CHECK-LABEL: ptr_iv_v8i16_mult: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: adr r3, .LCPI3_0 -; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI3_0 ; CHECK-NEXT: adr r3, .LCPI3_1 -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: adr r3, .LCPI3_2 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; 
CHECK-NEXT: adr r3, .LCPI3_3 -; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB3_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q5, q1, r0 -; CHECK-NEXT: vadd.i32 q4, q0, r0 -; CHECK-NEXT: vmov r4, s20 -; CHECK-NEXT: vadd.i32 q6, q3, r1 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vmov r5, s21 -; CHECK-NEXT: vmov r6, s22 -; CHECK-NEXT: vmov r7, s23 -; CHECK-NEXT: ldrh.w r8, [r4] -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: ldrh.w r12, [r3] -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: ldrh r5, [r5] -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r7, [r7] -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q5[0], r4 -; CHECK-NEXT: vmov r4, s18 -; CHECK-NEXT: vmov.16 q5[1], r3 -; CHECK-NEXT: vadd.i32 q4, q2, r1 +; CHECK-NEXT: vadd.i16 q2, q2, r2 +; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1] ; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q5[2], r4 -; CHECK-NEXT: vmov.16 q5[3], r12 -; CHECK-NEXT: vmov.16 q5[4], r8 -; CHECK-NEXT: vmov.16 q5[5], r5 -; CHECK-NEXT: vmov.16 q5[6], r6 -; CHECK-NEXT: vmov.16 q5[7], r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vadd.i16 q5, q5, r2 -; CHECK-NEXT: vmov.u16 r3, q5[0] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov.u16 r3, q5[1] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s26 -; CHECK-NEXT: vmov.u16 r3, q5[2] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s27 -; CHECK-NEXT: vmov.u16 r3, q5[3] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vmov.u16 r3, q5[4] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: vmov.u16 r3, q5[5] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov.u16 r3, q5[6] -; CHECK-NEXT: strh r3, [r7] -; 
CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov.u16 r3, q5[7] -; CHECK-NEXT: strh r3, [r7] ; CHECK-NEXT: le lr, .LBB3_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI3_0: -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 ; CHECK-NEXT: .LCPI3_1: -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .long 54 @ 0x36 -; CHECK-NEXT: .long 62 @ 0x3e -; CHECK-NEXT: .LCPI3_2: -; CHECK-NEXT: .long 42 @ 0x2a -; CHECK-NEXT: .long 50 @ 0x32 -; CHECK-NEXT: .long 58 @ 0x3a -; CHECK-NEXT: .long 66 @ 0x42 -; CHECK-NEXT: .LCPI3_3: -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 26 @ 0x1a -; CHECK-NEXT: .long 34 @ 0x22 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f vector.ph: %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer @@ -355,160 +235,41 @@ define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { ; CHECK-LABEL: ptr_iv_v16i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; 
CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: adr r3, .LCPI4_0 ; CHECK-NEXT: mov.w lr, #249 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI4_1 -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: adr r3, .LCPI4_2 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: adr r3, .LCPI4_3 -; CHECK-NEXT: vldrw.u32 q3, [r3] ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB4_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: vadd.i32 q7, q3, r1 -; CHECK-NEXT: vadd.i32 q4, q6, r0 -; CHECK-NEXT: vmov r5, s3 -; CHECK-NEXT: vmov r3, s19 -; CHECK-NEXT: vadd.i32 q6, q6, r1 -; CHECK-NEXT: vmov r6, s0 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vadd.i32 q0, q2, r0 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb.w r8, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[0], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[1], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[2], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vadd.i32 q0, q3, r0 +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[3], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[4], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[5], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[6], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vadd.i32 q0, q2, r1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[7], r3 -; CHECK-NEXT: vmov r3, s16 -; 
CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[8], r3 -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: vmov.8 q5[9], r7 -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vadd.i32 q4, q1, r1 +; CHECK-NEXT: vadd.i8 q1, q1, r2 +; CHECK-NEXT: vstrb.8 q1, [r1, q0] ; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[10], r3 -; CHECK-NEXT: vmov.8 q5[11], r8 -; CHECK-NEXT: vmov.8 q5[12], r6 -; CHECK-NEXT: vmov.8 q5[13], r12 -; CHECK-NEXT: vmov.8 q5[14], r4 -; CHECK-NEXT: vmov.8 q5[15], r5 -; CHECK-NEXT: vadd.i8 q5, q5, r2 -; CHECK-NEXT: vmov.u8 r3, q5[0] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q5[1] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q5[2] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vmov.u8 r3, q5[3] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s28 -; CHECK-NEXT: vmov.u8 r3, q5[4] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s29 -; CHECK-NEXT: vmov.u8 r3, q5[5] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s30 -; CHECK-NEXT: vmov.u8 r3, q5[6] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s31 -; CHECK-NEXT: vmov.u8 r3, q5[7] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov.u8 r3, q5[8] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov.u8 r3, q5[9] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s26 -; CHECK-NEXT: vmov.u8 r3, q5[10] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s27 -; CHECK-NEXT: vmov.u8 r3, q5[11] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vmov.u8 r3, q5[12] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: vmov.u8 r3, q5[13] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov.u8 r3, q5[14] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov.u8 r3, q5[15] -; CHECK-NEXT: strb r3, [r7] ; CHECK-NEXT: le lr, .LBB4_1 ; CHECK-NEXT: @ 
%bb.2: @ %end -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI4_0: -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .LCPI4_1: -; CHECK-NEXT: .long 48 @ 0x30 -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 56 @ 0x38 -; CHECK-NEXT: .long 60 @ 0x3c -; CHECK-NEXT: .LCPI4_2: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .LCPI4_3: -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 48 @ 0x30 +; CHECK-NEXT: .byte 52 @ 0x34 +; CHECK-NEXT: .byte 56 @ 0x38 +; CHECK-NEXT: .byte 60 @ 0x3c vector.ph: ; preds = %entry %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -537,196 +298,60 @@ define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { ; CHECK-LABEL: ptr_iv_v16i8_mult: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 -; CHECK-NEXT: adr r3, .LCPI5_0 -; 
CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI5_0 ; CHECK-NEXT: adr r3, .LCPI5_1 -; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI5_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI5_3 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI5_4 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI5_5 -; CHECK-NEXT: vldrw.u32 q5, [r3] -; CHECK-NEXT: adr r3, .LCPI5_6 -; CHECK-NEXT: vldrw.u32 q6, [r3] -; CHECK-NEXT: adr r3, .LCPI5_7 -; CHECK-NEXT: vldrw.u32 q7, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB5_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q4, q5, r1 -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vadd.i32 q3, q3, r1 -; CHECK-NEXT: vmov r4, s6 -; CHECK-NEXT: vmov r5, s7 -; CHECK-NEXT: vmov r6, s4 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q1, r0 -; CHECK-NEXT: ldrb.w r8, [r3] -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb.w r12, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[0], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[1], r3 -; 
CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[2], r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vldrb.u8 q2, [r0, q0] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[3], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[4], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[5], r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[6], r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vadd.i32 q2, q7, r1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[7], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[8], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vmov.8 q1[9], r7 -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vadd.i32 q0, q6, r1 +; CHECK-NEXT: vadd.i8 q2, q2, r2 +; CHECK-NEXT: vstrb.8 q2, [r1, q1] ; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q1[10], r3 -; CHECK-NEXT: vmov.8 q1[11], r8 -; CHECK-NEXT: vmov.8 q1[12], r6 -; CHECK-NEXT: vmov.8 q1[13], r12 -; CHECK-NEXT: vmov.8 q1[14], r4 -; CHECK-NEXT: vmov.8 q1[15], r5 -; CHECK-NEXT: vadd.i8 q1, q1, r2 -; CHECK-NEXT: vmov.u8 r3, q1[0] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: vmov.u8 r3, q1[1] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov.u8 r3, q1[2] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov.u8 r3, q1[3] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: vmov.u8 r3, q1[4] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s13 -; CHECK-NEXT: vmov.u8 r3, q1[5] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: vmov.u8 r3, q1[6] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s15 -; CHECK-NEXT: vmov.u8 r3, q1[7] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov 
r7, s8 -; CHECK-NEXT: vmov.u8 r3, q1[8] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov.u8 r3, q1[9] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: vmov.u8 r3, q1[10] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s11 -; CHECK-NEXT: vmov.u8 r3, q1[11] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u8 r3, q1[12] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q1[13] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q1[14] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vmov.u8 r3, q1[15] -; CHECK-NEXT: strb r3, [r7] ; CHECK-NEXT: le lr, .LBB5_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: add sp, #88 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .long 39 @ 0x27 -; CHECK-NEXT: .long 43 @ 0x2b -; CHECK-NEXT: .long 47 @ 0x2f +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .byte 49 @ 0x31 +; CHECK-NEXT: .byte 53 @ 0x35 +; CHECK-NEXT: .byte 57 @ 0x39 +; CHECK-NEXT: .byte 61 @ 0x3d +; CHECK-NEXT: .byte 65 @ 0x41 ; CHECK-NEXT: .LCPI5_1: -; CHECK-NEXT: .long 51 @ 0x33 -; CHECK-NEXT: .long 55 @ 0x37 -; CHECK-NEXT: .long 59 @ 0x3b -; CHECK-NEXT: .long 63 @ 0x3f -; CHECK-NEXT: .LCPI5_2: -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 15 @ 0xf -; CHECK-NEXT: .LCPI5_3: -; CHECK-NEXT: .long 19 @ 0x13 -; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: 
.long 27 @ 0x1b -; CHECK-NEXT: .long 31 @ 0x1f -; CHECK-NEXT: .LCPI5_4: -; CHECK-NEXT: .long 21 @ 0x15 -; CHECK-NEXT: .long 25 @ 0x19 -; CHECK-NEXT: .long 29 @ 0x1d -; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI5_5: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 9 @ 0x9 -; CHECK-NEXT: .long 13 @ 0xd -; CHECK-NEXT: .long 17 @ 0x11 -; CHECK-NEXT: .LCPI5_6: -; CHECK-NEXT: .long 53 @ 0x35 -; CHECK-NEXT: .long 57 @ 0x39 -; CHECK-NEXT: .long 61 @ 0x3d -; CHECK-NEXT: .long 65 @ 0x41 -; CHECK-NEXT: .LCPI5_7: -; CHECK-NEXT: .long 37 @ 0x25 -; CHECK-NEXT: .long 41 @ 0x29 -; CHECK-NEXT: .long 45 @ 0x2d -; CHECK-NEXT: .long 49 @ 0x31 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 51 @ 0x33 +; CHECK-NEXT: .byte 55 @ 0x37 +; CHECK-NEXT: .byte 59 @ 0x3b +; CHECK-NEXT: .byte 63 @ 0x3f vector.ph: ; preds = %entry %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer @@ -808,8 +433,6 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r1, .LCPI7_0 ; CHECK-NEXT: adr r3, .LCPI7_1 ; CHECK-NEXT: vldrw.u32 q0, [r3] @@ -818,28 +441,25 @@ ; CHECK-NEXT: dls lr, lr ; CHECK-NEXT: .LBB7_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q0, r0 -; CHECK-NEXT: vldrw.u32 q4, [q3] +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.f32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] ; CHECK-NEXT: adds r0, #64 -; 
CHECK-NEXT: vadd.f32 q3, q4, r2 -; CHECK-NEXT: vstrw.32 q3, [q2] ; CHECK-NEXT: le lr, .LBB7_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI7_0: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 60 @ 0x3c +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 ; CHECK-NEXT: .LCPI7_1: -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 68 @ 0x44 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf vector.ph: ; preds = %entry %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer @@ -871,94 +491,34 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8} -; CHECK-NEXT: vpush {d8} ; CHECK-NEXT: vmov s0, r2 ; CHECK-NEXT: adr r3, .LCPI8_0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: vmov.f16 r12, s0 +; CHECK-NEXT: vmov.f16 r2, s0 ; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI8_1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: .LBB8_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q3, q0, r0 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: vldr.16 s8, [r3] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vldr.16 s8, [r2] -; CHECK-NEXT: vmov r2, s8 -; CHECK-NEXT: vmov.16 q2[0], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vldr.16 s16, [r2] -; CHECK-NEXT: vmov.16 q2[1], r3 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q2[2], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vldr.16 s12, [r2] -; 
CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vadd.i32 q3, q1, r0 -; CHECK-NEXT: vmov.16 q2[3], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vldr.16 s16, [r2] +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q2[4], r2 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vldr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q2[5], r2 -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vldr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q2[6], r2 -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vldr.16 s12, [r2] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vadd.i32 q3, q0, r1 -; CHECK-NEXT: vmov.16 q2[7], r2 -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vadd.f16 q2, q2, r12 -; CHECK-NEXT: vstr.16 s8, [r2] -; CHECK-NEXT: vmovx.f16 s16, s8 -; CHECK-NEXT: vmov r2, s13 -; CHECK-NEXT: vmovx.f16 s8, s11 -; CHECK-NEXT: vstr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vstr.16 s9, [r2] -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vmovx.f16 s16, s9 -; CHECK-NEXT: vadd.i32 q3, q1, r1 -; CHECK-NEXT: vstr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s12 -; CHECK-NEXT: vstr.16 s10, [r2] -; CHECK-NEXT: vmovx.f16 s16, s10 -; CHECK-NEXT: vmov r2, s13 +; CHECK-NEXT: vadd.f16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] ; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: vstr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s14 -; CHECK-NEXT: vstr.16 s11, [r2] -; CHECK-NEXT: vmov r2, s15 -; CHECK-NEXT: vstr.16 s8, [r2] ; CHECK-NEXT: le lr, .LBB8_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI8_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .LCPI8_1: -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 48 @ 0x30 -; CHECK-NEXT: .long 56 @ 0x38 +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 
+; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c vector.ph: %y.trunc = fptrunc float %y to half %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 @@ -989,108 +549,44 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12} ; CHECK-NEXT: vmov s0, r2 -; CHECK-NEXT: adr r3, .LCPI9_0 +; CHECK-NEXT: adr r2, .LCPI9_0 ; CHECK-NEXT: vcvtb.f16.f32 s0, s0 ; CHECK-NEXT: mov.w lr, #249 -; CHECK-NEXT: vmov.f16 r12, s0 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI9_1 +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adr r2, .LCPI9_1 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: adr r3, .LCPI9_2 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: adr r3, .LCPI9_3 -; CHECK-NEXT: vldrw.u32 q3, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r2] ; CHECK-NEXT: .LBB9_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q5, q0, r0 -; CHECK-NEXT: adds r1, #64 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: vldr.16 s16, [r3] -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldr.16 s16, [r2] -; CHECK-NEXT: vmov r2, s16 -; CHECK-NEXT: vmov.16 q4[0], r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vldr.16 s24, [r2] -; CHECK-NEXT: vmov.16 q4[1], r3 -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q4[2], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vldr.16 s20, [r2] -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vadd.i32 q5, q1, r0 -; CHECK-NEXT: vmov.16 q4[3], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vldr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q4[4], r2 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vldr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 
q4[5], r2 -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vldr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s24 -; CHECK-NEXT: vmov.16 q4[6], r2 -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vldr.16 s20, [r2] -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vadd.i32 q5, q2, r0 -; CHECK-NEXT: vmov.16 q4[7], r2 -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vadd.f16 q4, q4, r12 -; CHECK-NEXT: vstr.16 s16, [r2] -; CHECK-NEXT: vmovx.f16 s24, s16 -; CHECK-NEXT: vmov r2, s21 -; CHECK-NEXT: vmovx.f16 s16, s19 -; CHECK-NEXT: vstr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vstr.16 s17, [r2] -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vmovx.f16 s24, s17 -; CHECK-NEXT: vadd.i32 q5, q3, r0 -; CHECK-NEXT: vstr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s20 -; CHECK-NEXT: vstr.16 s18, [r2] -; CHECK-NEXT: vmovx.f16 s24, s18 -; CHECK-NEXT: vmov r2, s21 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: vadd.f16 q2, q2, r1 +; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1] ; CHECK-NEXT: adds r0, #64 -; CHECK-NEXT: vstr.16 s24, [r2] -; CHECK-NEXT: vmov r2, s22 -; CHECK-NEXT: vstr.16 s19, [r2] -; CHECK-NEXT: vmov r2, s23 -; CHECK-NEXT: vstr.16 s16, [r2] ; CHECK-NEXT: le lr, .LBB9_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f ; CHECK-NEXT: .LCPI9_1: -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .long 54 @ 0x36 -; CHECK-NEXT: .long 62 @ 0x3e -; CHECK-NEXT: .LCPI9_2: -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 26 @ 0x1a -; CHECK-NEXT: .long 
34 @ 0x22 -; CHECK-NEXT: .LCPI9_3: -; CHECK-NEXT: .long 42 @ 0x2a -; CHECK-NEXT: .long 50 @ 0x32 -; CHECK-NEXT: .long 58 @ 0x3a -; CHECK-NEXT: .long 66 @ 0x42 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 vector.ph: %y.trunc = fptrunc float %y to half %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 @@ -1124,61 +620,50 @@ ; CHECK: @ %bb.0: @ %vector.ph ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: adr.w r12, .LCPI10_0 ; CHECK-NEXT: adr.w lr, .LCPI10_1 -; CHECK-NEXT: vmov.i32 q0, #0xa ; CHECK-NEXT: adr r3, .LCPI10_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] ; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] ; CHECK-NEXT: vldrw.u32 q3, [r12] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q0, #0xa ; CHECK-NEXT: .LBB10_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q3, r1 -; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vadd.i32 q5, q0, r0 -; CHECK-NEXT: vadd.i32 q4, q0, r1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vldrw.u32 q6, [q5] +; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2] ; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2] -; CHECK-NEXT: vldrw.u32 q1, [q0] +; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i32 q4, q5, q4 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: vmul.i32 q0, q5, q1 -; CHECK-NEXT: vldrw.u32 q1, 
[sp, #16] @ 16-byte Reload ; CHECK-NEXT: vmul.i32 q6, q5, q6 -; CHECK-NEXT: vmul.i32 q1, q5, q1 -; CHECK-NEXT: vstrw.32 q1, [r1, q2, uxtw #2] +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] +; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2] +; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2] ; CHECK-NEXT: add.w r1, r1, #48 -; CHECK-NEXT: vstrw.32 q0, [q7] -; CHECK-NEXT: vstrw.32 q6, [q4] ; CHECK-NEXT: bne .LBB10_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: add sp, #40 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .long 1 @ 0x1 ; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 40 @ 0x28 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa ; CHECK-NEXT: .LCPI10_1: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 9 @ 0x9 ; CHECK-NEXT: .LCPI10_2: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 ; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 44 @ 0x2c +; CHECK-NEXT: .long 11 @ 0xb vector.ph: br label %vector.body @@ -1215,91 +700,35 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v4i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, lr} -; CHECK-NEXT: push {r4, r5, r6, r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: adr r3, .LCPI11_0 -; CHECK-NEXT: vmov.i32 q3, #0xa -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI11_1 -; CHECK-NEXT: vldrw.u32 q1, [r3] +; 
CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI11_0 +; CHECK-NEXT: adr.w lr, .LCPI11_1 ; CHECK-NEXT: adr r3, .LCPI11_2 -; CHECK-NEXT: vldrw.u32 q2, [r3] -; CHECK-NEXT: vstrw.32 q3, [sp] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i32 q0, #0xa ; CHECK-NEXT: .LBB11_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vadd.i32 q4, q0, r0 -; CHECK-NEXT: vadd.i32 q6, q0, r1 -; CHECK-NEXT: vmov r3, s19 +; CHECK-NEXT: vldrb.u32 q4, [r0, q1] +; CHECK-NEXT: vldrb.u32 q5, [r0, q2] +; CHECK-NEXT: vldrb.u32 q6, [r0, q3] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmov r4, s16 -; CHECK-NEXT: ldrb.w r12, [r3] -; CHECK-NEXT: vmov r3, s18 -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s17 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vmov r5, s19 -; CHECK-NEXT: vmov r6, s18 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov.32 q5[0], r7 -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vadd.i32 q4, q1, r1 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vldrb.u32 q7, [r0, q2] -; CHECK-NEXT: vldrw.u32 q3, [sp] @ 16-byte Reload +; CHECK-NEXT: vmul.i32 q4, q5, q4 ; CHECK-NEXT: add.w r0, r0, #12 -; CHECK-NEXT: vmov.32 q5[1], r7 -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmul.i32 q3, q7, q3 -; CHECK-NEXT: vmov.32 q5[2], r6 -; CHECK-NEXT: vstrb.32 q3, [r1, q2] -; CHECK-NEXT: vmov.32 q3[0], r4 -; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: vmov.32 q5[3], r5 -; CHECK-NEXT: vmov.32 q3[2], lr -; CHECK-NEXT: vmul.i32 q5, q7, q5 -; CHECK-NEXT: vmov.32 q3[3], r12 +; CHECK-NEXT: vmul.i32 q6, q5, q6 +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrb.32 q5, [r1, q2] +; CHECK-NEXT: vstrb.32 q6, 
[r1, q3] +; CHECK-NEXT: vstrb.32 q4, [r1, q1] ; CHECK-NEXT: add.w r1, r1, #12 -; CHECK-NEXT: vmul.i32 q3, q7, q3 -; CHECK-NEXT: vmov r3, s12 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov r3, s13 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s26 -; CHECK-NEXT: vmov r3, s14 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s27 -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s16 -; CHECK-NEXT: vmov r3, s20 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s17 -; CHECK-NEXT: vmov r3, s21 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s18 -; CHECK-NEXT: vmov r3, s22 -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s19 -; CHECK-NEXT: vmov r3, s23 -; CHECK-NEXT: strb r3, [r7] ; CHECK-NEXT: bne .LBB11_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: add sp, #24 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: add sp, #4 -; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI11_0: @@ -1308,15 +737,15 @@ ; CHECK-NEXT: .long 7 @ 0x7 ; CHECK-NEXT: .long 10 @ 0xa ; CHECK-NEXT: .LCPI11_1: -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI11_2: ; CHECK-NEXT: .long 0 @ 0x0 ; CHECK-NEXT: .long 3 @ 0x3 ; CHECK-NEXT: .long 6 @ 0x6 ; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI11_2: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb vector.ph: br label %vector.body @@ -1359,250 +788,64 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v8i16: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: .vsave 
{d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #144 -; CHECK-NEXT: sub sp, #144 -; CHECK-NEXT: adr r3, .LCPI12_0 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI12_1 -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI12_0 +; CHECK-NEXT: adr.w lr, .LCPI12_1 ; CHECK-NEXT: adr r3, .LCPI12_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI12_3 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI12_4 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr r3, .LCPI12_5 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; CHECK-NEXT: vmov.i16 q0, #0xa -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB12_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q5, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q6, q1, r0 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] +; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] +; 
CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrh.w lr, [r3] -; CHECK-NEXT: vmov r3, s24 -; CHECK-NEXT: ldrh.w r8, [r3] -; CHECK-NEXT: vmov r3, s25 -; CHECK-NEXT: ldrh.w r9, [r3] -; CHECK-NEXT: vmov r3, s26 -; CHECK-NEXT: ldrh r5, [r3] -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrh r6, [r3] -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrh r7, [r3] -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[0], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[1], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[2], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[3], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[4], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[5], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[6], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q7[7], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vadd.i32 q0, q5, r0 +; CHECK-NEXT: vmul.i16 q4, q5, q4 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: vmov r4, s0 -; CHECK-NEXT: vmov r10, s3 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vmov.16 q1[1], r6 -; CHECK-NEXT: vmov r6, s2 -; CHECK-NEXT: vmov.16 q1[2], r7 -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.16 q1[3], lr -; CHECK-NEXT: ldrh r4, [r4] -; CHECK-NEXT: vmov.16 q1[4], r8 -; CHECK-NEXT: vmov.16 q1[5], r9 -; CHECK-NEXT: vmov.16 q1[6], r5 -; CHECK-NEXT: vmov r5, s10 -; CHECK-NEXT: ldrh.w r8, [r3] -; CHECK-NEXT: vmov r3, s27 -; CHECK-NEXT: vadd.i32 q6, q5, r1 -; CHECK-NEXT: ldrh r6, [r6] -; CHECK-NEXT: ldrh r7, [r7] -; CHECK-NEXT: ldrh.w lr, [r5] -; 
CHECK-NEXT: ldrh.w r5, [r10] -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmul.i16 q7, q1, q7 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[0], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload -; CHECK-NEXT: vmul.i16 q4, q1, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, r1 -; CHECK-NEXT: vstrw.32 q2, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, r1 -; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q2, r1 -; CHECK-NEXT: vldrw.u32 q2, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, r1 +; CHECK-NEXT: vmul.i16 q6, q5, q6 +; CHECK-NEXT: vmul.i16 q5, q5, q0 +; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] +; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1] +; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] ; CHECK-NEXT: add.w r1, r1, #48 -; CHECK-NEXT: ldrh r3, [r3] -; CHECK-NEXT: vmov.16 q0[1], r3 -; CHECK-NEXT: vmov.u16 r3, q4[0] -; CHECK-NEXT: vmov.16 q0[2], lr -; CHECK-NEXT: vmov.16 q0[3], r8 -; CHECK-NEXT: vmov.16 q0[4], r4 -; CHECK-NEXT: vmov.16 q0[5], r7 -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov.16 q0[6], r6 -; CHECK-NEXT: vmov.16 q0[7], r5 -; CHECK-NEXT: vmul.i16 q1, q1, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov.u16 r3, q4[1] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: vmov.u16 r3, q4[2] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s11 -; CHECK-NEXT: vmov.u16 r3, q4[3] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s20 -; CHECK-NEXT: vmov.u16 r3, q4[4] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s21 -; CHECK-NEXT: vmov.u16 r3, q4[5] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s22 -; CHECK-NEXT: vmov.u16 r3, q4[6] 
-; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s23 -; CHECK-NEXT: vmov.u16 r3, q4[7] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: vmov.u16 r3, q1[0] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s13 -; CHECK-NEXT: vmov.u16 r3, q1[1] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: vmov.u16 r3, q1[2] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s15 -; CHECK-NEXT: vmov.u16 r3, q1[3] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s24 -; CHECK-NEXT: vmov.u16 r3, q1[4] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s25 -; CHECK-NEXT: vmov.u16 r3, q1[5] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s26 -; CHECK-NEXT: vmov.u16 r3, q1[6] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s27 -; CHECK-NEXT: vmov.u16 r3, q1[7] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u16 r3, q7[0] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u16 r3, q7[1] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u16 r3, q7[2] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vmov.u16 r3, q7[3] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u16 r3, q7[4] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u16 r3, q7[5] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u16 r3, q7[6] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vmov.u16 r3, q7[7] -; CHECK-NEXT: strh r3, [r7] -; CHECK-NEXT: bne.w .LBB12_1 +; CHECK-NEXT: bne .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: add sp, #144 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: 
.LCPI12_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 ; CHECK-NEXT: .LCPI12_1: -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 ; CHECK-NEXT: .LCPI12_2: -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .LCPI12_3: -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 34 @ 0x22 -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .LCPI12_4: -; CHECK-NEXT: .long 26 @ 0x1a -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .LCPI12_5: -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 vector.ph: br label %vector.body @@ -1639,469 +882,88 @@ define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { ; CHECK-LABEL: three_pointer_iv_v16i8: ; CHECK: @ %bb.0: @ %vector.ph -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} 
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #280 -; CHECK-NEXT: sub sp, #280 -; CHECK-NEXT: adr.w r3, .LCPI13_0 -; CHECK-NEXT: mov.w r12, #0 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_1 -; CHECK-NEXT: vstrw.32 q0, [sp, #256] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_2 -; CHECK-NEXT: vstrw.32 q0, [sp, #240] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_3 -; CHECK-NEXT: vstrw.32 q0, [sp, #224] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_4 -; CHECK-NEXT: vstrw.32 q0, [sp, #208] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_5 -; CHECK-NEXT: vstrw.32 q0, [sp, #192] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_6 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_7 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_8 -; CHECK-NEXT: vstrw.32 q0, [sp, #176] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_9 -; CHECK-NEXT: vstrw.32 q0, [sp, #160] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_10 -; CHECK-NEXT: vstrw.32 q0, [sp, #144] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: adr.w r3, .LCPI13_11 -; CHECK-NEXT: vstrw.32 q0, [sp, #128] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI13_0 +; CHECK-NEXT: adr.w lr, .LCPI13_1 +; CHECK-NEXT: adr r3, .LCPI13_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] ; 
CHECK-NEXT: vmov.i8 q0, #0xa -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: .LBB13_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #256] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q2, [sp, #208] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q1, [sp, #224] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q7, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: vldrw.u32 q3, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: vldrb.u8 q4, [r0, q1] +; CHECK-NEXT: vldrb.u8 q5, [r0, q2] +; CHECK-NEXT: vldrb.u8 q6, [r0, q3] ; CHECK-NEXT: subs r2, #4 -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: add.w r12, r12, #4 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vldrw.u32 q0, [sp, #240] @ 16-byte Reload -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vmov r6, s3 -; CHECK-NEXT: vmov.8 q5[0], r4 -; CHECK-NEXT: vmov.8 q5[1], r5 -; CHECK-NEXT: vmov.8 q5[2], lr -; CHECK-NEXT: ldrb.w r8, [r3] -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: vmov.8 q5[3], r8 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[0], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[1], r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[2], r3 -; CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vldrw.u32 q2, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[3], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[4], r3 -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[5], r3 -; CHECK-NEXT: vmov r3, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[6], r3 -; 
CHECK-NEXT: vmov r3, s11 -; CHECK-NEXT: vadd.i32 q2, q7, r0 -; CHECK-NEXT: vmov r4, s10 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[7], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrb.w lr, [r4] -; CHECK-NEXT: vmov r4, s11 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q4[8], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.8 q4[9], r7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[10], r3 -; CHECK-NEXT: vmov r3, s8 -; CHECK-NEXT: ldrb r7, [r3] -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vmov.8 q5[4], r7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[11], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[12], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[13], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r1 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q4[14], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vmov.8 q4[15], r6 -; CHECK-NEXT: vmov r6, s1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: ldrb.w r8, [r3] -; CHECK-NEXT: vmov r3, s9 -; CHECK-NEXT: ldrb r6, [r6] -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[5], r3 -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov.8 q5[6], lr -; CHECK-NEXT: vmov.8 q5[7], r4 -; CHECK-NEXT: vmov.8 q5[8], r5 -; CHECK-NEXT: vmov.8 q5[9], r6 -; CHECK-NEXT: vmov.8 q5[10], r7 -; CHECK-NEXT: vmov.8 q5[11], r8 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[12], r3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[13], r3 -; CHECK-NEXT: vmov r3, s2 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[14], r3 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #160] @ 16-byte Reload 
-; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q5[15], r3 -; CHECK-NEXT: vmov r3, s4 ; CHECK-NEXT: vmul.i8 q4, q5, q4 -; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[0], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov.8 q0[1], r7 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[2], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[3], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[4], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[5], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[6], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vldrw.u32 q1, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[7], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[8], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[9], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[10], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vadd.i32 q1, q2, r0 ; CHECK-NEXT: add.w r0, r0, #48 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[11], r3 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[12], r3 -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[13], r3 -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[14], r3 -; CHECK-NEXT: vmov r3, s7 -; CHECK-NEXT: vadd.i32 q1, q7, r1 -; CHECK-NEXT: vadd.i32 q7, q2, r1 -; CHECK-NEXT: vmov lr, s7 -; CHECK-NEXT: vmov r8, s6 -; CHECK-NEXT: vmov r6, s5 -; CHECK-NEXT: vmov r5, s4 -; CHECK-NEXT: vldrw.u32 q1, [sp, #256] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, 
r1 -; CHECK-NEXT: vmov r4, s4 -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q0[15], r3 -; CHECK-NEXT: vmul.i8 q6, q5, q0 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vmul.i8 q0, q5, q0 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmov.u8 r7, q0[2] -; CHECK-NEXT: strb r3, [r4] -; CHECK-NEXT: vmov r3, s5 -; CHECK-NEXT: vmov.u8 r4, q0[1] -; CHECK-NEXT: strb r4, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: strb r7, [r3] -; CHECK-NEXT: vmov r7, s12 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: strb r3, [r4] -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: strb r3, [r5] -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: strb r3, [r6] -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: strb.w r3, [r8] -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: strb.w r3, [lr] -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.u8 lr, q0[15] -; CHECK-NEXT: vmov.u8 r8, q0[14] -; CHECK-NEXT: vmov.u8 r6, q0[13] -; CHECK-NEXT: vmov.u8 r5, q0[12] -; CHECK-NEXT: vmov.u8 r4, q0[11] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s13 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s14 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vldrw.u32 q0, [sp, #240] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #112] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #224] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #96] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #192] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #208] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #128] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q0, r1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #144] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q2, q0, r1 -; CHECK-NEXT: vldrw.u32 
q0, [sp, #160] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q0, r1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #176] @ 16-byte Reload -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r3, s15 -; CHECK-NEXT: vadd.i32 q0, q0, r1 -; CHECK-NEXT: vmov r7, s4 +; CHECK-NEXT: vmul.i8 q6, q5, q6 +; CHECK-NEXT: vmul.i8 q5, q5, q0 +; CHECK-NEXT: vstrb.8 q5, [r1, q2] +; CHECK-NEXT: vstrb.8 q6, [r1, q3] +; CHECK-NEXT: vstrb.8 q4, [r1, q1] ; CHECK-NEXT: add.w r1, r1, #48 -; CHECK-NEXT: strb r4, [r3] -; CHECK-NEXT: vmov r3, s0 -; CHECK-NEXT: vmov r4, s2 -; CHECK-NEXT: strb r5, [r3] -; CHECK-NEXT: vmov r5, s1 -; CHECK-NEXT: vmov r3, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: strb r6, [r5] -; CHECK-NEXT: strb.w r8, [r4] -; CHECK-NEXT: strb.w lr, [r3] -; CHECK-NEXT: vmov.u8 r3, q6[0] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s5 -; CHECK-NEXT: vmov.u8 r3, q6[1] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s6 -; CHECK-NEXT: vmov.u8 r3, q6[2] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s7 -; CHECK-NEXT: vmov.u8 r3, q6[3] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s8 -; CHECK-NEXT: vmov.u8 r3, q6[4] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s9 -; CHECK-NEXT: vmov.u8 r3, q6[5] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s10 -; CHECK-NEXT: vmov.u8 r3, q6[6] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s11 -; CHECK-NEXT: vmov.u8 r3, q6[7] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s20 -; CHECK-NEXT: vmov.u8 r3, q6[8] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s21 -; CHECK-NEXT: vmov.u8 r3, q6[9] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s22 -; CHECK-NEXT: vmov.u8 r3, q6[10] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s23 -; CHECK-NEXT: vmov.u8 r3, q6[11] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s28 -; CHECK-NEXT: vmov.u8 r3, q6[12] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s29 -; CHECK-NEXT: vmov.u8 r3, q6[13] -; CHECK-NEXT: strb r3, [r7] -; 
CHECK-NEXT: vmov r7, s30 -; CHECK-NEXT: vmov.u8 r3, q6[14] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s31 -; CHECK-NEXT: vmov.u8 r3, q6[15] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u8 r3, q4[0] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q4[1] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q4[2] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vmov.u8 r3, q4[3] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u8 r3, q4[4] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q4[5] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q4[6] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload -; CHECK-NEXT: vmov.u8 r3, q4[7] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u8 r3, q4[8] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q4[9] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q4[10] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vldrw.u32 q0, [sp, #112] @ 16-byte Reload -; CHECK-NEXT: vmov.u8 r3, q4[11] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s0 -; CHECK-NEXT: vmov.u8 r3, q4[12] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s1 -; CHECK-NEXT: vmov.u8 r3, q4[13] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s2 -; CHECK-NEXT: vmov.u8 r3, q4[14] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: vmov r7, s3 -; CHECK-NEXT: vmov.u8 r3, q4[15] -; CHECK-NEXT: strb r3, [r7] -; CHECK-NEXT: bne.w .LBB13_1 +; CHECK-NEXT: bne .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %end -; CHECK-NEXT: add sp, #280 -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, pc} 
+; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.3: ; CHECK-NEXT: .LCPI13_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 3 @ 0x3 -; CHECK-NEXT: .long 6 @ 0x6 -; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .byte 1 @ 0x1 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 10 @ 0xa +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 22 @ 0x16 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 34 @ 0x22 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 46 @ 0x2e ; CHECK-NEXT: .LCPI13_1: -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .long 41 @ 0x29 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 47 @ 0x2f +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d ; CHECK-NEXT: .LCPI13_2: -; CHECK-NEXT: .long 26 @ 0x1a -; CHECK-NEXT: .long 29 @ 0x1d -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .LCPI13_3: -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .LCPI13_4: -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .long 17 @ 0x11 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: .LCPI13_5: -; CHECK-NEXT: .long 12 @ 0xc -; CHECK-NEXT: .long 15 @ 0xf -; CHECK-NEXT: .long 18 @ 0x12 -; CHECK-NEXT: .long 21 @ 0x15 -; CHECK-NEXT: 
.LCPI13_6: -; CHECK-NEXT: .long 24 @ 0x18 -; CHECK-NEXT: .long 27 @ 0x1b -; CHECK-NEXT: .long 30 @ 0x1e -; CHECK-NEXT: .long 33 @ 0x21 -; CHECK-NEXT: .LCPI13_7: -; CHECK-NEXT: .long 36 @ 0x24 -; CHECK-NEXT: .long 39 @ 0x27 -; CHECK-NEXT: .long 42 @ 0x2a -; CHECK-NEXT: .long 45 @ 0x2d -; CHECK-NEXT: .LCPI13_8: -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 4 @ 0x4 -; CHECK-NEXT: .long 7 @ 0x7 -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .LCPI13_9: -; CHECK-NEXT: .long 13 @ 0xd -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 19 @ 0x13 -; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .LCPI13_10: -; CHECK-NEXT: .long 25 @ 0x19 -; CHECK-NEXT: .long 28 @ 0x1c -; CHECK-NEXT: .long 31 @ 0x1f -; CHECK-NEXT: .long 34 @ 0x22 -; CHECK-NEXT: .LCPI13_11: -; CHECK-NEXT: .long 37 @ 0x25 -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 43 @ 0x2b -; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .byte 2 @ 0x2 +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f vector.ph: br label %vector.body Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -226,48 +226,20 @@ ; CHECK-LABEL: scaled_v8i16_i16_2gep2: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI9_0 -; CHECK-NEXT: vmov.u16 r2, q0[0] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vadd.i32 q2, q1, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: strh r2, [r1] -; CHECK-NEXT: adr r1, .LCPI9_1 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: 
vmov.u16 r1, q0[1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI9_0: -; CHECK-NEXT: .long 40 @ 0x28 -; CHECK-NEXT: .long 46 @ 0x2e -; CHECK-NEXT: .long 52 @ 0x34 -; CHECK-NEXT: .long 58 @ 0x3a -; CHECK-NEXT: .LCPI9_1: -; CHECK-NEXT: .long 64 @ 0x40 -; CHECK-NEXT: .long 70 @ 0x46 -; CHECK-NEXT: .long 76 @ 0x4c -; CHECK-NEXT: .long 82 @ 0x52 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 entry: %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -268,27 +268,15 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: adr r1, .LCPI16_0 ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, 
[r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI16_0: -; CHECK-NEXT: .long 10 @ 0xa -; CHECK-NEXT: .long 16 @ 0x10 -; CHECK-NEXT: .long 22 @ 0x16 -; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe entry: %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5 Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -613,92 +613,29 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8_2gep2: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} ; CHECK-NEXT: adr r1, .LCPI11_0 -; CHECK-NEXT: vmov.u8 r2, q0[0] ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: adr r3, .LCPI11_3 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r3] -; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: strb r2, [r1] -; CHECK-NEXT: adr r1, .LCPI11_1 -; CHECK-NEXT: adr r2, .LCPI11_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r2] -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, 
q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vstrb.8 q0, [r0, q1] ; CHECK-NEXT: bx lr ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.1: ; CHECK-NEXT: .LCPI11_0: -; CHECK-NEXT: .long 5 @ 0x5 -; CHECK-NEXT: .long 8 @ 0x8 -; CHECK-NEXT: .long 11 @ 0xb -; CHECK-NEXT: .long 14 @ 0xe -; CHECK-NEXT: .LCPI11_1: -; CHECK-NEXT: .long 17 @ 0x11 -; CHECK-NEXT: .long 20 @ 0x14 -; CHECK-NEXT: .long 23 @ 0x17 -; CHECK-NEXT: .long 26 @ 0x1a -; CHECK-NEXT: .LCPI11_2: -; CHECK-NEXT: .long 29 @ 0x1d -; CHECK-NEXT: .long 32 @ 0x20 -; CHECK-NEXT: .long 35 @ 0x23 -; CHECK-NEXT: .long 38 @ 0x26 -; CHECK-NEXT: .LCPI11_3: -; CHECK-NEXT: .long 41 @ 0x29 -; CHECK-NEXT: .long 44 @ 0x2c -; CHECK-NEXT: .long 47 @ 0x2f -; CHECK-NEXT: .long 50 @ 0x32 +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; 
CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 entry: %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5