diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,7 +84,7 @@
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
-  Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+  Value *checkGEP(Value *&Offsets, FixedVectorType *Ty, GetElementPtrInst *GEP,
                   IRBuilder<> &Builder);
   // Compute the scale of this gather/scatter instruction
   int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
@@ -132,6 +132,11 @@
   Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
                                         Value *Ptr, unsigned TypeScale,
                                         IRBuilder<> &Builder);
+
+  // Optimise the base and offsets of the given address
+  bool optimiseAddress(Value *Address, BasicBlock *BB, LoopInfo *LI);
+  // Try to fold consecutive geps together into one
+  Value *foldGEP(GetElementPtrInst *GEP, Value *&Offsets, IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -167,7 +172,49 @@
   return false;
 }
 
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+bool checkOffsetSize(Value *Offsets, unsigned TargetElemCount) {
+  // Offsets that are not of type <N x i32> are sign extended by the
+  // getelementptr instruction, and MVE gathers/scatters treat the offset as
+  // unsigned. Thus, if the element size is smaller than 32, we can only allow
+  // positive offsets - i.e., the offsets are not allowed to be variables we
+  // can't look into.
+  // Additionally, offsets have to either originate from a zext of a
+  // vector with element types smaller or equal the type of the gather we're
+  // looking at, or consist of constants that we can check are small enough
+  // to fit into the gather type.
+  // Thus we check that 0 < value < 2^TargetElemSize.
+  unsigned TargetElemSize = 128 / TargetElemCount;
+  unsigned OffsetElemSize = cast<FixedVectorType>(Offsets->getType())
+                                ->getElementType()
+                                ->getScalarSizeInBits();
+  if (OffsetElemSize != TargetElemSize || OffsetElemSize != 32) {
+    Constant *ConstOff = dyn_cast<Constant>(Offsets);
+    if (!ConstOff)
+      return false;
+    int64_t TargetElemMaxSize = (1ULL << TargetElemSize);
+    auto CheckValueSize = [TargetElemMaxSize](Value *OffsetElem) {
+      ConstantInt *OConst = dyn_cast<ConstantInt>(OffsetElem);
+      if (!OConst)
+        return false;
+      int SExtValue = OConst->getSExtValue();
+      if (SExtValue >= TargetElemMaxSize || SExtValue < 0)
+        return false;
+      return true;
+    };
+    if (isa<FixedVectorType>(ConstOff->getType())) {
+      for (unsigned i = 0; i < TargetElemCount; i++) {
+        if (!CheckValueSize(ConstOff->getAggregateElement(i)))
+          return false;
+      }
+    } else {
+      if (!CheckValueSize(ConstOff))
+        return false;
+    }
+  }
+  return true;
+}
+
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, FixedVectorType *Ty,
                                           GetElementPtrInst *GEP,
                                           IRBuilder<> &Builder) {
   if (!GEP) {
@@ -178,40 +225,43 @@
     LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
<< " Looking at intrinsic for base + vector of offsets\n"); Value *GEPPtr = GEP->getPointerOperand(); - if (GEPPtr->getType()->isVectorTy()) { + Offsets = GEP->getOperand(1); + if (GEPPtr->getType()->isVectorTy() || + !isa(Offsets->getType())) return nullptr; - } + if (GEP->getNumOperands() != 2) { LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many" << " operands. Expanding.\n"); return nullptr; } Offsets = GEP->getOperand(1); + unsigned OffsetsElemCount = + cast(Offsets->getType())->getNumElements(); // Paranoid check whether the number of parallel lanes is the same - assert(cast(Ty)->getNumElements() == - cast(Offsets->getType())->getNumElements()); - // Only offsets can be integrated into an arm gather, any smaller - // type would have to be sign extended by the gep - and arm gathers can only - // zero extend. Additionally, the offsets do have to originate from a zext of - // a vector with element types smaller or equal the type of the gather we're - // looking at - if (Offsets->getType()->getScalarSizeInBits() != 32) - return nullptr; - if (ZExtInst *ZextOffs = dyn_cast(Offsets)) + assert(Ty->getNumElements() == OffsetsElemCount); + + ZExtInst *ZextOffs = dyn_cast(Offsets); + if (ZextOffs) Offsets = ZextOffs->getOperand(0); - else if (!(cast(Offsets->getType())->getNumElements() == 4 && - Offsets->getType()->getScalarSizeInBits() == 32)) - return nullptr; + FixedVectorType *OffsetType = cast(Offsets->getType()); + + // If the offsets are already being zext-ed to , that relieves us of + // having to make sure that they won't overflow. + if (!ZextOffs || cast(ZextOffs->getDestTy()) + ->getElementType() + ->getScalarSizeInBits() != 32) + if (!checkOffsetSize(Offsets, OffsetsElemCount)) + return nullptr; + // The offset sizes have been checked; if any truncating or zext-ing is + // required to fix them, do that now if (Ty != Offsets->getType()) { - if ((Ty->getScalarSizeInBits() < - Offsets->getType()->getScalarSizeInBits())) { - LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type." 
- << " Can't create intrinsic.\n"); - return nullptr; + if ((Ty->getElementType()->getScalarSizeInBits() < + OffsetType->getElementType()->getScalarSizeInBits())) { + Offsets = Builder.CreateTrunc(Offsets, Ty); } else { - Offsets = Builder.CreateZExt( - Offsets, VectorType::getInteger(cast(Ty))); + Offsets = Builder.CreateZExt(Offsets, VectorType::getInteger(Ty)); } } // If none of the checks failed, return the gep's base pointer @@ -426,7 +476,8 @@ GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast(ResultTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -566,7 +617,8 @@ GetElementPtrInst *GEP = dyn_cast(Ptr); Value *Offsets; - Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder); + Value *BasePtr = + checkGEP(Offsets, cast(InputTy), GEP, Builder); if (!BasePtr) return nullptr; // Check whether the offset is a constant increment that could be merged into @@ -978,6 +1030,128 @@ return true; } +Value *CheckAndCreateOffsetAdd(Value *X, Value *Y, Value *GEP, + IRBuilder<> &Builder) { + + // Splat the non-vector value to a vector of the given type - if the value is + // a constant (and its value isn't too big), we can even use this opportunity + // to scale it to the size of the vector elements + auto FixSummands = [&Builder](FixedVectorType *&VT, Value *&NonVectorVal) { + ConstantInt *Const; + if ((Const = dyn_cast(NonVectorVal)) && + VT->getElementType() != NonVectorVal->getType()) { + unsigned TargetElemSize = VT->getElementType()->getPrimitiveSizeInBits(); + uint64_t N = Const->getZExtValue(); + if (N < (unsigned)(1 << (TargetElemSize - 1))) { + NonVectorVal = Builder.CreateVectorSplat( + VT->getNumElements(), Builder.getIntN(TargetElemSize, N)); + return; + } + } + NonVectorVal = + Builder.CreateVectorSplat(VT->getNumElements(), NonVectorVal); + }; + + FixedVectorType *XElType = dyn_cast(X->getType()); + FixedVectorType *YElType = dyn_cast(Y->getType()); + // If one of X, Y is not a vector, we have to splat it in order + // to add the two of them. 
+  if (XElType && !YElType) {
+    FixSummands(XElType, Y);
+    YElType = cast<FixedVectorType>(Y->getType());
+  } else if (YElType && !XElType) {
+    FixSummands(YElType, X);
+    XElType = cast<FixedVectorType>(X->getType());
+  }
+  // Check that the summands are of compatible types
+  if (XElType != YElType) {
+    LLVM_DEBUG(dbgs() << "masked gathers/scatters: incompatible gep offsets\n");
+    return nullptr;
+  }
+
+  if (XElType->getElementType()->getScalarSizeInBits() != 32) {
+    // Check that by adding the vectors we do not accidentally
+    // create an overflow
+    Constant *ConstX = dyn_cast<Constant>(X);
+    Constant *ConstY = dyn_cast<Constant>(Y);
+    if (!ConstX || !ConstY)
+      return nullptr;
+    unsigned TargetElemSize = 128 / XElType->getNumElements();
+    for (unsigned i = 0; i < XElType->getNumElements(); i++) {
+      ConstantInt *ConstXEl =
+          dyn_cast<ConstantInt>(ConstX->getAggregateElement(i));
+      ConstantInt *ConstYEl =
+          dyn_cast<ConstantInt>(ConstY->getAggregateElement(i));
+      if (!ConstXEl || !ConstYEl ||
+          ConstXEl->getZExtValue() + ConstYEl->getZExtValue() >=
+              (unsigned)(1 << (TargetElemSize - 1)))
+        return nullptr;
+    }
+  }
+
+  Value *Add = Builder.CreateAdd(X, Y);
+
+  FixedVectorType *GEPType = cast<FixedVectorType>(GEP->getType());
+  if (checkOffsetSize(Add, GEPType->getNumElements()))
+    return Add;
+  else
+    return nullptr;
+}
+
+Value *MVEGatherScatterLowering::foldGEP(GetElementPtrInst *GEP,
+                                         Value *&Offsets,
+                                         IRBuilder<> &Builder) {
+  Value *GEPPtr = GEP->getPointerOperand();
+  Offsets = GEP->getOperand(1);
+  // We only merge geps with constant offsets, because only for those
+  // we can make sure that we do not cause an overflow
+  if (!isa<Constant>(Offsets))
+    return nullptr;
+  GetElementPtrInst *BaseGEP;
+  if ((BaseGEP = dyn_cast<GetElementPtrInst>(GEPPtr))) {
+    // Merge the two geps into one
+    Value *BaseBasePtr = foldGEP(BaseGEP, Offsets, Builder);
+    if (!BaseBasePtr)
+      return nullptr;
+    Offsets =
+        CheckAndCreateOffsetAdd(Offsets, GEP->getOperand(1), GEP, Builder);
+    if (Offsets == nullptr)
+      return nullptr;
+    return BaseBasePtr;
+  }
+  return GEPPtr;
+}
+
+bool MVEGatherScatterLowering::optimiseAddress(Value *Address, BasicBlock *BB,
+                                               LoopInfo *LI) {
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Address);
+  if (!GEP)
+    return false;
+  bool Changed = false;
+  if (GEP->hasOneUse() &&
+      dyn_cast<GetElementPtrInst>(GEP->getPointerOperand())) {
+    IRBuilder<> Builder(GEP->getContext());
+    Builder.SetInsertPoint(GEP);
+    Builder.SetCurrentDebugLocation(GEP->getDebugLoc());
+    Value *Offsets;
+    Value *Base = foldGEP(GEP, Offsets, Builder);
+    // We only want to merge the geps if there is a real chance that they can be
+    // used by an MVE gather; thus the offset has to have the correct size
+    // (always i32 if it is not of vector type) and the base has to be a
+    // pointer.
+    if (Offsets && Base && Base != GEP) {
+      PointerType *BaseType = cast<PointerType>(Base->getType());
+      GetElementPtrInst *NewAddress = GetElementPtrInst::Create(
+          BaseType->getPointerElementType(), Base, Offsets, "gep.merged", GEP);
+      GEP->replaceAllUsesWith(NewAddress);
+      GEP = NewAddress;
+      Changed = true;
+    }
+  }
+  Changed |= optimiseOffsets(GEP->getOperand(1), GEP->getParent(), LI);
+  return Changed;
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -995,22 +1169,17 @@
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
       IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
-      if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
+      if (II && II->getIntrinsicID() == Intrinsic::masked_gather &&
+          isa<FixedVectorType>(II->getType())) {
         Gathers.push_back(II);
-        if (isa<GetElementPtrInst>(II->getArgOperand(0)))
-          Changed |= optimiseOffsets(
-              cast<GetElementPtrInst>(II->getArgOperand(0))->getOperand(1),
-              II->getParent(), LI);
-      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+        Changed |= optimiseAddress(II->getArgOperand(0), II->getParent(), LI);
+      } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter &&
+                 isa<FixedVectorType>(II->getArgOperand(0)->getType())) {
         Scatters.push_back(II);
-        if (isa<GetElementPtrInst>(II->getArgOperand(1)))
-          Changed |= optimiseOffsets(
-              cast<GetElementPtrInst>(II->getArgOperand(1))->getOperand(1),
-              II->getParent(), LI);
+        Changed |= optimiseAddress(II->getArgOperand(1), II->getParent(), LI);
       }
     }
   }
-
   for (unsigned i = 0; i < Gathers.size(); i++) {
     IntrinsicInst *I = Gathers[i];
     Value *L = lowerGather(I);
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-scaled.ll
@@ -266,6 +266,430 @@
   ret <8 x i16> %gather
 }
 
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr) {
+; CHECK-LABEL: scaled_v8i16_i16_2gep:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrh.s32 q0, [r1]
+; CHECK-NEXT:    vmov.i32 q1, #0x28
+; CHECK-NEXT:    vldrh.s32 q2, [r1, #8]
+; CHECK-NEXT:    vshl.i32 q0, q0, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vshl.i32 q2, q2, #1
+; CHECK-NEXT:    vadd.i32 q0, q0, q1
+; CHECK-NEXT:    vadd.i32 q2, q2, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vadd.i32 q1, q2, q1
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrh.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrh.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrh r5, [r5]
+; CHECK-NEXT:    ldrh r0, [r0]
+; CHECK-NEXT:    ldrh r1, [r1]
+; CHECK-NEXT:    ldrh r4, [r4]
+; CHECK-NEXT:    ldrh r2, [r2]
+; CHECK-NEXT:    ldrh r3, [r3]
+; CHECK-NEXT:    vmov.16 q0[0], r2
+; CHECK-NEXT:    vmov.16 q0[1], r5
+; CHECK-NEXT:    vmov.16 q0[2], r12
+; CHECK-NEXT:    vmov.16 q0[3], lr
+; CHECK-NEXT:    vmov.16 q0[4], r0
+; CHECK-NEXT:    vmov.16 q0[5], r1
+; CHECK-NEXT:    vmov.16 q0[6], r3
+; CHECK-NEXT:    vmov.16 q0[7], r4
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
+  %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs
+  %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20
+  %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef)
+  ret <8 x i16> %gather
+}
+
+define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr) {
+;
CHECK-LABEL: scaled_v8i16_i16_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 20 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep2(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrh.u16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep3(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: adr.w r12, .LCPI17_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; 
CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 131078 @ 0x20006 +; CHECK-NEXT: .long 131084 @ 0x2000c +; CHECK-NEXT: .long 131090 @ 0x20012 +; CHECK-NEXT: .LCPI17_1: +; CHECK-NEXT: .long 131096 @ 0x20018 +; CHECK-NEXT: .long 131102 @ 0x2001e +; CHECK-NEXT: .long 131108 @ 0x20024 +; CHECK-NEXT: .long 131114 @ 0x2002a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep4(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI18_0 +; CHECK-NEXT: adr.w r12, .LCPI18_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI18_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .LCPI18_1: +; CHECK-NEXT: .long 24 @ 0x18 +; CHECK-NEXT: .long 131072 @ 0x20000 +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 42 @ 0x2a +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep5(<8 x i16*> %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vmov.i32 q2, #0x20000 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov r0, s2 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: vmov.16 q0[5], r0 +; CHECK-NEXT: 
vmov.16 q0[6], r1 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %base, i32 65536 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep6(i16* %base) { +; CHECK-LABEL: scaled_v8i16_i16_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI20_0 +; CHECK-NEXT: adr.w r12, .LCPI20_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI20_0: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 14 @ 0xe +; CHECK-NEXT: .long 20 @ 0x14 +; CHECK-NEXT: .LCPI20_1: +; CHECK-NEXT: .long 131074 @ 0x20002 +; CHECK-NEXT: .long 32 @ 0x20 +; CHECK-NEXT: .long 38 @ 0x26 +; CHECK-NEXT: .long 44 @ 0x2c +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i32> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i32 1 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_biggep7(i16* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: adr.w r12, .LCPI21_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w lr, [r1] +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: ldrh r6, [r2] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: vmov.16 q0[0], r3 +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: vmov.16 q0[2], lr +; CHECK-NEXT: vmov.16 q0[3], r6 +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r2 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 128 @ 0x80 +; CHECK-NEXT: .long 1206 @ 0x4b6 +; CHECK-NEXT: .long 1212 @ 0x4bc +; CHECK-NEXT: .long 1218 @ 0x4c2 +; CHECK-NEXT: .LCPI21_1: +; 
CHECK-NEXT: .long 1224 @ 0x4c8 +; CHECK-NEXT: .long 1230 @ 0x4ce +; CHECK-NEXT: .long 1236 @ 0x4d4 +; CHECK-NEXT: .long 1242 @ 0x4da +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16,<8 x i16*> %ptrs, i16 600 + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs2, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + +define arm_aapcs_vfpcc <8 x i16> @scaled_v8i16_i16_basei32(i32* %base, <8 x i16>* %offptr) { +; CHECK-LABEL: scaled_v8i16_i16_basei32: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: vldrh.u32 q0, [r1] +; CHECK-NEXT: vldrh.u32 q1, [r1, #8] +; CHECK-NEXT: vshl.i32 q0, q0, #2 +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov r5, s1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov r1, s5 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrh.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrh.w lr, [r3] +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: ldrh r5, [r5] +; CHECK-NEXT: ldrh r0, [r0] +; CHECK-NEXT: ldrh r1, [r1] +; CHECK-NEXT: ldrh r4, [r4] +; CHECK-NEXT: ldrh r2, [r2] +; CHECK-NEXT: ldrh r3, [r3] +; CHECK-NEXT: vmov.16 q0[0], r2 +; CHECK-NEXT: vmov.16 q0[1], r5 +; CHECK-NEXT: vmov.16 q0[2], r12 +; CHECK-NEXT: vmov.16 q0[3], lr +; CHECK-NEXT: vmov.16 q0[4], r0 +; CHECK-NEXT: vmov.16 q0[5], r1 +; CHECK-NEXT: vmov.16 q0[6], r3 +; CHECK-NEXT: vmov.16 q0[7], r4 +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %offs.zext = zext <8 x i16> %offs to <8 x i32> + %ptrs = getelementptr inbounds i32, i32* %base, <8 x i32> %offs.zext + %ptrs.cast = bitcast <8 x i32*> %ptrs to <8 x i16*> + %gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %ptrs.cast, i32 2, <8 x i1> , <8 x i16> undef) + ret <8 x i16> %gather +} + declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) #1 declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) #1 declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) #1 diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -294,6 +294,45 @@ ret <4 x i32> %gather.sext } +define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep(i32* %base, <4 x i32>* %offptr) { +; CHECK-LABEL: scaled_i32_i32_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vmov.i32 q0, #0x14 +; CHECK-NEXT: vshl.i32 q1, q1, #2 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q0 +; CHECK-NEXT: vldrw.u32 q0, [q1] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs + %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + +define arm_aapcs_vfpcc <4 x i32> @scaled_i32_i32_2gep2(i32* %base, <4 x i32>* %offptr) { +; CHECK-LABEL: scaled_i32_i32_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI21_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrw.u32 q0, [r0, q1, uxtw #2] +; 
CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI21_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +entry: + %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> + %ptrs2 = getelementptr inbounds i32, <4 x i32*> %ptrs, i32 5 + %gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %ptrs2, i32 4, <4 x i1> , <4 x i32> undef) + ret <4 x i32> %gather +} + declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) declare <4 x i16> @llvm.masked.gather.v4i16.v4p0i16(<4 x i16*>, i32, <4 x i1>, <4 x i16>) declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind8-unscaled.ll @@ -365,6 +365,812 @@ ret <16 x i8> %gather } +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, r7, lr} +; CHECK-NEXT: push {r4, r5, r6, r7, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov.i32 q2, #0x5 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r3, [r2] +; CHECK-NEXT: vmov r2, s2 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vadd.i32 q1, q0, q2 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vadd.i32 q3, q0, q2 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: vmov r7, s15 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r7, [r7] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov.8 q0[3], r7 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r4 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], r3 +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r12 +; CHECK-NEXT: vmov.8 q0[15], lr +; CHECK-NEXT: pop {r4, r5, r6, r7, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define 
arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI8_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 5 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep2(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI10_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep3(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: adr r2, .LCPI11_1 
+; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI11_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI11_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 280 @ 0x118 +; CHECK-NEXT: .long 283 @ 0x11b +; CHECK-NEXT: .long 286 @ 0x11e +; CHECK-NEXT: .long 289 @ 0x121 +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 292 @ 0x124 +; CHECK-NEXT: .long 295 @ 0x127 +; CHECK-NEXT: .long 298 @ 0x12a +; CHECK-NEXT: .long 301 @ 0x12d +; CHECK-NEXT: .LCPI11_2: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 259 @ 0x103 +; CHECK-NEXT: .long 262 @ 0x106 +; CHECK-NEXT: .long 265 @ 0x109 +; CHECK-NEXT: .LCPI11_3: +; CHECK-NEXT: .long 268 @ 0x10c +; CHECK-NEXT: .long 271 @ 0x10f +; CHECK-NEXT: .long 274 @ 0x112 +; CHECK-NEXT: .long 277 @ 0x115 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep4(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep4: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI12_0 +; CHECK-NEXT: adr r2, .LCPI12_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI12_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb 
r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI12_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .long 256 @ 0x100 +; CHECK-NEXT: .long 27 @ 0x1b +; CHECK-NEXT: .long 30 @ 0x1e +; CHECK-NEXT: .long 33 @ 0x21 +; CHECK-NEXT: .LCPI12_1: +; CHECK-NEXT: .long 36 @ 0x24 +; CHECK-NEXT: .long 39 @ 0x27 +; CHECK-NEXT: .long 42 @ 0x2a +; CHECK-NEXT: .long 45 @ 0x2d +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI12_3: +; CHECK-NEXT: .long 12 @ 0xc +; CHECK-NEXT: .long 15 @ 0xf +; CHECK-NEXT: .long 18 @ 0x12 +; CHECK-NEXT: .long 21 @ 0x15 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep5(<16 x i8*> %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep5: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r7, lr} +; CHECK-NEXT: push {r4, r5, r7, lr} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov.i32 q4, #0x100 +; CHECK-NEXT: vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: vmov r1, s15 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: ldrb.w r12, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r3, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vadd.i32 q3, q0, q4 +; CHECK-NEXT: vmov r5, s12 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[0], r5 +; CHECK-NEXT: vmov r5, s13 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[1], r5 +; CHECK-NEXT: vmov r5, s14 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[2], r5 +; CHECK-NEXT: vmov r5, s15 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[3], r5 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[4], r5 +; CHECK-NEXT: vmov r5, s5 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[5], r5 
+; CHECK-NEXT: vmov r5, s6 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[6], r5 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[7], r5 +; CHECK-NEXT: vmov r5, s9 +; CHECK-NEXT: vmov.8 q0[8], r4 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: vmov.8 q0[9], r5 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], lr +; CHECK-NEXT: vmov.8 q0[12], r2 +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r0 +; CHECK-NEXT: vmov.8 q0[15], r1 +; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: pop {r4, r5, r7, pc} +entry: + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %base, i32 256 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep6(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep6: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI14_0 +; CHECK-NEXT: adr r2, .LCPI14_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI14_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI14_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI14_0: +; CHECK-NEXT: .long 257 @ 0x101 +; CHECK-NEXT: .long 28 @ 0x1c +; CHECK-NEXT: .long 31 @ 0x1f +; CHECK-NEXT: .long 34 @ 0x22 +; CHECK-NEXT: .LCPI14_1: +; CHECK-NEXT: .long 37 @ 0x25 +; CHECK-NEXT: .long 40 @ 0x28 +; CHECK-NEXT: .long 43 @ 0x2b +; CHECK-NEXT: .long 46 @ 0x2e +; CHECK-NEXT: .LCPI14_2: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI14_3: +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 16 @ 0x10 +; CHECK-NEXT: .long 19 @ 0x13 +; CHECK-NEXT: .long 22 @ 0x16 +entry: + %ptrs = getelementptr inbounds 
i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 1 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_biggep7(i8* %base) { +; CHECK-LABEL: unscaled_v16i8_i8_biggep7: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: adr r1, .LCPI15_0 +; CHECK-NEXT: adr r2, .LCPI15_1 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: adr r6, .LCPI15_2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: vmov r1, s6 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s4 +; CHECK-NEXT: vmov r2, s3 +; CHECK-NEXT: vmov r4, s7 +; CHECK-NEXT: ldrb.w r12, [r1] +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r1] +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: ldrb r3, [r1] +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: adr r6, .LCPI15_3 +; CHECK-NEXT: vldrw.u32 q2, [r6] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r1, [r1] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r5 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r4 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r3 +; CHECK-NEXT: vmov.8 q0[14], r1 +; CHECK-NEXT: vmov.8 q0[15], r2 +; CHECK-NEXT: pop {r4, r5, r6, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI15_0: +; CHECK-NEXT: .long 224 @ 0xe0 +; CHECK-NEXT: .long 227 @ 0xe3 +; CHECK-NEXT: .long 230 @ 0xe6 +; CHECK-NEXT: .long 233 @ 0xe9 +; CHECK-NEXT: .LCPI15_1: +; CHECK-NEXT: .long 236 @ 0xec +; CHECK-NEXT: .long 239 @ 0xef +; CHECK-NEXT: .long 242 @ 0xf2 +; CHECK-NEXT: .long 245 @ 0xf5 +; CHECK-NEXT: .LCPI15_2: +; CHECK-NEXT: .long 300 @ 0x12c +; CHECK-NEXT: .long 203 @ 0xcb +; CHECK-NEXT: .long 206 @ 0xce +; CHECK-NEXT: .long 209 @ 0xd1 +; CHECK-NEXT: .LCPI15_3: +; CHECK-NEXT: .long 212 @ 0xd4 +; CHECK-NEXT: .long 215 @ 0xd7 +; CHECK-NEXT: .long 218 @ 0xda +; CHECK-NEXT: .long 221 @ 0xdd +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i32> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i32 200 + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs2, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_2(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.s32 q0, [r1, #8] +; CHECK-NEXT: 
vldrb.s32 q2, [r1, #4] +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.s32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.s32 q0, [r1] +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_i8_3(i8* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_i8_3: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI17_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vldrb.u8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI17_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + +define arm_aapcs_vfpcc <16 x i8> @unscaled_v16i8_basei16(i16* %base, <16 x i8>* %offptr) { +; CHECK-LABEL: unscaled_v16i8_basei16: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: vldrb.u32 q0, [r1, #8] +; CHECK-NEXT: vldrb.u32 q2, [r1, #4] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrb.u32 q0, [r1, #12] +; CHECK-NEXT: vmov r2, s6 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: 
vshl.i32 q0, q0, #1 +; CHECK-NEXT: vmov r6, s4 +; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov r5, s7 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov r4, s3 +; CHECK-NEXT: ldrb.w r12, [r2] +; CHECK-NEXT: vmov r2, s0 +; CHECK-NEXT: ldrb r6, [r6] +; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb r3, [r3] +; CHECK-NEXT: ldrb r4, [r4] +; CHECK-NEXT: ldrb.w lr, [r2] +; CHECK-NEXT: vmov r2, s1 +; CHECK-NEXT: vldrb.u32 q0, [r1] +; CHECK-NEXT: vshl.i32 q0, q0, #1 +; CHECK-NEXT: vadd.i32 q3, q0, r0 +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: ldrb r2, [r2] +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[0], r0 +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[1], r0 +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[2], r0 +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[3], r0 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[4], r0 +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[5], r0 +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[6], r0 +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[7], r0 +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.8 q0[8], r6 +; CHECK-NEXT: ldrb r0, [r0] +; CHECK-NEXT: vmov.8 q0[9], r0 +; CHECK-NEXT: vmov.8 q0[10], r12 +; CHECK-NEXT: vmov.8 q0[11], r5 +; CHECK-NEXT: vmov.8 q0[12], lr +; CHECK-NEXT: vmov.8 q0[13], r2 +; CHECK-NEXT: vmov.8 q0[14], r3 +; CHECK-NEXT: vmov.8 q0[15], r4 +; CHECK-NEXT: pop {r4, r5, r6, pc} +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %offs.zext = zext <16 x i8> %offs to <16 x i32> + %ptrs = getelementptr inbounds i16, i16* %base, <16 x i32> %offs.zext + %ptrs.cast = bitcast <16 x i16*> %ptrs to <16 x i8*> + %gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %ptrs.cast, i32 1, <16 x i1> , <16 x i8> undef) + ret <16 x i8> %gather +} + declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>) declare <2 x i8> @llvm.masked.gather.v2i8.v2p0i8(<2 x i8*>, i32, <2 x i1>, <2 x i8>) diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/mve-gather-scatter-ptr-address.ll @@ -0,0 +1,1012 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py + +; RUN: llc -mtriple=thumbv8.1m.main-none-none-eabi -mattr=+mve.fp -enable-arm-maskedgatscat %s -o 2>/dev/null - | FileCheck %s + +define void @ptr_iv_v4i32(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) { +; CHECK-LABEL: ptr_iv_v4i32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI0_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB0_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB0_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4 @ 0x4 +; 
CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 12 @ 0xc +vector.ph: + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i32, i32* %pointer.phi, i32 16 + %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> + %2 = getelementptr i32, i32* %pointer.phi13, i32 16 + %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %1, i32 4, <4 x i1> , <4 x i32> undef) + %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %3, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4i32_mult(i32* noalias nocapture readonly %A, i32* noalias nocapture %B, i32 %y) { +; CHECK-LABEL: ptr_iv_v4i32_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r1, .LCPI1_0 +; CHECK-NEXT: adr r3, .LCPI1_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB1_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.i32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB1_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .LCPI1_1: +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf +vector.ph: + %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %y, i32 0 + %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i32* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i32, i32* %pointer.phi, i32 16 + %1 = getelementptr i32, i32* %pointer.phi, <4 x i32> + %gather.address = getelementptr i32, <4 x i32*> %1, i32 3 + %2 = getelementptr i32, i32* %pointer.phi13, i32 16 + %3 = getelementptr i32, i32* %pointer.phi13, <4 x i32> + %scatter.address = getelementptr i32, <4 x i32*> %1, i32 5 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %gather.address, i32 4, <4 x i1> , <4 x i32> undef) + %4 = add nsw <4 x i32> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %4, <4 x i32*> %scatter.address, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8i16(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; 
CHECK-LABEL: ptr_iv_v8i16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI2_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB2_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB2_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI2_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %1, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %3, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v8i16_mult(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i16 %y) { +; CHECK-LABEL: ptr_iv_v8i16_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI3_0 +; CHECK-NEXT: adr r3, .LCPI3_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB3_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i16 q2, q2, r2 +; CHECK-NEXT: vstrh.16 q2, [r1, q1, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB3_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI3_0: +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 +; CHECK-NEXT: .LCPI3_1: +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f +vector.ph: + %broadcast.splatinsert = insertelement <8 x i16> undef, i16 %y, i32 0 + %broadcast.splat = shufflevector <8 x i16> %broadcast.splatinsert, <8 x i16> 
undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i16* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i16, i16* %pointer.phi, i32 32 + %1 = getelementptr i16, i16* %pointer.phi, <8 x i16> + %gather.address = getelementptr i16, <8 x i16*> %1, i16 3 + %2 = getelementptr i16, i16* %pointer.phi13, i32 32 + %3 = getelementptr i16, i16* %pointer.phi13, <8 x i16> + %scatter.address = getelementptr i16, <8 x i16*> %3, i16 5 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %gather.address, i32 4, <8 x i1> , <8 x i16> undef) + %4 = add nsw <8 x i16> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %4, <8 x i16*> %scatter.address, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v16i8(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI4_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB4_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q1, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q1, q1, r2 +; CHECK-NEXT: vstrb.8 q1, [r1, q0] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB4_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI4_0: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 48 @ 0x30 +; CHECK-NEXT: .byte 52 @ 0x34 +; CHECK-NEXT: .byte 56 @ 0x38 +; CHECK-NEXT: .byte 60 @ 0x3c +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %1, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %3, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define void @ptr_iv_v16i8_mult(i8* noalias nocapture readonly %A, i8* noalias nocapture %B, i8 %y) { +; CHECK-LABEL: ptr_iv_v16i8_mult: +; CHECK: @ %bb.0: @ 
%vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr.w r12, .LCPI5_0 +; CHECK-NEXT: adr r3, .LCPI5_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r12] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB5_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q2, [r0, q0] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.i8 q2, q2, r2 +; CHECK-NEXT: vstrb.8 q2, [r1, q1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB5_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI5_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .byte 49 @ 0x31 +; CHECK-NEXT: .byte 53 @ 0x35 +; CHECK-NEXT: .byte 57 @ 0x39 +; CHECK-NEXT: .byte 61 @ 0x3d +; CHECK-NEXT: .byte 65 @ 0x41 +; CHECK-NEXT: .LCPI5_1: +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 51 @ 0x33 +; CHECK-NEXT: .byte 55 @ 0x37 +; CHECK-NEXT: .byte 59 @ 0x3b +; CHECK-NEXT: .byte 63 @ 0x3f +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <16 x i8> undef, i8 %y, i32 0 + %broadcast.splat = shufflevector <16 x i8> %broadcast.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi i8* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr i8, i8* %pointer.phi, i32 64 + %1 = getelementptr i8, i8* %pointer.phi, <16 x i8> + %gather.address = getelementptr i8, <16 x i8*> %1, i8 3 + %2 = getelementptr i8, i8* %pointer.phi13, i32 64 + %3 = getelementptr i8, i8* %pointer.phi13, <16 x i8> + %scatter.address = getelementptr i8, <16 x i8*> %3, i8 5 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %gather.address, i32 4, <16 x i1> , <16 x i8> undef) + %4 = add nsw <16 x i8> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %4, <16 x i8*> %scatter.address, i32 4, <16 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r3, .LCPI6_0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB6_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q1, [r0, q0, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.f32 q1, q1, r2 +; CHECK-NEXT: vstrw.32 q1, [r1, q0, uxtw #2] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le 
lr, .LBB6_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI6_0: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 12 @ 0xc +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr float, float* %pointer.phi, i32 16 + %1 = getelementptr float, float* %pointer.phi, <4 x i32> + %2 = getelementptr float, float* %pointer.phi13, i32 16 + %3 = getelementptr float, float* %pointer.phi13, <4 x i32> + %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %1, i32 4, <4 x i1> , <4 x float> undef) + %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %3, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v4f32_mult(float* noalias nocapture readonly %A, float* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v4f32_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: adr r1, .LCPI7_0 +; CHECK-NEXT: adr r3, .LCPI7_1 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB7_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q2, [r0, q0, uxtw #2] +; CHECK-NEXT: vadd.f32 q2, q2, r2 +; CHECK-NEXT: vstrw.32 q2, [r0, q1, uxtw #2] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB7_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI7_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .long 13 @ 0xd +; CHECK-NEXT: .long 17 @ 0x11 +; CHECK-NEXT: .LCPI7_1: +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 15 @ 0xf +vector.ph: ; preds = %entry + %broadcast.splatinsert = insertelement <4 x float> undef, float %y, i32 0 + %broadcast.splat = shufflevector <4 x float> %broadcast.splatinsert, <4 x float> undef, <4 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi float* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi float* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr float, float* %pointer.phi, i32 16 + %1 = getelementptr float, float* %pointer.phi, <4 x i32> + %gather.address = getelementptr float, <4 x float*> %1, i32 3 + %2 = getelementptr float, float* %pointer.phi13, i32 16 + %3 = getelementptr float, float* %pointer.phi13, <4 x i32> + %scatter.address = getelementptr float, <4 x float*> %1, i32 5 + %wide.masked.gather = call <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*> %gather.address, i32 4, <4 x i1> , <4 x float> undef) + %4 = fadd <4 x float> %wide.masked.gather, %broadcast.splat + call void 
@llvm.masked.scatter.v4f32.v4p0f32(<4 x float> %4, <4 x float*> %scatter.address, i32 4, <4 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8f16(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v8f16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: adr r3, .LCPI8_0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vmov.f16 r2, s0 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: .LBB8_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q1, [r0, q0, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: vadd.f16 q1, q1, r2 +; CHECK-NEXT: vstrh.16 q1, [r1, q0, uxtw #1] +; CHECK-NEXT: adds r1, #64 +; CHECK-NEXT: le lr, .LBB8_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI8_0: +; CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 24 @ 0x18 +; CHECK-NEXT: .short 28 @ 0x1c +vector.ph: + %y.trunc = fptrunc float %y to half + %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 + %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr half, half* %pointer.phi, i32 32 + %1 = getelementptr half, half* %pointer.phi, <8 x i16> + %2 = getelementptr half, half* %pointer.phi13, i32 32 + %3 = getelementptr half, half* %pointer.phi13, <8 x i16> + %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %1, i32 4, <8 x i1> , <8 x half> undef) + %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %3, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + +define void @ptr_iv_v8f16_mult(half* noalias nocapture readonly %A, half* noalias nocapture %B, float %y) { +; CHECK-LABEL: ptr_iv_v8f16_mult: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: vmov s0, r2 +; CHECK-NEXT: adr r2, .LCPI9_0 +; CHECK-NEXT: vcvtb.f16.f32 s0, s0 +; CHECK-NEXT: mov.w lr, #249 +; CHECK-NEXT: vmov.f16 r1, s0 +; CHECK-NEXT: vldrw.u32 q0, [r2] +; CHECK-NEXT: adr r2, .LCPI9_1 +; CHECK-NEXT: dls lr, lr +; CHECK-NEXT: vldrw.u32 q1, [r2] +; CHECK-NEXT: .LBB9_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q2, [r0, q0, uxtw #1] +; CHECK-NEXT: vadd.f16 q2, q2, r1 +; CHECK-NEXT: vstrh.16 q2, [r0, q1, uxtw #1] +; CHECK-NEXT: adds r0, #64 +; CHECK-NEXT: le lr, .LBB9_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 15 @ 0xf +; 
CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 27 @ 0x1b +; CHECK-NEXT: .short 31 @ 0x1f +; CHECK-NEXT: .LCPI9_1: +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .short 25 @ 0x19 +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 33 @ 0x21 +vector.ph: + %y.trunc = fptrunc float %y to half + %broadcast.splatinsert = insertelement <8 x half> undef, half %y.trunc, i32 0 + %broadcast.splat = shufflevector <8 x half> %broadcast.splatinsert, <8 x half> undef, <8 x i32> zeroinitializer + br label %vector.body + +vector.body: + %pointer.phi = phi half* [ %A, %vector.ph ], [ %0, %vector.body ] + %pointer.phi13 = phi half* [ %B, %vector.ph ], [ %2, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %0 = getelementptr half, half* %pointer.phi, i32 32 + %1 = getelementptr half, half* %pointer.phi, <8 x i16> + %gather.address = getelementptr half, <8 x half*> %1, i32 3 + %2 = getelementptr half, half* %pointer.phi13, i32 32 + %3 = getelementptr half, half* %pointer.phi13, <8 x i16> + %scatter.address = getelementptr half, <8 x half*> %1, i32 5 + %wide.masked.gather = call <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*> %gather.address, i32 4, <8 x i1> , <8 x half> undef) + %4 = fadd <8 x half> %wide.masked.gather, %broadcast.splat + call void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half> %4, <8 x half*> %scatter.address, i32 4, <8 x i1> ) + %index.next = add i32 %index, 4 + %5 = icmp eq i32 %index.next, 996 + br i1 %5, label %end, label %vector.body + +end: + ret void +} + + +define arm_aapcs_vfpcc void @three_pointer_iv_v4i32(i32* nocapture readonly %x, i32* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v4i32: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI10_0 +; CHECK-NEXT: adr.w lr, .LCPI10_1 +; CHECK-NEXT: adr r3, .LCPI10_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: .LBB10_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrw.u32 q4, [r0, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q5, [r0, q2, uxtw #2] +; CHECK-NEXT: vldrw.u32 q6, [r0, q3, uxtw #2] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i32 q6, q5, q6 +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrw.32 q5, [r1, q2, uxtw #2] +; CHECK-NEXT: vstrw.32 q6, [r1, q3, uxtw #2] +; CHECK-NEXT: vstrw.32 q4, [r1, q1, uxtw #2] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB10_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI10_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI10_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI10_2: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i32* [ %x, %vector.ph ], [ 
%v3, %vector.body ] + %pointer.phi55 = phi i32* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i32, i32* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9> + %v3 = getelementptr i32, i32* %pointer.phi, i32 12 + %vector.gep56 = getelementptr i32, i32* %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9> + %v4 = getelementptr i32, i32* %pointer.phi55, i32 12 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 1 + %wide.masked.gather = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %vector.gep, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef) + %v7 = getelementptr inbounds i32, <4 x i32*> %vector.gep, i32 2 + %wide.masked.gather57 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v6, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef) + %wide.masked.gather58 = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*> %v7, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef) + %v11 = mul nuw nsw <4 x i32> %wide.masked.gather, <i32 10, i32 10, i32 10, i32 10> + %v13 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <4 x i32> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v11, <4 x i32*> %vector.gep56, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %v18 = getelementptr inbounds i32, <4 x i32*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v13, <4 x i32*> %v17, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %v15, <4 x i32*> %v18, i32 4, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v4i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v4i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI11_0 +; CHECK-NEXT: adr.w lr, .LCPI11_1 +; CHECK-NEXT: adr r3, .LCPI11_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i32 q0, #0xa +; CHECK-NEXT: .LBB11_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u32 q4, [r0, q1] +; CHECK-NEXT: vldrb.u32 q5, [r0, q2] +; CHECK-NEXT: vldrb.u32 q6, [r0, q3] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i32 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #12 +; CHECK-NEXT: vmul.i32 q6, q5, q6 +; CHECK-NEXT: vmul.i32 q5, q5, q0 +; CHECK-NEXT: vstrb.32 q5, [r1, q2] +; CHECK-NEXT: vstrb.32 q6, [r1, q3] +; CHECK-NEXT: vstrb.32 q4, [r1, q1] +; CHECK-NEXT: add.w r1, r1, #12 +; CHECK-NEXT: bne .LBB11_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .long 1 @ 0x1 +; CHECK-NEXT: .long 4 @ 0x4 +; CHECK-NEXT: .long 7 @ 0x7 +; CHECK-NEXT: .long 10 @ 0xa +; CHECK-NEXT: .LCPI11_1: +; CHECK-NEXT: .long 0 @ 0x0 +; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 6 @ 0x6 +; CHECK-NEXT: .long 9 @ 0x9 +; CHECK-NEXT: .LCPI11_2: +; CHECK-NEXT: .long 2 @ 0x2 +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +vector.ph: + br label %vector.body + +vector.body: ; preds = %vector.body, %vector.ph + %pointer.phi = phi i8* [ %x, 
%vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i8, i8* %pointer.phi, <4 x i32> <i32 0, i32 3, i32 6, i32 9> + %v3 = getelementptr i8, i8* %pointer.phi, i32 12 + %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <4 x i32> <i32 0, i32 3, i32 6, i32 9> + %v4 = getelementptr i8, i8* %pointer.phi55, i32 12 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 1 + %wide.masked.gather = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %vector.gep, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) + %v7 = getelementptr inbounds i8, <4 x i8*> %vector.gep, i32 2 + %wide.masked.gather57 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v6, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) + %wide.masked.gather58 = call <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*> %v7, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i8> undef) + %v8 = zext <4 x i8> %wide.masked.gather to <4 x i32> + %v9 = zext <4 x i8> %wide.masked.gather57 to <4 x i32> + %v10 = zext <4 x i8> %wide.masked.gather58 to <4 x i32> + %v11 = mul nuw nsw <4 x i32> %v8, <i32 10, i32 10, i32 10, i32 10> + %v12 = trunc <4 x i32> %v11 to <4 x i8> + %v13 = mul nuw nsw <4 x i32> %v8, %v9 + %v14 = trunc <4 x i32> %v13 to <4 x i8> + %v15 = mul nuw nsw <4 x i32> %v8, %v10 + %v16 = trunc <4 x i32> %v15 to <4 x i8> + %v17 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v12, <4 x i8*> %vector.gep56, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %v18 = getelementptr inbounds i8, <4 x i8*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v14, <4 x i8*> %v17, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + call void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8> %v16, <4 x i8*> %v18, i32 1, <4 x i1> <i1 true, i1 true, i1 true, i1 true>) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v8i16(i16* nocapture readonly %x, i16* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v8i16: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI12_0 +; CHECK-NEXT: adr.w lr, .LCPI12_1 +; CHECK-NEXT: adr r3, .LCPI12_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i16 q0, #0xa +; CHECK-NEXT: .LBB12_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrh.u16 q4, [r0, q1, uxtw #1] +; CHECK-NEXT: vldrh.u16 q5, [r0, q2, uxtw #1] +; CHECK-NEXT: vldrh.u16 q6, [r0, q3, uxtw #1] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i16 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i16 q6, q5, q6 +; CHECK-NEXT: vmul.i16 q5, q5, q0 +; CHECK-NEXT: vstrh.16 q5, [r1, q2, uxtw #1] +; CHECK-NEXT: vstrh.16 q6, [r1, q3, uxtw #1] +; CHECK-NEXT: vstrh.16 q4, [r1, q1, uxtw #1] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB12_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI12_0: +; CHECK-NEXT: .short 1 @ 0x1 +; CHECK-NEXT: .short 4 @ 0x4 +; CHECK-NEXT: .short 7 @ 0x7 +; CHECK-NEXT: .short 10 @ 0xa +; CHECK-NEXT: .short 13 @ 0xd +; CHECK-NEXT: .short 16 @ 0x10 +; CHECK-NEXT: .short 19 @ 0x13 +; CHECK-NEXT: .short 22 @ 0x16 +; CHECK-NEXT: .LCPI12_1: +; 
CHECK-NEXT: .short 0 @ 0x0 +; CHECK-NEXT: .short 3 @ 0x3 +; CHECK-NEXT: .short 6 @ 0x6 +; CHECK-NEXT: .short 9 @ 0x9 +; CHECK-NEXT: .short 12 @ 0xc +; CHECK-NEXT: .short 15 @ 0xf +; CHECK-NEXT: .short 18 @ 0x12 +; CHECK-NEXT: .short 21 @ 0x15 +; CHECK-NEXT: .LCPI12_2: +; CHECK-NEXT: .short 2 @ 0x2 +; CHECK-NEXT: .short 5 @ 0x5 +; CHECK-NEXT: .short 8 @ 0x8 +; CHECK-NEXT: .short 11 @ 0xb +; CHECK-NEXT: .short 14 @ 0xe +; CHECK-NEXT: .short 17 @ 0x11 +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i16* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i16* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i16, i16* %pointer.phi, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21> + %v3 = getelementptr i16, i16* %pointer.phi, i32 24 + %vector.gep56 = getelementptr i16, i16* %pointer.phi55, <8 x i16> <i16 0, i16 3, i16 6, i16 9, i16 12, i16 15, i16 18, i16 21> + %v4 = getelementptr i16, i16* %pointer.phi55, i32 24 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 1 + %wide.masked.gather = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %vector.gep, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + %v7 = getelementptr inbounds i16, <8 x i16*> %vector.gep, i16 2 + %wide.masked.gather57 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v6, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + %wide.masked.gather58 = call <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*> %v7, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> undef) + %v11 = mul nuw nsw <8 x i16> %wide.masked.gather, <i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10, i16 10> + %v13 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <8 x i16> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v11, <8 x i16*> %vector.gep56, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + %v18 = getelementptr inbounds i16, <8 x i16*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v13, <8 x i16*> %v17, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %v15, <8 x i16*> %v18, i32 2, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +define arm_aapcs_vfpcc void @three_pointer_iv_v16i8(i8* nocapture readonly %x, i8* nocapture %z, i32 %n) { +; CHECK-LABEL: three_pointer_iv_v16i8: +; CHECK: @ %bb.0: @ %vector.ph +; CHECK-NEXT: .save {r7, lr} +; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: adr.w r12, .LCPI13_0 +; CHECK-NEXT: adr.w lr, .LCPI13_1 +; CHECK-NEXT: adr r3, .LCPI13_2 +; CHECK-NEXT: vldrw.u32 q2, [lr] +; CHECK-NEXT: vldrw.u32 q1, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r12] +; CHECK-NEXT: vmov.i8 q0, #0xa +; CHECK-NEXT: .LBB13_1: @ %vector.body +; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: vldrb.u8 q4, [r0, q1] +; CHECK-NEXT: vldrb.u8 q5, [r0, q2] +; CHECK-NEXT: vldrb.u8 q6, [r0, q3] +; CHECK-NEXT: subs r2, #4 +; CHECK-NEXT: vmul.i8 q4, q5, q4 +; CHECK-NEXT: add.w r0, r0, #48 +; CHECK-NEXT: vmul.i8 q6, q5, q6 +; CHECK-NEXT: vmul.i8 q5, q5, q0 +; CHECK-NEXT: vstrb.8 q5, [r1, q2] +; CHECK-NEXT: vstrb.8 q6, [r1, q3] +; CHECK-NEXT: vstrb.8 q4, [r1, q1] +; CHECK-NEXT: add.w r1, r1, #48 +; CHECK-NEXT: bne .LBB13_1 +; CHECK-NEXT: @ %bb.2: @ %end +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r7, 
pc} +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.3: +; CHECK-NEXT: .LCPI13_0: +; CHECK-NEXT: .byte 1 @ 0x1 +; CHECK-NEXT: .byte 4 @ 0x4 +; CHECK-NEXT: .byte 7 @ 0x7 +; CHECK-NEXT: .byte 10 @ 0xa +; CHECK-NEXT: .byte 13 @ 0xd +; CHECK-NEXT: .byte 16 @ 0x10 +; CHECK-NEXT: .byte 19 @ 0x13 +; CHECK-NEXT: .byte 22 @ 0x16 +; CHECK-NEXT: .byte 25 @ 0x19 +; CHECK-NEXT: .byte 28 @ 0x1c +; CHECK-NEXT: .byte 31 @ 0x1f +; CHECK-NEXT: .byte 34 @ 0x22 +; CHECK-NEXT: .byte 37 @ 0x25 +; CHECK-NEXT: .byte 40 @ 0x28 +; CHECK-NEXT: .byte 43 @ 0x2b +; CHECK-NEXT: .byte 46 @ 0x2e +; CHECK-NEXT: .LCPI13_1: +; CHECK-NEXT: .byte 0 @ 0x0 +; CHECK-NEXT: .byte 3 @ 0x3 +; CHECK-NEXT: .byte 6 @ 0x6 +; CHECK-NEXT: .byte 9 @ 0x9 +; CHECK-NEXT: .byte 12 @ 0xc +; CHECK-NEXT: .byte 15 @ 0xf +; CHECK-NEXT: .byte 18 @ 0x12 +; CHECK-NEXT: .byte 21 @ 0x15 +; CHECK-NEXT: .byte 24 @ 0x18 +; CHECK-NEXT: .byte 27 @ 0x1b +; CHECK-NEXT: .byte 30 @ 0x1e +; CHECK-NEXT: .byte 33 @ 0x21 +; CHECK-NEXT: .byte 36 @ 0x24 +; CHECK-NEXT: .byte 39 @ 0x27 +; CHECK-NEXT: .byte 42 @ 0x2a +; CHECK-NEXT: .byte 45 @ 0x2d +; CHECK-NEXT: .LCPI13_2: +; CHECK-NEXT: .byte 2 @ 0x2 +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +vector.ph: + br label %vector.body + +vector.body: + %pointer.phi = phi i8* [ %x, %vector.ph ], [ %v3, %vector.body ] + %pointer.phi55 = phi i8* [ %z, %vector.ph ], [ %v4, %vector.body ] + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vector.gep = getelementptr i8, i8* %pointer.phi, <16 x i8> + %v3 = getelementptr i8, i8* %pointer.phi, i32 48 + %vector.gep56 = getelementptr i8, i8* %pointer.phi55, <16 x i8> + %v4 = getelementptr i8, i8* %pointer.phi55, i32 48 + %v5 = add i32 %index, 0 + %v6 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 1 + %wide.masked.gather = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %vector.gep, i32 1, <16 x i1> , <16 x i8> undef) + %v7 = getelementptr inbounds i8, <16 x i8*> %vector.gep, i8 2 + %wide.masked.gather57 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v6, i32 1, <16 x i1> , <16 x i8> undef) + %wide.masked.gather58 = call <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*> %v7, i32 1, <16 x i1> , <16 x i8> undef) + %v11 = mul nuw nsw <16 x i8> %wide.masked.gather, + %v13 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather57 + %v15 = mul nuw nsw <16 x i8> %wide.masked.gather, %wide.masked.gather58 + %v17 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 1 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v11, <16 x i8*> %vector.gep56, i32 1, <16 x i1> ) + %v18 = getelementptr inbounds i8, <16 x i8*> %vector.gep56, i32 2 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v13, <16 x i8*> %v17, i32 1, <16 x i1> ) + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %v15, <16 x i8*> %v18, i32 1, <16 x i1> ) + %index.next = add i32 %index, 4 + %v37 = icmp eq i32 %index.next, %n + br i1 %v37, label %end, label %vector.body + +end: + ret void; +} + +declare <4 x i8> @llvm.masked.gather.v4i8.v4p0i8(<4 x i8*>, i32, <4 x i1>, <4 x i8>) +declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x 
i32*>, i32, <4 x i1>, <4 x i32>) +declare <4 x float> @llvm.masked.gather.v4f32.v4p0f32(<4 x float*>, i32, <4 x i1>, <4 x float>) +declare <8 x i16> @llvm.masked.gather.v8i16.v8p0i16(<8 x i16*>, i32, <8 x i1>, <8 x i16>) +declare <8 x half> @llvm.masked.gather.v8f16.v8p0f16(<8 x half*>, i32, <8 x i1>, <8 x half>) +declare <16 x i8> @llvm.masked.gather.v16i8.v16p0i8(<16 x i8*>, i32, <16 x i1>, <16 x i8>) + +declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v4f32.v4p0f32(<4 x float>, <4 x float*>, i32, <4 x i1>) +declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) +declare void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8>, <16 x i8*>, i32, <16 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll @@ -177,5 +177,75 @@ ret void } +define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrh.s32 q1, [r1, #8] +; CHECK-NEXT: vldrh.s32 q3, [r1] +; CHECK-NEXT: vmov.i32 q2, #0x28 +; CHECK-NEXT: vmov.u16 r1, q0[0] +; CHECK-NEXT: vshl.i32 q1, q1, #1 +; CHECK-NEXT: vshl.i32 q3, q3, #1 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q2, q3, q2 +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u16 r1, q0[1] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u16 r1, q0[2] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u16 r1, q0[3] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u16 r1, q0[4] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u16 r1, q0[5] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u16 r1, q0[6] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u16 r1, q0[7] +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> %offs + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + call void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @scaled_v8i16_i16_2gep2(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { +; CHECK-LABEL: scaled_v8i16_i16_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI9_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI9_0: +; CHECK-NEXT: .short 20 @ 0x14 +; CHECK-NEXT: .short 23 @ 0x17 +; CHECK-NEXT: .short 26 @ 0x1a +; CHECK-NEXT: .short 29 @ 0x1d +; CHECK-NEXT: .short 32 @ 0x20 +; CHECK-NEXT: .short 35 @ 0x23 +; CHECK-NEXT: .short 38 @ 0x26 +; CHECK-NEXT: .short 41 @ 0x29 +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <8 x i16> + %ptrs2 = getelementptr inbounds i16, <8 x i16*> %ptrs, i16 20 + call void 
@llvm.masked.scatter.v8i16.v8p0i16(<8 x i16> %input, <8 x i16*> %ptrs2, i32 2, <8 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v8i16.v8p0i16(<8 x i16>, <8 x i16*>, i32, <8 x i1>) declare void @llvm.masked.scatter.v8f16.v8p0f16(<8 x half>, <8 x half*>, i32, <8 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -233,6 +233,58 @@ ret void } +define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_scaled_i16_i32_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vldrw.u32 q2, [r1] +; CHECK-NEXT: vmov.i32 q1, #0xa +; CHECK-NEXT: vmov r1, s0 +; CHECK-NEXT: vshl.i32 q2, q2, #1 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q1, q2, q1 +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov r1, s1 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov r1, s2 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov r1, s3 +; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: bx lr +entry: + %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs + %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5 + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep2(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) { +; CHECK-LABEL: ext_scaled_i16_i32_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI16_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI16_0: +; CHECK-NEXT: .long 5 @ 0x5 +; CHECK-NEXT: .long 8 @ 0x8 +; CHECK-NEXT: .long 11 @ 0xb +; CHECK-NEXT: .long 14 @ 0xe +entry: + %ptrs = getelementptr inbounds i16, i16* %base, <4 x i16> + %ptrs2 = getelementptr inbounds i16, <4 x i16*> %ptrs, i16 5 + %t = trunc <4 x i32> %input to <4 x i16> + call void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16> %t, <4 x i16*> %ptrs2, i32 2, <4 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v4i8.v4p0i8(<4 x i8>, <4 x i8*>, i32, <4 x i1>) declare void @llvm.masked.scatter.v4i16.v4p0i16(<4 x i16>, <4 x i16*>, i32, <4 x i1>) declare void @llvm.masked.scatter.v4f16.v4p0f16(<4 x half>, <4 x half*>, i32, <4 x i1>) diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll --- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -534,6 +534,115 @@ ret void } +define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vldrb.s32 q1, [r1, #12] +; CHECK-NEXT: vldrb.s32 q2, [r1, #8] +; CHECK-NEXT: vldrb.s32 q3, [r1, #4] +; CHECK-NEXT: vldrb.s32 q5, [r1] +; CHECK-NEXT: vmov.i32 q4, #0x5 +; CHECK-NEXT: vadd.i32 q1, q1, r0 +; CHECK-NEXT: vadd.i32 q2, q2, r0 +; CHECK-NEXT: vadd.i32 q3, q3, r0 +; CHECK-NEXT: vadd.i32 q5, q5, r0 +; CHECK-NEXT: vadd.i32 q1, q1, q4 +; CHECK-NEXT: 
vadd.i32 q2, q2, q4 +; CHECK-NEXT: vadd.i32 q3, q3, q4 +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vmov.u8 r1, q0[0] +; CHECK-NEXT: vmov r0, s16 +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s17 +; CHECK-NEXT: vmov.u8 r1, q0[1] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s18 +; CHECK-NEXT: vmov.u8 r1, q0[2] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s19 +; CHECK-NEXT: vmov.u8 r1, q0[3] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s12 +; CHECK-NEXT: vmov.u8 r1, q0[4] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s13 +; CHECK-NEXT: vmov.u8 r1, q0[5] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s14 +; CHECK-NEXT: vmov.u8 r1, q0[6] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s15 +; CHECK-NEXT: vmov.u8 r1, q0[7] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s8 +; CHECK-NEXT: vmov.u8 r1, q0[8] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s9 +; CHECK-NEXT: vmov.u8 r1, q0[9] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s10 +; CHECK-NEXT: vmov.u8 r1, q0[10] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s11 +; CHECK-NEXT: vmov.u8 r1, q0[11] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s4 +; CHECK-NEXT: vmov.u8 r1, q0[12] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s5 +; CHECK-NEXT: vmov.u8 r1, q0[13] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s6 +; CHECK-NEXT: vmov.u8 r1, q0[14] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vmov r0, s7 +; CHECK-NEXT: vmov.u8 r1, q0[15] +; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: bx lr +entry: + %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> %offs + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> ) + ret void +} + +define arm_aapcs_vfpcc void @unscaled_v16i8_i8_2gep2(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { +; CHECK-LABEL: unscaled_v16i8_i8_2gep2: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: adr r1, .LCPI11_0 +; CHECK-NEXT: vldrw.u32 q1, [r1] +; CHECK-NEXT: vstrb.8 q0, [r0, q1] +; CHECK-NEXT: bx lr +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: @ %bb.1: +; CHECK-NEXT: .LCPI11_0: +; CHECK-NEXT: .byte 5 @ 0x5 +; CHECK-NEXT: .byte 8 @ 0x8 +; CHECK-NEXT: .byte 11 @ 0xb +; CHECK-NEXT: .byte 14 @ 0xe +; CHECK-NEXT: .byte 17 @ 0x11 +; CHECK-NEXT: .byte 20 @ 0x14 +; CHECK-NEXT: .byte 23 @ 0x17 +; CHECK-NEXT: .byte 26 @ 0x1a +; CHECK-NEXT: .byte 29 @ 0x1d +; CHECK-NEXT: .byte 32 @ 0x20 +; CHECK-NEXT: .byte 35 @ 0x23 +; CHECK-NEXT: .byte 38 @ 0x26 +; CHECK-NEXT: .byte 41 @ 0x29 +; CHECK-NEXT: .byte 44 @ 0x2c +; CHECK-NEXT: .byte 47 @ 0x2f +; CHECK-NEXT: .byte 50 @ 0x32 +entry: + %ptrs = getelementptr inbounds i8, i8* %base, <16 x i8> + %ptrs2 = getelementptr inbounds i8, <16 x i8*> %ptrs, i8 5 + call void @llvm.masked.scatter.v16i8.v16p0i8(<16 x i8> %input, <16 x i8*> %ptrs2, i32 1, <16 x i1> ) + ret void +} + declare void @llvm.masked.scatter.v2i8.v2p0i8(<2 x i8>, <2 x i8*>, i32, <2 x i1>) declare void @llvm.masked.scatter.v8i8.v8p0i8(<8 x i8>, <8 x i8*>, i32, <8 x i1>)