Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -102,22 +102,11 @@
                                   Instruction *&Root, IRBuilder<> &Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
-                                   IRBuilder<> &Builder,
-                                   unsigned Increment = 0);
-  // Create a gather from a vector of pointers
+                                   IRBuilder<> &Builder, int64_t Increment = 0);
+  // Create an incrementing gather from a vector of pointers
   Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
                                      IRBuilder<> &Builder,
-                                     unsigned Increment = 0);
-  // QI gathers can increment their offsets on their own if the increment is
-  // a constant value (digit)
-  Value *tryCreateIncrementingGather(IntrinsicInst *I, Value *BasePtr,
-                                     Value *Ptr, GetElementPtrInst *GEP,
-                                     IRBuilder<> &Builder);
-  // QI gathers can increment their offsets on their own if the increment is
-  // a constant value (digit) - this creates a writeback QI gather
-  Value *tryCreateIncrementingWBGather(IntrinsicInst *I, Value *BasePtr,
-                                       Value *Ptr, unsigned TypeScale,
-                                       IRBuilder<> &Builder);
+                                     int64_t Increment = 0);
   Value *lowerScatter(IntrinsicInst *I);
   // Create a scatter to a base + vector of offsets
   Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
@@ -125,8 +114,24 @@
                                       IRBuilder<> &Builder);
   // Create a scatter to a vector of pointers
   Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
-                                    IRBuilder<> &Builder);
-
+                                    IRBuilder<> &Builder,
+                                    int64_t Increment = 0);
+  // Create an incrementing scatter from a vector of pointers
+  Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
+                                      IRBuilder<> &Builder,
+                                      int64_t Increment = 0);
+
+  // QI gathers and scatters can increment their offsets on their own if
+  // the increment is a constant value (digit)
+  Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
+                                      Value *Ptr, GetElementPtrInst *GEP,
+                                      IRBuilder<> &Builder);
+  // QI gathers/scatters can increment their offsets on their own if the
+  // increment is a constant value (digit) - this creates a writeback QI
+  // gather/scatter
+  Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
+                                        Value *Ptr, unsigned TypeScale,
+                                        IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
   // Pushes the given add out of the loop
@@ -342,7 +347,7 @@
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
                                                            Value *Ptr,
                                                            IRBuilder<> &Builder,
-                                                           unsigned Increment) {
+                                                           int64_t Increment) {
   using namespace PatternMatch;
   auto *Ty = cast<VectorType>(I->getType());
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -362,7 +367,7 @@
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, unsigned Increment) {
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
   using namespace PatternMatch;
   auto *Ty = cast<VectorType>(I->getType());
   LLVM_DEBUG(
@@ -426,8 +431,7 @@
     return nullptr;
   // Check whether the offset is a constant increment that could be merged into
   // a QI gather
-  Value *Load =
-      tryCreateIncrementingGather(I, BasePtr, Offsets, GEP, Builder);
+  Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
   if (Load)
     return Load;
 
@@ -453,19 +457,165 @@
        Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
-Value *MVEGatherScatterLowering::tryCreateIncrementingGather(
+Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+  using namespace PatternMatch;
+  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+  // Attempt to turn the masked scatter in I into a MVE intrinsic
+  // Potentially optimising the addressing modes as we do so.
+  Value *Input = I->getArgOperand(0);
+  Value *Ptr = I->getArgOperand(1);
+  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
+  auto *Ty = cast<VectorType>(Input->getType());
+
+  if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+                               Alignment))
+    return nullptr;
+
+  lookThroughBitcast(Ptr);
+  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+  Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+  if (!Store)
+    return nullptr;
+
+  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+  I->eraseFromParent();
+  return Store;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  auto *Ty = cast<VectorType>(Input->getType());
+  // Only QR variants allow truncating
+  if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+    // Can't build an intrinsic for this
+    return nullptr;
+  }
+  Value *Mask = I->getArgOperand(3);
+  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(Increment), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  auto *Ty = cast<VectorType>(Input->getType());
+  LLVM_DEBUG(
+      dbgs()
+      << "masked scatters: storing to a vector of pointers with writeback\n");
+  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    // Can't build an intrinsic for this
+    return nullptr;
+  Value *Mask = I->getArgOperand(3);
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(Increment), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_wb_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *InputTy = Input->getType();
+  Type *MemoryTy = InputTy;
+  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+                    << " to base + vector of offsets\n");
+  // If the input has been truncated, try to integrate that trunc into the
+  // scatter instruction (we don't care about alignment here)
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+    Value *PreTrunc = Trunc->getOperand(0);
+    Type *PreTruncTy = PreTrunc->getType();
+    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+      Input = PreTrunc;
+      InputTy = PreTruncTy;
+    }
+  }
+  if (InputTy->getPrimitiveSizeInBits() != 128) {
+    LLVM_DEBUG(
+        dbgs() << "masked scatters: cannot create scatters for non-standard"
+               << " input types. Expanding.\n");
+    return nullptr;
+  }
+
+  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+  Value *Offsets;
+  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+  if (!BasePtr)
+    return nullptr;
+  // Check whether the offset is a constant increment that could be merged into
+  // a QI gather
+  Value *Store =
+      tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+  if (Store)
+    return Store;
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      MemoryTy->getScalarSizeInBits());
+  if (Scale == -1)
+    return nullptr;
+
+  if (!match(Mask, m_One()))
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+        {BasePtr->getType(), Offsets->getType(), Input->getType(),
+         Mask->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Mask});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset,
+        {BasePtr->getType(), Offsets->getType(), Input->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale)});
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
     IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
     IRBuilder<> &Builder) {
-  auto *Ty = cast<VectorType>(I->getType());
+  VectorType *Ty;
+  if (I->getIntrinsicID() == Intrinsic::masked_gather)
+    Ty = cast<VectorType>(I->getType());
+  else
+    Ty = cast<VectorType>(I->getArgOperand(0)->getType());
   // Incrementing gathers only exist for v4i32
-  if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+  if (Ty->getNumElements() != 4 ||
+      Ty->getScalarSizeInBits() != 32)
     return nullptr;
   Loop *L = LI->getLoopFor(I->getParent());
   if (L == nullptr)
     // Incrementing gathers are not beneficial outside of a loop
     return nullptr;
-  LLVM_DEBUG(
-      dbgs() << "masked gathers: trying to build incrementing wb gather\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+                       "wb gather/scatter\n");
   // The gep was in charge of making sure the offsets are scaled correctly
   // - calculate that factor so it can be applied by hand
@@ -482,12 +632,12 @@
     // change the phi which does affect other users of the gep (which will still
     // be using the phi in the old way)
     Value *Load =
-        tryCreateIncrementingWBGather(I, BasePtr, Offsets, TypeScale, Builder);
+        tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder);
     if (Load != nullptr)
       return Load;
   }
-  LLVM_DEBUG(
-      dbgs() << "masked gathers: trying to build incrementing non-wb gather\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+                       "non-wb gather/scatter\n");
   std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
   if (Add.first == nullptr)
     return nullptr;
@@ -510,11 +660,15 @@
               cast<VectorType>(ScaledOffsets->getType())->getElementType())),
       "StartIndex", I);
 
-  return cast<IntrinsicInst>(
-      tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+  if (I->getIntrinsicID() == Intrinsic::masked_gather)
+    return cast<IntrinsicInst>(
+        tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+  else
+    return cast<IntrinsicInst>(
+        tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate));
 }
 
-Value *MVEGatherScatterLowering::tryCreateIncrementingWBGather(
+Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
     IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
     IRBuilder<> &Builder) {
   // Check whether this gather's offset is incremented by a constant - if so,
@@ -574,132 +728,26 @@
   Builder.SetInsertPoint(I);
 
-  // Build the incrementing gather
-  Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
-
-  // One value to be handed to whoever uses the gather, one is the loop
-  // increment
-  Value *ExtractedLoad = Builder.CreateExtractValue(Load, 0, "Gather");
-  Value *Inc = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  Value *EndResult;
+  Value *NewInduction;
+  if (I->getIntrinsicID() == Intrinsic::masked_gather) {
+    // Build the incrementing gather
+    Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
+    // One value to be handed to whoever uses the gather, one is the loop
+    // increment
+    EndResult = Builder.CreateExtractValue(Load, 0, "Gather");
+    NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  } else {
+    // Build the incrementing scatter
+    NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
+    EndResult = NewInduction;
+  }
 
   Instruction *AddInst = cast<Instruction>(Offsets);
-  AddInst->replaceAllUsesWith(Inc);
+  AddInst->replaceAllUsesWith(NewInduction);
   AddInst->eraseFromParent();
-  Phi->setIncomingValue(IncrementIndex, Inc);
-
-  return ExtractedLoad;
-}
-
-Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
-  using namespace PatternMatch;
-  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
-
-  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
-  // Attempt to turn the masked scatter in I into a MVE intrinsic
-  // Potentially optimising the addressing modes as we do so.
-  Value *Input = I->getArgOperand(0);
-  Value *Ptr = I->getArgOperand(1);
-  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
-  auto *Ty = cast<VectorType>(Input->getType());
-
-  if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
-                               Alignment))
-    return nullptr;
-
-  lookThroughBitcast(Ptr);
-  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
-
-  IRBuilder<> Builder(I->getContext());
-  Builder.SetInsertPoint(I);
-  Builder.SetCurrentDebugLocation(I->getDebugLoc());
-
-  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
-  if (!Store)
-    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
-  if (!Store)
-    return nullptr;
-
-  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
-  I->replaceAllUsesWith(Store);
-  I->eraseFromParent();
-  return Store;
-}
-
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
-  using namespace PatternMatch;
-  Value *Input = I->getArgOperand(0);
-  Value *Mask = I->getArgOperand(3);
-  auto *Ty = cast<VectorType>(Input->getType());
-  // Only QR variants allow truncating
-  if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
-    // Can't build an intrinsic for this
-    return nullptr;
-  }
-  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
-  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
-  if (match(Mask, m_One()))
-    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
-                                   {Ptr->getType(), Input->getType()},
-                                   {Ptr, Builder.getInt32(0), Input});
-  else
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_base_predicated,
-        {Ptr->getType(), Input->getType(), Mask->getType()},
-        {Ptr, Builder.getInt32(0), Input, Mask});
-}
-
-Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
-  using namespace PatternMatch;
-  Value *Input = I->getArgOperand(0);
-  Value *Mask = I->getArgOperand(3);
-  Type *InputTy = Input->getType();
-  Type *MemoryTy = InputTy;
-  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
-                    << " to base + vector of offsets\n");
-  // If the input has been truncated, try to integrate that trunc into the
-  // scatter instruction (we don't care about alignment here)
-  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
-    Value *PreTrunc = Trunc->getOperand(0);
-    Type *PreTruncTy = PreTrunc->getType();
-    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
-      Input = PreTrunc;
-      InputTy = PreTruncTy;
-    }
-  }
-  if (InputTy->getPrimitiveSizeInBits() != 128) {
-    LLVM_DEBUG(
-        dbgs() << "masked scatters: cannot create scatters for non-standard"
-               << " input types. Expanding.\n");
-    return nullptr;
-  }
-
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
-  Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
-  if (!BasePtr)
-    return nullptr;
-  int Scale = computeScale(
-      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
-      MemoryTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
+  Phi->setIncomingValue(IncrementIndex, NewInduction);
 
-  if (!match(Mask, m_One()))
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
-        {BasePtr->getType(), Offsets->getType(), Input->getType(),
-         Mask->getType()},
-        {BasePtr, Offsets, Input,
-         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Mask});
-  else
-    return Builder.CreateIntrinsic(
-        Intrinsic::arm_mve_vstr_scatter_offset,
-        {BasePtr->getType(), Offsets->getType(), Input->getType()},
-        {BasePtr, Offsets, Input,
-         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
-         Builder.getInt32(Scale)});
+  return EndResult;
 }
 
 void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-optimisation.ll
@@ -153,23 +153,22 @@
 ; CHECK-LABEL: push_out_mul_scatter:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r1, .LCPI3_0
-; CHECK-NEXT:    vmov.i32 q1, #0x18
-; CHECK-NEXT:    vldrw.u32 q2, [r1]
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:  .LBB3_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q0, [r0, q2, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q2, q2, q1
+; CHECK-NEXT:    vstrw.32 q0, [q1, #96]!
 ; CHECK-NEXT:    bne .LBB3_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
+; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
+; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
                                     i32* noalias nocapture %dst, i32 %n.vec,
                                     <4 x i32> %to.store) {
@@ -196,23 +195,22 @@
 ; CHECK-LABEL: push_out_add_scatter:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r1, .LCPI4_0
-; CHECK-NEXT:    vmov.i32 q2, #0x8
 ; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
 ; CHECK-NEXT:  .LBB4_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vstrw.32 q0, [r0, q1, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q1, q1, q2
+; CHECK-NEXT:    vstrw.32 q0, [q1, #32]!
 ; CHECK-NEXT:    bne .LBB4_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI4_0:
-; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 16 @ 0x10
                                     i32* noalias nocapture %dst, i32 %n.vec,
                                     <4 x i32> %to.store) {
Index: llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -141,73 +141,64 @@
 ; CHECK-NEXT:    push {r4, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #88
-; CHECK-NEXT:    sub sp, #88
+; CHECK-NEXT:    .pad #24
+; CHECK-NEXT:    sub sp, #24
 ; CHECK-NEXT:    cmp r1, #1
-; CHECK-NEXT:    vstrw.32 q2, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT:    vstrw.32 q1, [sp, #48] @ 16-byte Spill
 ; CHECK-NEXT:    blt .LBB3_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
-; CHECK-NEXT:    adr.w lr, .LCPI3_1
+; CHECK-NEXT:    adr r4, .LCPI3_2
 ; CHECK-NEXT:    bic r2, r1, #3
-; CHECK-NEXT:    vldrw.u32 q1, [lr]
+; CHECK-NEXT:    vldrw.u32 q3, [r4]
 ; CHECK-NEXT:    sub.w r12, r2, #4
-; CHECK-NEXT:    adr r4, .LCPI3_2
+; CHECK-NEXT:    adr.w lr, .LCPI3_1
 ; CHECK-NEXT:    movs r3, #1
-; CHECK-NEXT:    vstrw.32 q1, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r4]
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:    add.w r3, r3, r12, lsr #2
+; CHECK-NEXT:    vstrw.32 q3, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vldrw.u32 q3, [lr]
 ; CHECK-NEXT:    adr.w r12, .LCPI3_0
-; CHECK-NEXT:    vstrw.32 q1, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q1, [r12]
-; CHECK-NEXT:    vmov.i32 q2, #0xc
-; CHECK-NEXT:    vstrw.32 q1, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q4, q3, r0
+; CHECK-NEXT:    vldrw.u32 q3, [r12]
+; CHECK-NEXT:    vadd.i32 q3, q3, r0
 ; CHECK-NEXT:  .LBB3_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT:    dls lr, r3
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vmov q6, q4
 ; CHECK-NEXT:    vldrw.u32 q7, [sp] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT:    vmov q5, q3
+; CHECK-NEXT:    dls lr, r3
 ; CHECK-NEXT:  .LBB3_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB3_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vstrw.32 q0, [r0, q7, uxtw #2]
-; CHECK-NEXT:    vstrw.32 q1, [r0, q5, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q3, q5, q2
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #64] @ 16-byte Reload
-; CHECK-NEXT:    vadd.i32 q6, q4, q2
-; CHECK-NEXT:    vadd.i32 q7, q7, q2
-; CHECK-NEXT:    vstrw.32 q5, [r0, q4, uxtw #2]
-; CHECK-NEXT:    vmov q5, q3
-; CHECK-NEXT:    vmov q4, q6
+; CHECK-NEXT:    vstrw.32 q0, [q5, #48]!
+; CHECK-NEXT:    vstrw.32 q1, [q6, #48]!
+; CHECK-NEXT:    vstrw.32 q2, [q7, #48]!
 ; CHECK-NEXT:    le lr, .LBB3_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB3_2 Depth=1
 ; CHECK-NEXT:    cmp r2, r1
 ; CHECK-NEXT:    bne .LBB3_2
 ; CHECK-NEXT:  .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #88
+; CHECK-NEXT:    add sp, #24
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI3_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 9 @ 0x9
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
 ; CHECK-NEXT:  .LCPI3_1:
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
+; CHECK-NEXT:    .long 4294967252 @ 0xffffffd4
+; CHECK-NEXT:    .long 4294967264 @ 0xffffffe0
+; CHECK-NEXT:    .long 4294967276 @ 0xffffffec
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
 ; CHECK-NEXT:  .LCPI3_2:
-; CHECK-NEXT:    .long 1 @ 0x1
-; CHECK-NEXT:    .long 4 @ 0x4
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 4294967256 @ 0xffffffd8
+; CHECK-NEXT:    .long 4294967268 @ 0xffffffe4
+; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
+; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
 entry:
   %cmp22 = icmp sgt i32 %n, 0
   br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
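
Note (not part of the patch): a minimal IR sketch of the pattern tryCreateIncrementingGatScat now targets on the scatter side, assuming a v4i32 scatter inside a loop whose offsets grow by a constant splat each iteration. The function and value names below are made up for illustration; after this change the pass can fold the constant increment into an MVE writeback scatter-base intrinsic (arm_mve_vstr_scatter_base_wb) instead of keeping a separate vector add, which is what the updated CHECK lines above exercise.

; Hypothetical input loop: the offset phi advances by a constant splat, so the
; masked scatter is a candidate for an incrementing (writeback) QI scatter.
define void @scatter_inc_sketch(i32* %base, <4 x i32> %data, i32 %n) {
entry:
  br label %loop

loop:
  %i = phi i32 [ 0, %entry ], [ %i.next, %loop ]
  %offs = phi <4 x i32> [ <i32 0, i32 2, i32 4, i32 6>, %entry ], [ %offs.next, %loop ]
  %ptrs = getelementptr inbounds i32, i32* %base, <4 x i32> %offs
  call void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32> %data, <4 x i32*> %ptrs, i32 4,
                                               <4 x i1> <i1 true, i1 true, i1 true, i1 true>)
  ; constant increment of the offsets: this add is what the pass merges into
  ; the writeback form of the scatter
  %offs.next = add <4 x i32> %offs, <i32 8, i32 8, i32 8, i32 8>
  %i.next = add i32 %i, 4
  %done = icmp eq i32 %i.next, %n
  br i1 %done, label %exit, label %loop

exit:
  ret void
}

declare void @llvm.masked.scatter.v4i32.v4p0i32(<4 x i32>, <4 x i32*>, i32, <4 x i1>)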