Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -91,7 +91,12 @@
                            Instruction *&Root, IRBuilder<> &Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
-                                   IRBuilder<> &Builder);
+                                   IRBuilder<> &Builder,
+                                   unsigned Increment = 0);
+  // Create a gather from a vector of pointers with writeback
+  Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
+                                     IRBuilder<> &Builder,
+                                     unsigned Increment = 0);
   Value *lowerScatter(IntrinsicInst *I);
   // Create a scatter to a base + vector of offsets
@@ -100,6 +105,10 @@
   // Create a scatter to a vector of pointers
   Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
                                     IRBuilder<> &Builder);
+  // QI gathers can increment their offsets on their own if the increment is
+  // a constant value (immediate)
+  Value *tryCreateMergedGatherIncrement(IntrinsicInst *I, Value *BasePtr,
+                                        Value *Ptr, IRBuilder<> &Builder);
   // Check whether these offsets could be moved out of the loop they're in
   bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
@@ -109,6 +118,8 @@
   void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
                   Value *SecondOperand, unsigned LoopIncrement,
                   IRBuilder<> &Builder);
+  // Returns true if V is a constant or a loop-invariant instruction
+  bool isConstOrLoopInvInst(const Value *V, const Loop *L, const Value *Ignore);
 };
 } // end anonymous namespace
@@ -265,7 +276,8 @@
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
                                                            Value *Ptr,
-                                                           IRBuilder<> &Builder) {
+                                                           IRBuilder<> &Builder,
+                                                           unsigned Increment) {
   using namespace PatternMatch;
   Type *Ty = I->getType();
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
@@ -276,12 +288,34 @@
   if (match(Mask, m_One()))
     return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
                                    {Ty, Ptr->getType()},
-                                   {Ptr, Builder.getInt32(0)});
+                                   {Ptr, Builder.getInt32(Increment)});
   else
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_base_predicated,
         {Ty, Ptr->getType(), Mask->getType()},
-        {Ptr, Builder.getInt32(0), Mask});
+        {Ptr, Builder.getInt32(Increment), Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, unsigned Increment) {
+  using namespace PatternMatch;
+  Type *Ty = I->getType();
+  LLVM_DEBUG(
+      dbgs()
+      << "masked gathers: loading from vector of pointers with writeback\n");
+  if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    // Can't build an intrinsic for this
+    return nullptr;
+  Value *Mask = I->getArgOperand(2);
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb,
+                                   {Ty, Ptr->getType()},
+                                   {Ptr, Builder.getInt32(Increment)});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vldr_gather_base_wb_predicated,
+        {Ty, Ptr->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(Increment), Mask});
 }
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
@@ -324,6 +358,11 @@
   Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder);
   if (!BasePtr)
     return nullptr;
+  // Check whether the offset is a constant increment that could be merged into
+  // a QI gather
+  Value *Load = tryCreateMergedGatherIncrement(I, BasePtr, Offsets, Builder);
+  if (Load)
+    return Load;
   int Scale = computeScale(
       BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
@@ -720,6 +759,174 @@
   return true;
 }
 
+Value *MVEGatherScatterLowering::tryCreateMergedGatherIncrement(
+    IntrinsicInst *I, Value *BasePtr, Value *Offsets, IRBuilder<> &Builder) {
+  // Check whether this gather's offset is incremented by a constant - if so,
+  // and the load is of the right type, we can merge this into a QI gather
+  Type *Ty = I->getType();
+  // Incrementing gathers only exist for v4i32
+  if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+    return nullptr;
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+  if (LI.getLoopFor(I->getParent()) == nullptr)
+    // Incrementing gathers are not beneficial outside of a loop
+    return nullptr;
+  LLVM_DEBUG(dbgs() << "masked gathers: try to merge increment and gather\n");
+  DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout();
+
+  // Offsets that are worth merging into this instruction will be incremented
+  // by a constant, thus we're looking for an add of something (most probably a
+  // phi) and a constant
+  PHINode *Phi = nullptr;
+  if (isa<PHINode>(Offsets))
+    Phi = cast<PHINode>(Offsets);
+  unsigned IncrementIndex;
+
+  if (Phi != nullptr) {
+    // Look through the phi to the phi increment
+    IncrementIndex = Phi->getIncomingBlock(0) == Phi->getParent() ? 0 : 1;
+    Offsets = Phi->getIncomingValue(IncrementIndex);
+  }
+  if (!isa<Instruction>(Offsets))
+    // Offsets are a vector of constants - there is nothing to write back to
+    return nullptr;
+
+  // At this point, the instruction we're looking at must be an add, or we
+  // bail out
+  Instruction *Add = cast<Instruction>(Offsets);
+  if (Add->getOpcode() != Instruction::Add)
+    return nullptr;
+
+  // The gep was in charge of making sure the offsets are scaled correctly
+  // - calculate that factor so it can be applied by hand
+  int TypeScale = 0;
+  for (User *U : (Phi == nullptr ? Add->users() : Phi->users())) {
+    if (isa<GetElementPtrInst>(U)) {
+      TypeScale = computeScale(
+          DT.getTypeSizeInBits(
+              cast<GetElementPtrInst>(U)->getOperand(0)->getType()),
+          DT.getTypeSizeInBits(cast<GetElementPtrInst>(U)->getType()) /
+              cast<GetElementPtrInst>(U)->getType()->getVectorNumElements());
+    }
+  }
+
+  if (TypeScale == -1)
+    return nullptr;
+
+  Value *C = nullptr;
+  Value *OffsetsIncoming;
+  // Find out which operand is the constant increment and which one holds the
+  // incoming offsets
+  if (isConstOrLoopInvInst(Add->getOperand(0), LI.getLoopFor(I->getParent()),
+                           nullptr)) {
+    C = Add->getOperand(0);
+    OffsetsIncoming = Add->getOperand(1);
+  } else if (isConstOrLoopInvInst(Add->getOperand(1),
+                                  LI.getLoopFor(I->getParent()), nullptr)) {
+    OffsetsIncoming = Add->getOperand(0);
+    C = Add->getOperand(1);
+  } else {
+    // Neither operand is loop invariant, so there is no constant increment
+    return nullptr;
+  }
+
+  int Const;
+  // The increment can only be merged with the gather if C is a constant
+  // or a combination of two constants
+  if (!isa<Constant>(C)) {
+    if (!isa<Instruction>(C))
+      return nullptr;
+    Instruction *Inst = cast<Instruction>(C);
+    if (isa<Constant>(Inst->getOperand(0)) &&
+        isa<Constant>(Inst->getOperand(1))) {
+      int Left = cast<Constant>(Inst->getOperand(0))
+                     ->getUniqueInteger()
+                     .getZExtValue();
+      int Right = cast<Constant>(Inst->getOperand(1))
+                      ->getUniqueInteger()
+                      .getZExtValue();
+      // Only take the number; we don't need to construct an instruction
+      if (Inst->getOpcode() == Instruction::Add)
+        Const = Left + Right;
+      else if (Inst->getOpcode() == Instruction::Mul)
+        Const = Left * Right;
+      else
+        return nullptr;
+    } else
+      return nullptr;
+  } else {
+    Const = cast<Constant>(C)->getUniqueInteger().getZExtValue();
+  }
+  // Check that the constant is small enough for an incrementing gather
+  if (Const > 126)
+    return nullptr;
+
+  // If the right or left operand of the add is a phi node, and Offsets itself
+  // is not used by that phi node (i.e., it is not the immediate loop
+  // increment), the gather does not need to write back
+  PHINode *P;
+  if ((P = dyn_cast<PHINode>(Add->getOperand(0))) != nullptr ||
+      (P = dyn_cast<PHINode>(Add->getOperand(1))) != nullptr) {
+    if (!(P->getIncomingValue(0) == Add || P->getIncomingValue(1) == Add)) {
+      // No loop increment, so no writeback
+      return cast<IntrinsicInst>(
+          tryCreateMaskedGatherBase(I, P, Builder, Const << TypeScale));
+    }
+  }
+  if (Phi == nullptr) {
+    // If no phi node has been found, there's nothing to write back to -
+    // so either the former condition was true and we built a normal
+    // incrementing gather, or we bail out
+    return nullptr;
+  }
+
+  unsigned Incoming = IncrementIndex == 0 ? 1 : 0;
+  Builder.SetInsertPoint(&Phi->getIncomingBlock(Incoming)->back());
+
+  // Make sure the offsets are scaled correctly
+  Instruction *AlignOffsets = BinaryOperator::Create(
+      Instruction::Shl, Phi->getIncomingValue(Incoming),
+      Builder.CreateVectorSplat(
+          OffsetsIncoming->getType()->getVectorNumElements(),
+          Builder.getInt32(TypeScale)),
+      "AlignedIndex", &Phi->getIncomingBlock(Incoming)->back());
+  Phi->setIncomingValue(Incoming, AlignOffsets);
+  // Add the base to the offsets
+  OffsetsIncoming = BinaryOperator::Create(
+      Instruction::Add, Phi->getIncomingValue(Incoming),
+      Builder.CreateVectorSplat(
+          AlignOffsets->getType()->getVectorNumElements(),
+          Builder.CreatePtrToInt(
+              BasePtr, AlignOffsets->getType()->getVectorElementType())),
+      "StartIndex", &Phi->getIncomingBlock(Incoming)->back());
+  // The gather is pre-incrementing
+  OffsetsIncoming = BinaryOperator::Create(
+      Instruction::Sub, OffsetsIncoming,
+      Builder.CreateVectorSplat(
+          OffsetsIncoming->getType()->getVectorNumElements(),
+          Builder.getInt32(Const << TypeScale)),
+      "PreIncrementStartIndex", &Phi->getIncomingBlock(Incoming)->back());
+  Phi->setIncomingValue(Incoming, OffsetsIncoming);
+
+  Builder.SetInsertPoint(I);
+
+  // Build the incrementing gather
+  IntrinsicInst *Load;
+  if (Phi == nullptr)
+    Load = cast<IntrinsicInst>(tryCreateMaskedGatherBaseWB(
+        I, OffsetsIncoming, Builder, Const << TypeScale));
+  else
+    Load = cast<IntrinsicInst>(
+        tryCreateMaskedGatherBaseWB(I, Phi, Builder, Const << TypeScale));
+
+  // One value to be handed to whoever uses the gather, one is the loop
+  // increment
+  Value *ExtractedLoad = Builder.CreateExtractValue(Load, 0, "Gather");
+  Value *Inc = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+  Add->replaceAllUsesWith(Inc);
+  Add->eraseFromParent();
+  if (Phi != nullptr)
+    Phi->setIncomingValue(IncrementIndex, Inc);
+
+  return ExtractedLoad;
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -730,6 +937,7 @@
     return false;
   SmallVector<IntrinsicInst *, 4> Gathers;
   SmallVector<IntrinsicInst *, 4> Scatters;
+  LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
   for (BasicBlock &BB : F) {
@@ -751,6 +959,7 @@
     Value *L = lowerGather(I);
     if (L == nullptr)
       continue;
+    // Get rid of any now dead instructions
     SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
     Changed = true;
@@ -764,9 +973,31 @@
     Value *S = lowerScatter(I);
     if (S == nullptr)
       continue;
+    // Get rid of any now dead instructions
     SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
     Changed = true;
   }
   return Changed;
 }
+
+bool MVEGatherScatterLowering::isConstOrLoopInvInst(const Value *V, const Loop *L,
+                                                    const Value *Ignore) {
+  if (isa<Constant>(V) || V == Ignore) {
+    return true;
+  } else if (!isa<Instruction>(V)) {
+    return true;
+  }
+  const Instruction *I = cast<Instruction>(V);
+  if (isa<ZExtInst>(I) || isa<SExtInst>(I) || isa<TruncInst>(I)) {
+    return isConstOrLoopInvInst(I->getOperand(0), L, Ignore);
+  } else if (I->getOpcode() == Instruction::Add ||
+             I->getOpcode() == Instruction::Mul) {
+    for (Value *O : I->operands()) {
+      if (!isConstOrLoopInvInst(O, L, Ignore))
+        return false;
+    }
+    return true;
+  }
+  return !L->contains(I);
+}
Index: llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -226,24 +226,23 @@
 ; CHECK-LABEL: gather_pre_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r3, .LCPI6_0
-; CHECK-NEXT:    vmov.i32 q1, #0x18
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB6_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    bne .LBB6_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI6_0:
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
-; CHECK-NEXT:    .long 24 @ 0x18
+; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
+; CHECK-NEXT:    .long 0 @ 0x0
 vector.ph:                                        ; preds = %for.body.preheader
   %ind.end = shl i32 %n.vec, 1
   br label %vector.body
@@ -271,24 +270,23 @@
 ; CHECK-LABEL: gather_post_inc:
 ; CHECK:       @ %bb.0: @ %vector.ph41
 ; CHECK-NEXT:    adr r3, .LCPI7_0
-; CHECK-NEXT:    vmov.i32 q1, #0x18
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB7_1: @ %vector.body39
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    bne .LBB7_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI7_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
+; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
+; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 vector.ph41:                                      ; preds = %for.body6.preheader
   %ind.end47 = shl i32 %n.vec43, 1
   br label %vector.body39
@@ -320,24 +318,23 @@
 ; CHECK-NEXT:    it lt
 ; CHECK-NEXT:    poplt {r4, pc}
 ; CHECK-NEXT:    bic r12, r2, #3
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    sub.w r3, r12, #4
-; CHECK-NEXT:    vmov.i32 q1, #0x4
-; CHECK-NEXT:    add.w r4, r4, r3, lsr #2
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    sub.w lr, r12, #4
+; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
 ; CHECK-NEXT:    adr r3, .LCPI8_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB8_1: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB8_2 Depth 2
-; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    vmov q2, q0
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB8_2: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB8_1 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vldrw.u32 q3, [r0, q2, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q2, q2, q1
-; CHECK-NEXT:    vstrb.8 q3, [r3], #16
+; CHECK-NEXT:    vldrw.u32 q2, [q1, #16]!
+; CHECK-NEXT:    vstrb.8 q2, [r0], #16
 ; CHECK-NEXT:    le lr, .LBB8_2
 ; CHECK-NEXT:  @ %bb.3: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB8_1 Depth=1
@@ -348,10 +345,10 @@
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.5:
 ; CHECK-NEXT:  .LCPI8_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 1 @ 0x1
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 3 @ 0x3
+; CHECK-NEXT:    .long 4294967280 @ 0xfffffff0
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:    .long 4294967292 @ 0xfffffffc
 entry:
   %cmp22 = icmp sgt i32 %n, 0
   br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
@@ -384,78 +381,52 @@
 define arm_aapcs_vfpcc void @gather_inc_v4i32_complex(i32* noalias nocapture readonly %data, i32* noalias nocapture %dst, i32 %n) {
 ; CHECK-LABEL: gather_inc_v4i32_complex:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #48
-; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    .save {r4, lr}
+; CHECK-NEXT:    push {r4, lr}
+; CHECK-NEXT:    .vsave {d8, d9}
+; CHECK-NEXT:    vpush {d8, d9}
 ; CHECK-NEXT:    cmp r2, #1
 ; CHECK-NEXT:    blt .LBB9_5
 ; CHECK-NEXT:  @ %bb.1: @ %vector.ph.preheader
 ; CHECK-NEXT:    bic r12, r2, #3
-; CHECK-NEXT:    movs r4, #1
-; CHECK-NEXT:    sub.w r3, r12, #4
-; CHECK-NEXT:    adr r5, .LCPI9_2
-; CHECK-NEXT:    adr.w lr, .LCPI9_0
-; CHECK-NEXT:    vmov.i32 q3, #0xc
-; CHECK-NEXT:    add.w r4, r4, r3, lsr #2
-; CHECK-NEXT:    adr r3, .LCPI9_1
+; CHECK-NEXT:    movs r3, #1
+; CHECK-NEXT:    sub.w lr, r12, #4
+; CHECK-NEXT:    add.w r4, r3, lr, lsr #2
+; CHECK-NEXT:    adr r3, .LCPI9_0
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [r5]
-; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT:    vldrw.u32 q0, [lr]
-; CHECK-NEXT:    vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB9_2: @ %vector.ph
 ; CHECK-NEXT:    @ =>This Loop Header: Depth=1
 ; CHECK-NEXT:    @ Child Loop BB9_3 Depth 2
-; CHECK-NEXT:    mov r3, r1
-; CHECK-NEXT:    vldrw.u32 q4, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q5, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT:    vldrw.u32 q6, [sp] @ 16-byte Reload
+; CHECK-NEXT:    mov r0, r1
+; CHECK-NEXT:    vmov q2, q0
 ; CHECK-NEXT:    dls lr, r4
+; CHECK-NEXT:    vmov q1, q0
 ; CHECK-NEXT:  .LBB9_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB9_2 Depth=1
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT:    vadd.i32 q7, q6, q3
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q6, uxtw #2]
-; CHECK-NEXT:    vldrw.u32 q6, [r0, q4, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q1, q4, q3
-; CHECK-NEXT:    vldrw.u32 q4, [r0, q5, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q0, q5, q3
-; CHECK-NEXT:    vadd.i32 q2, q6, q2
-; CHECK-NEXT:    vadd.i32 q2, q2, q4
-; CHECK-NEXT:    vmov q4, q1
-; CHECK-NEXT:    vstrb.8 q2, [r3], #16
-; CHECK-NEXT:    vmov q5, q0
-; CHECK-NEXT:    vmov q6, q7
+; CHECK-NEXT:    vldrw.u32 q3, [q1, #48]!
+; CHECK-NEXT:    vldrw.u32 q4, [q2, #4]
+; CHECK-NEXT:    vadd.i32 q3, q4, q3
+; CHECK-NEXT:    vldrw.u32 q4, [q2, #8]
+; CHECK-NEXT:    vadd.i32 q2, q3, q4
+; CHECK-NEXT:    vstrb.8 q2, [r0], #16
+; CHECK-NEXT:    vmov q2, q1
 ; CHECK-NEXT:    le lr, .LBB9_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB9_2 Depth=1
 ; CHECK-NEXT:    cmp r12, r2
 ; CHECK-NEXT:    bne .LBB9_2
 ; CHECK-NEXT:  .LBB9_5: @ %for.cond.cleanup
-; CHECK-NEXT:    add sp, #48
-; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vpop {d8, d9}
+; CHECK-NEXT:    pop {r4, pc}
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.6:
 ; CHECK-NEXT:  .LCPI9_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 3 @ 0x3
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 9 @ 0x9
-; CHECK-NEXT:  .LCPI9_1:
-; CHECK-NEXT:    .long 2 @ 0x2
-; CHECK-NEXT:    .long 5 @ 0x5
-; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 11 @ 0xb
-; CHECK-NEXT:  .LCPI9_2:
-; CHECK-NEXT:    .long 1 @ 0x1
-; CHECK-NEXT:    .long 4 @ 0x4
-; CHECK-NEXT:    .long 7 @ 0x7
-; CHECK-NEXT:    .long 10 @ 0xa
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967260 @ 0xffffffdc
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
+; CHECK-NEXT:    .long 4294967284 @ 0xfffffff4
 entry:
   %cmp22 = icmp sgt i32 %n, 0
   br i1 %cmp22, label %vector.ph, label %for.cond.cleanup
Index: llvm/test/CodeGen/Thumb2/mve-gather-optimisation.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-optimisation.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-optimisation.ll
@@ -20,24 +20,23 @@
 ; CHECK-LABEL: push_out_mul:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r3, .LCPI0_0
-; CHECK-NEXT:    vmov.i32 q1, #0x18
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB0_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #96]!
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    bne .LBB0_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI0_0:
-; CHECK-NEXT:    .long 0 @ 0x0
-; CHECK-NEXT:    .long 6 @ 0x6
-; CHECK-NEXT:    .long 12 @ 0xc
-; CHECK-NEXT:    .long 18 @ 0x12
+; CHECK-NEXT:    .long 4294967200 @ 0xffffffa0
+; CHECK-NEXT:    .long 4294967224 @ 0xffffffb8
+; CHECK-NEXT:    .long 4294967248 @ 0xffffffd0
+; CHECK-NEXT:    .long 4294967272 @ 0xffffffe8
 vector.ph:                                        ; preds = %for.body.preheader
   %ind.end = shl i32 %n.vec, 1
@@ -65,24 +64,23 @@
 ; CHECK-LABEL: push_out_add:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r3, .LCPI1_0
-; CHECK-NEXT:    vmov.i32 q1, #0x8
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB1_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #32]!
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    bne .LBB1_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI1_0:
-; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 16 @ 0x10
 vector.ph:                                        ; preds = %for.body.preheader
   %ind.end = shl i32 %n.vec, 1
@@ -110,24 +108,23 @@
 ; CHECK-LABEL: push_out_add_sub_block:
 ; CHECK:       @ %bb.0: @ %vector.ph
 ; CHECK-NEXT:    adr r3, .LCPI2_0
-; CHECK-NEXT:    vmov.i32 q1, #0x8
 ; CHECK-NEXT:    vldrw.u32 q0, [r3]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
 ; CHECK-NEXT:  .LBB2_1: @ %vector.body
 ; CHECK-NEXT:    @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT:    vldrw.u32 q2, [r0, q0, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q1, [q0, #32]!
 ; CHECK-NEXT:    subs r2, #4
-; CHECK-NEXT:    vadd.i32 q0, q0, q1
-; CHECK-NEXT:    vstrb.8 q2, [r1], #16
+; CHECK-NEXT:    vstrb.8 q1, [r1], #16
 ; CHECK-NEXT:    bne .LBB2_1
 ; CHECK-NEXT:  @ %bb.2: @ %end
 ; CHECK-NEXT:    bx lr
 ; CHECK-NEXT:    .p2align 4
 ; CHECK-NEXT:  @ %bb.3:
 ; CHECK-NEXT:  .LCPI2_0:
-; CHECK-NEXT:    .long 6 @ 0x6
+; CHECK-NEXT:    .long 4294967288 @ 0xfffffff8
+; CHECK-NEXT:    .long 0 @ 0x0
 ; CHECK-NEXT:    .long 8 @ 0x8
-; CHECK-NEXT:    .long 10 @ 0xa
-; CHECK-NEXT:    .long 12 @ 0xc
+; CHECK-NEXT:    .long 16 @ 0x10
 vector.ph:                                        ; preds = %for.body.preheader
   %ind.end = shl i32 %n.vec, 1
@@ -272,10 +269,10 @@
 ; CHECK-NEXT:    push.w {r4, r5, r6, r7, r8, r9, r10, lr}
 ; CHECK-NEXT:    .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT:    .pad #48
-; CHECK-NEXT:    sub sp, #48
+; CHECK-NEXT:    .pad #32
+; CHECK-NEXT:    sub sp, #32
 ; CHECK-NEXT:    adr r6, .LCPI5_0
-; CHECK-NEXT:    ldrd r9, r12, [sp, #144]
+; CHECK-NEXT:    ldrd r9, r12, [sp, #128]
 ; CHECK-NEXT:    vldrw.u32 q0, [r6]
 ; CHECK-NEXT:    sub.w r6, r12, #1
 ; CHECK-NEXT:    movs r7, #1
@@ -287,7 +284,7 @@
 ; CHECK-NEXT:    subs r6, #4
 ; CHECK-NEXT:    vshl.i32 q2, q2, #3
 ; CHECK-NEXT:    mov.w r8, #0
-; CHECK-NEXT:    vmov.i32 q3, #0x8
+; CHECK-NEXT:    vmov.i32 q3, #0x20
 ; CHECK-NEXT:    add.w r4, r7, r6, lsr #2
 ; CHECK-NEXT:    vstrw.32 q0, [sp, #16] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB5_1: @ %for.cond8.preheader.us.us.preheader
@@ -297,16 +294,17 @@
 ; CHECK-NEXT:    mul r10, r8, r9
 ; CHECK-NEXT:    vldrw.u32 q0, [sp] @ 16-byte Reload
 ; CHECK-NEXT:    mul r7, r8, r12
-; CHECK-NEXT:    vadd.i32 q0, q0, r7
+; CHECK-NEXT:    vadd.i32 q4, q0, r7
 ; CHECK-NEXT:    movs r7, #0
-; CHECK-NEXT:    vstrw.32 q0, [sp, #32] @ 16-byte Spill
 ; CHECK-NEXT:  .LBB5_2: @ %vector.ph
 ; CHECK-NEXT:    @ Parent Loop BB5_1 Depth=1
 ; CHECK-NEXT:    @ => This Loop Header: Depth=2
 ; CHECK-NEXT:    @ Child Loop BB5_3 Depth 3
-; CHECK-NEXT:    vmov.i32 q5, #0x0
-; CHECK-NEXT:    vldrw.u32 q6, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT:    vshl.i32 q6, q4, #2
+; CHECK-NEXT:    vadd.i32 q6, q6, r0
 ; CHECK-NEXT:    vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT:    vmov.i32 q5, #0x0
+; CHECK-NEXT:    vsub.i32 q6, q6, q3
 ; CHECK-NEXT:    dls lr, r4
 ; CHECK-NEXT:  .LBB5_3: @ %vector.body
 ; CHECK-NEXT:    @ Parent Loop BB5_1 Depth=1
@@ -314,13 +312,11 @@
 ; CHECK-NEXT:    @ => This Inner Loop Header: Depth=3
 ; CHECK-NEXT:    vadd.i32 q0, q7, q2
 ; CHECK-NEXT:    vadd.i32 q7, q7, r7
-; CHECK-NEXT:    vldrw.u32 q4, [r1, q7, uxtw #2]
-; CHECK-NEXT:    vldrw.u32 q7, [r0, q6, uxtw #2]
-; CHECK-NEXT:    vadd.i32 q1, q6, q3
-; CHECK-NEXT:    vmul.i32 q4, q4, q7
-; CHECK-NEXT:    vmov q6, q1
-; CHECK-NEXT:    vadd.i32 q5, q4, q5
+; CHECK-NEXT:    vldrw.u32 q1, [r1, q7, uxtw #2]
+; CHECK-NEXT:    vldrw.u32 q7, [q6, #32]!
+; CHECK-NEXT:    vmul.i32 q1, q1, q7
 ; CHECK-NEXT:    vmov q7, q0
+; CHECK-NEXT:    vadd.i32 q5, q1, q5
 ; CHECK-NEXT:    le lr, .LBB5_3
 ; CHECK-NEXT:  @ %bb.4: @ %middle.block
 ; CHECK-NEXT:    @ in Loop: Header=BB5_2 Depth=2
@@ -336,7 +332,7 @@
 ; CHECK-NEXT:    cmp r8, r3
 ; CHECK-NEXT:    bne .LBB5_1
 ; CHECK-NEXT:  @ %bb.6: @ %for.end25
-; CHECK-NEXT:    add sp, #48
+; CHECK-NEXT:    add sp, #32
 ; CHECK-NEXT:    vpop {d8, d9, d10, d11, d12, d13, d14, d15}
 ; CHECK-NEXT:    pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
 ; CHECK-NEXT:    .p2align 4