diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp --- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp +++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp @@ -76,6 +76,7 @@ private: LoopInfo *LI = nullptr; + const DataLayout *DL; // Check this is a valid gather with correct alignment bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize, @@ -335,13 +336,13 @@ Optional MVEGatherScatterLowering::getIfConst(const Value *V) { const Constant *C = dyn_cast(V); - if (C != nullptr) + if (C && C->getSplatValue()) return Optional{C->getUniqueInteger().getSExtValue()}; if (!isa(V)) return Optional{}; const Instruction *I = cast(V); - if (I->getOpcode() == Instruction::Add || + if (I->getOpcode() == Instruction::Add || I->getOpcode() == Instruction::Or || I->getOpcode() == Instruction::Mul || I->getOpcode() == Instruction::Shl) { Optional Op0 = getIfConst(I->getOperand(0)); @@ -354,18 +355,28 @@ return Optional{Op0.getValue() * Op1.getValue()}; if (I->getOpcode() == Instruction::Shl) return Optional{Op0.getValue() << Op1.getValue()}; + if (I->getOpcode() == Instruction::Or) + return Optional{Op0.getValue() | Op1.getValue()}; } return Optional{}; } +// Return true if I is an Or instruction that is equivalent to an add, due to +// the operands having no common bits set. +static bool isAddLikeOr(Instruction *I, const DataLayout &DL) { + return I->getOpcode() == Instruction::Or && + haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL); +} + std::pair MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) { std::pair ReturnFalse = std::pair(nullptr, 0); - // At this point, the instruction we're looking at must be an add or we - // bail out + // At this point, the instruction we're looking at must be an add or an + // add-like-or. 
Instruction *Add = dyn_cast(Inst); - if (Add == nullptr || Add->getOpcode() != Instruction::Add) + if (Add == nullptr || + (Add->getOpcode() != Instruction::Add && !isAddLikeOr(Add, *DL))) return ReturnFalse; Value *Summand; @@ -740,10 +751,9 @@ // The gep was in charge of making sure the offsets are scaled correctly // - calculate that factor so it can be applied by hand - DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout(); int TypeScale = - computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()), - DT.getTypeSizeInBits(GEP->getType()) / + computeScale(DL->getTypeSizeInBits(GEP->getOperand(0)->getType()), + DL->getTypeSizeInBits(GEP->getType()) / cast(GEP->getType())->getNumElements()); if (TypeScale == -1) return nullptr; @@ -927,7 +937,7 @@ // Check whether all usages of this instruction are as offsets of // gathers/scatters or simple arithmetics only used by gathers/scatters -static bool hasAllGatScatUsers(Instruction *I) { +static bool hasAllGatScatUsers(Instruction *I, const DataLayout &DL) { if (I->hasNUses(0)) { return false; } @@ -941,8 +951,9 @@ } else { unsigned OpCode = cast(U)->getOpcode(); if ((OpCode == Instruction::Add || OpCode == Instruction::Mul || - OpCode == Instruction::Shl) && - hasAllGatScatUsers(cast(U))) { + OpCode == Instruction::Shl || + isAddLikeOr(cast(U), DL)) && + hasAllGatScatUsers(cast(U), DL)) { continue; } return false; @@ -960,7 +971,7 @@ if (!isa(Offsets)) return false; Instruction *Offs = cast(Offsets); - if (Offs->getOpcode() != Instruction::Add && + if (Offs->getOpcode() != Instruction::Add && !isAddLikeOr(Offs, *DL) && Offs->getOpcode() != Instruction::Mul && Offs->getOpcode() != Instruction::Shl) return false; @@ -968,7 +979,7 @@ if (L == nullptr) return false; if (!Offs->hasOneUse()) { - if (!hasAllGatScatUsers(Offs)) + if (!hasAllGatScatUsers(Offs, *DL)) return false; } @@ -1066,6 +1077,7 @@ switch (Offs->getOpcode()) { case Instruction::Add: + case Instruction::Or: pushOutAdd(NewPhi, 
OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1); break; case Instruction::Mul: @@ -1221,6 +1233,7 @@ if (!ST->hasMVEIntegerOps()) return false; LI = &getAnalysis().getLoopInfo(); + DL = &F.getParent()->getDataLayout(); SmallVector Gathers; SmallVector Scatters; diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -1457,45 +1457,62 @@ define void @shlor(i32* nocapture %x, i32* noalias nocapture readonly %y, i32 %n) { ; CHECK-LABEL: shlor: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB16_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adr r3, .LCPI16_0 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vmov.i32 q1, #0x4 +; CHECK-NEXT: adr.w lr, .LCPI16_0 +; CHECK-NEXT: adr r4, .LCPI16_1 +; CHECK-NEXT: adr r5, .LCPI16_2 +; CHECK-NEXT: adr r6, .LCPI16_3 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q3, [lr] +; CHECK-NEXT: vadd.i32 q0, q0, r1 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vadd.i32 q2, q2, r1 +; CHECK-NEXT: vadd.i32 q3, q3, r1 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB16_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vshl.i32 q2, q0, #3 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vldrw.u32 q3, [r1, q2, uxtw #2] -; CHECK-NEXT: vorr.i32 q4, #0x2 -; CHECK-NEXT: vldrw.u32 q5, [r1, q4, uxtw #2] -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vadd.i32 q3, q5, q3 -; CHECK-NEXT: vorr.i32 q4, #0x4 -; CHECK-NEXT: vldrw.u32 q5, [r1, q4, 
uxtw #2] -; CHECK-NEXT: vorr.i32 q2, #0x6 -; CHECK-NEXT: vadd.i32 q3, q3, q5 -; CHECK-NEXT: vadd.i32 q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q4, [r1, q2, uxtw #2] -; CHECK-NEXT: vadd.i32 q2, q3, q4 -; CHECK-NEXT: vstrw.32 q2, [r0], #16 +; CHECK-NEXT: vldrw.u32 q4, [q3, #128]! +; CHECK-NEXT: vldrw.u32 q5, [q2, #128]! +; CHECK-NEXT: vadd.i32 q4, q5, q4 +; CHECK-NEXT: vldrw.u32 q5, [q1, #128]! +; CHECK-NEXT: vldrw.u32 q6, [q0, #128]! +; CHECK-NEXT: vadd.i32 q4, q4, q5 +; CHECK-NEXT: vadd.i32 q4, q4, q6 +; CHECK-NEXT: vstrw.32 q4, [r0], #16 ; CHECK-NEXT: letp lr, .LBB16_2 ; CHECK-NEXT: .LBB16_3: @ %for.cond.cleanup -; CHECK-NEXT: vpop {d8, d9, d10, d11} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI16_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 4294967168 @ 0xffffff80 +; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 +; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 +; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 +; CHECK-NEXT: .LCPI16_1: +; CHECK-NEXT: .long 4294967176 @ 0xffffff88 +; CHECK-NEXT: .long 4294967208 @ 0xffffffa8 +; CHECK-NEXT: .long 4294967240 @ 0xffffffc8 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .LCPI16_2: +; CHECK-NEXT: .long 4294967184 @ 0xffffff90 +; CHECK-NEXT: .long 4294967216 @ 0xffffffb0 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .LCPI16_3: +; CHECK-NEXT: .long 4294967192 @ 0xffffff98 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 entry: %cmp23 = icmp sgt i32 %n, 0 br i1 %cmp23, label %vector.ph, label %for.cond.cleanup diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll --- 
a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -283,62 +283,76 @@ define void @shlor(i32* nocapture readonly %x, i32* noalias nocapture %y, i32 %n) { ; CHECK-LABEL: shlor: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} +; CHECK-NEXT: .save {r4, r5, r6, lr} +; CHECK-NEXT: push {r4, r5, r6, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #80 -; CHECK-NEXT: sub sp, #80 +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB5_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vmov.i32 q1, #0x1 -; CHECK-NEXT: adr r3, .LCPI5_0 -; CHECK-NEXT: vstrw.32 q1, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x3 -; CHECK-NEXT: vldrw.u32 q0, [r3] -; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x2 -; CHECK-NEXT: vstrw.32 q1, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x4 -; CHECK-NEXT: vstrw.32 q1, [sp] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: adr.w lr, .LCPI5_0 +; CHECK-NEXT: adr r4, .LCPI5_1 +; CHECK-NEXT: adr r5, .LCPI5_2 +; CHECK-NEXT: adr r6, .LCPI5_3 +; CHECK-NEXT: vldrw.u32 q0, [r6] +; CHECK-NEXT: vldrw.u32 q1, [r5] +; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q3, [lr] +; CHECK-NEXT: vadd.i32 q0, q0, r1 +; CHECK-NEXT: vadd.i32 q1, q1, r1 +; CHECK-NEXT: vadd.i32 q2, q2, r1 +; CHECK-NEXT: vadd.i32 q3, q3, r1 +; CHECK-NEXT: vmov.i32 q4, #0x3 +; CHECK-NEXT: vstrw.32 q4, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q4, #0x2 +; CHECK-NEXT: vstrw.32 q4, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q4, #0x1 +; CHECK-NEXT: vmov.i32 q7, #0x4 +; CHECK-NEXT: vstrw.32 q4, [sp] @ 16-byte Spill ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB5_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: 
vldrw.u32 q5, [r0], #16 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vshl.i32 q7, q0, #3 -; CHECK-NEXT: vadd.i32 q1, q5, q6 -; CHECK-NEXT: vadd.i32 q2, q5, q2 -; CHECK-NEXT: vadd.i32 q3, q5, q3 -; CHECK-NEXT: vadd.i32 q5, q5, q4 -; CHECK-NEXT: vmov q4, q7 -; CHECK-NEXT: vstrw.32 q1, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vstrw.32 q5, [r1, q7, uxtw #2] -; CHECK-NEXT: vadd.i32 q0, q0, q6 -; CHECK-NEXT: vorr.i32 q4, #0x4 -; CHECK-NEXT: vorr.i32 q7, #0x2 -; CHECK-NEXT: vstrw.32 q3, [r1, q7, uxtw #2] -; CHECK-NEXT: vstrw.32 q2, [r1, q4, uxtw #2] -; CHECK-NEXT: vorr.i32 q1, #0x6 -; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2] +; CHECK-NEXT: vldrw.u32 q4, [r0], #16 +; CHECK-NEXT: vldrw.u32 q6, [sp] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q5, q4, q7 +; CHECK-NEXT: vadd.i32 q6, q4, q6 +; CHECK-NEXT: vstrw.32 q6, [q3, #128]! +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q6, q4, q6 +; CHECK-NEXT: vstrw.32 q6, [q2, #128]! +; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q4, q4, q6 +; CHECK-NEXT: vstrw.32 q4, [q1, #128]! +; CHECK-NEXT: vstrw.32 q5, [q0, #128]! 
; CHECK-NEXT: letp lr, .LBB5_2 ; CHECK-NEXT: .LBB5_3: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #80 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: pop {r4, r5, r6, pc} ; CHECK-NEXT: .p2align 4 ; CHECK-NEXT: @ %bb.4: ; CHECK-NEXT: .LCPI5_0: -; CHECK-NEXT: .long 0 @ 0x0 -; CHECK-NEXT: .long 1 @ 0x1 -; CHECK-NEXT: .long 2 @ 0x2 -; CHECK-NEXT: .long 3 @ 0x3 +; CHECK-NEXT: .long 4294967168 @ 0xffffff80 +; CHECK-NEXT: .long 4294967200 @ 0xffffffa0 +; CHECK-NEXT: .long 4294967232 @ 0xffffffc0 +; CHECK-NEXT: .long 4294967264 @ 0xffffffe0 +; CHECK-NEXT: .LCPI5_1: +; CHECK-NEXT: .long 4294967176 @ 0xffffff88 +; CHECK-NEXT: .long 4294967208 @ 0xffffffa8 +; CHECK-NEXT: .long 4294967240 @ 0xffffffc8 +; CHECK-NEXT: .long 4294967272 @ 0xffffffe8 +; CHECK-NEXT: .LCPI5_2: +; CHECK-NEXT: .long 4294967184 @ 0xffffff90 +; CHECK-NEXT: .long 4294967216 @ 0xffffffb0 +; CHECK-NEXT: .long 4294967248 @ 0xffffffd0 +; CHECK-NEXT: .long 4294967280 @ 0xfffffff0 +; CHECK-NEXT: .LCPI5_3: +; CHECK-NEXT: .long 4294967192 @ 0xffffff98 +; CHECK-NEXT: .long 4294967224 @ 0xffffffb8 +; CHECK-NEXT: .long 4294967256 @ 0xffffffd8 +; CHECK-NEXT: .long 4294967288 @ 0xfffffff8 entry: %cmp33 = icmp sgt i32 %n, 0 br i1 %cmp33, label %vector.ph, label %for.cond.cleanup