// Patch summary (descriptive preamble only; the patch text below is unchanged).
//
// Files touched:
//   * llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp
//   * llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll
//
// MVELaneInterleavingPass.cpp:
//   * tryInterleave no longer derives `VT` from `Start->getType()` up front.
//     `VT` is now taken from the type of the first collected trunc, or, when
//     there are no truncs, from the operand type of the first collected ext.
//     The function returns false if both the Truncs and Exts sets are empty.
//   * A new `Reducts` set is added. When the walk meets a call to
//     Intrinsic::vector_reduce_add, the instruction is inserted into `Reducts`
//     and into `Visited`.
//   * The LLVM_DEBUG dump now puts each section heading on its own line and
//     prints the collected reductions.
//   * The assertion is relaxed from "some truncs" to "some truncs or
//     reductions".
//   * An extra check bails out ("Reduction does not look profitable") when
//     reductions were found and `Ops` is empty or consists solely of Mul,
//     Select and ICmp instructions.
//   * New static helper isAddReduction(Instruction &) tests for the
//     vector_reduce_add intrinsic; runOnFunction now also calls tryInterleave
//     on such instructions when they are not already in `Visited`.
//
// mve-laneinterleaving-reduct.ll (CHECK lines only):
//   * reduce_v16i16_shift_mul / reduce_v16i16_shift_sub: the expected output no
//     longer contains the .pad / sub sp / vstrw.32 / vldrb.u16 sequence and
//     instead expects vmullt/vmullb (mul case) or vmovlt/vmovlb + vsub (sub
//     case) feeding vaddv/vaddva.
//   * .LBB4_8 loop body: expects vldrh.u16 loads with vmullb.s16 / vmullt.s16
//     in place of the vldrh.s32 + vmul.i32 sequence.
//
// NOTE(review): this copy of the patch looks extraction-damaged. Hunks are
// collapsed onto a few physical lines, and template argument lists appear to
// be missing (e.g. `cast(Start->getType())`, `SmallSetVector Truncs`,
// `dyn_cast(&I)`) -- presumably the angle-bracket contents were stripped.
// Confirm against the original patch before attempting to apply it.
diff --git a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp --- a/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp +++ b/llvm/lib/Target/ARM/MVELaneInterleavingPass.cpp @@ -154,7 +154,6 @@ static bool tryInterleave(Instruction *Start, SmallPtrSetImpl &Visited) { LLVM_DEBUG(dbgs() << "tryInterleave from " << *Start << "\n"); - auto *VT = cast(Start->getType()); if (!isa(Start->getOperand(0))) return false; @@ -165,6 +164,7 @@ Worklist.push_back(cast(Start->getOperand(0))); SmallSetVector Truncs; + SmallSetVector Reducts; SmallSetVector Exts; SmallSetVector OtherLeafs; SmallSetVector Ops; @@ -198,6 +198,13 @@ if (!II) return false; + if (II->getIntrinsicID() == Intrinsic::vector_reduce_add) { + if (!Reducts.insert(I)) + continue; + Visited.insert(I); + break; + } + switch (II->getIntrinsicID()) { case Intrinsic::abs: case Intrinsic::smin: @@ -267,21 +274,32 @@ return false; LLVM_DEBUG({ - dbgs() << "Found group:\n Exts:"; + dbgs() << "Found group:\n Exts:\n"; for (auto *I : Exts) dbgs() << " " << *I << "\n"; - dbgs() << " Ops:"; + dbgs() << " Ops:\n"; for (auto *I : Ops) dbgs() << " " << *I << "\n"; - dbgs() << " OtherLeafs:"; + dbgs() << " OtherLeafs:\n"; for (auto *I : OtherLeafs) dbgs() << " " << *I->get() << " of " << *I->getUser() << "\n"; - dbgs() << "Truncs:"; + dbgs() << " Truncs:\n"; for (auto *I : Truncs) dbgs() << " " << *I << "\n"; + dbgs() << " Reducts:\n"; + for (auto *I : Reducts) + dbgs() << " " << *I << "\n"; }); - assert(!Truncs.empty() && "Expected some truncs"); + assert((!Truncs.empty() || !Reducts.empty()) && + "Expected some truncs or reductions"); + if (Truncs.empty() && Exts.empty()) + return false; + + auto *VT = !Truncs.empty() + ? 
cast(Truncs[0]->getType()) + : cast(Exts[0]->getOperand(0)->getType()); + LLVM_DEBUG(dbgs() << "Using VT:" << *VT << "\n"); // Check types unsigned NumElts = VT->getNumElements(); @@ -311,6 +329,14 @@ // Check that it looks beneficial if (!isProfitableToInterleave(Exts, Truncs)) return false; + if (!Reducts.empty() && (Ops.empty() || all_of(Ops, [](Instruction *I) { + return I->getOpcode() == Instruction::Mul || + I->getOpcode() == Instruction::Select || + I->getOpcode() == Instruction::ICmp; + }))) { + LLVM_DEBUG(dbgs() << "Reduction does not look profitable\n"); + return false; + } // Create new shuffles around the extends / truncs / other leaves. IRBuilder<> Builder(Start); @@ -367,6 +393,14 @@ return true; } +// Add reductions are fairly common and associative, meaning we can start the +// interleaving from them and don't need to emit a shuffle. +static bool isAddReduction(Instruction &I) { + if (auto *II = dyn_cast(&I)) + return II->getIntrinsicID() == Intrinsic::vector_reduce_add; + return false; +} + bool MVELaneInterleaving::runOnFunction(Function &F) { if (!EnableInterleave) return false; @@ -380,8 +414,10 @@ SmallPtrSet Visited; for (Instruction &I : reverse(instructions(F))) { - if (I.getType()->isVectorTy() && - (isa(I) || isa(I)) && !Visited.count(&I)) + if (((I.getType()->isVectorTy() && + (isa(I) || isa(I))) || + isAddReduction(I)) && + !Visited.count(&I)) Changed |= tryInterleave(&I, Visited); } diff --git a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll --- a/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-laneinterleaving-reduct.ll @@ -4,23 +4,12 @@ define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_mul(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: reduce_v16i16_shift_mul: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstrw.32 q1, [r0] -; 
CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, [r1] -; CHECK-NEXT: vmul.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r0] +; CHECK-NEXT: vmullt.u8 q2, q0, q1 +; CHECK-NEXT: vmullb.u8 q0, q0, q1 +; CHECK-NEXT: vshr.s16 q2, q2, #14 ; CHECK-NEXT: vshr.s16 q0, q0, #14 -; CHECK-NEXT: vmul.i16 q1, q2, q1 -; CHECK-NEXT: vaddv.u16 r0, q0 -; CHECK-NEXT: vshr.s16 q1, q1, #14 -; CHECK-NEXT: vaddva.u16 r0, q1 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vaddv.u16 r0, q2 +; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: %s0s = zext <16 x i8> %s0 to <16 x i16> @@ -50,23 +39,16 @@ define arm_aapcs_vfpcc i16 @reduce_v16i16_shift_sub(<16 x i8> %s0, <16 x i8> %s1) { ; CHECK-LABEL: reduce_v16i16_shift_sub: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: add r0, sp, #16 -; CHECK-NEXT: mov r1, sp -; CHECK-NEXT: vstrw.32 q1, [r0] -; CHECK-NEXT: vstrw.32 q0, [r1] -; CHECK-NEXT: vldrb.u16 q0, [r0, #8] -; CHECK-NEXT: vldrb.u16 q1, [r1, #8] -; CHECK-NEXT: vldrb.u16 q2, [r1] -; CHECK-NEXT: vsub.i16 q0, q1, q0 -; CHECK-NEXT: vldrb.u16 q1, [r0] +; CHECK-NEXT: vmovlt.u8 q2, q1 +; CHECK-NEXT: vmovlt.u8 q3, q0 +; CHECK-NEXT: vsub.i16 q2, q3, q2 +; CHECK-NEXT: vmovlb.u8 q1, q1 +; CHECK-NEXT: vmovlb.u8 q0, q0 +; CHECK-NEXT: vshr.s16 q2, q2, #14 +; CHECK-NEXT: vsub.i16 q0, q0, q1 +; CHECK-NEXT: vaddv.u16 r0, q2 ; CHECK-NEXT: vshr.s16 q0, q0, #14 -; CHECK-NEXT: vsub.i16 q1, q2, q1 -; CHECK-NEXT: vaddv.u16 r0, q0 -; CHECK-NEXT: vshr.s16 q1, q1, #14 -; CHECK-NEXT: vaddva.u16 r0, q1 -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: vaddva.u16 r0, q0 ; CHECK-NEXT: bx lr entry: %s0s = zext <16 x i8> %s0 to <16 x i16> @@ -190,17 +172,15 @@ ; CHECK-NEXT: .LBB4_8: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_4 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.s32 q2, [r5], #16 -; CHECK-NEXT: vldrh.s32 q1, [r4], #16 +; CHECK-NEXT: vldrh.u16 q1, 
[r4], #16 +; CHECK-NEXT: vldrh.u16 q2, [r5], #16 ; CHECK-NEXT: rsb.w r1, r12, #0 -; CHECK-NEXT: vmul.i32 q1, q2, q1 -; CHECK-NEXT: vldrh.s32 q2, [r4, #-8] -; CHECK-NEXT: vldrh.s32 q3, [r5, #-8] +; CHECK-NEXT: vmullb.s16 q3, q2, q1 +; CHECK-NEXT: vmullt.s16 q1, q2, q1 +; CHECK-NEXT: vshl.s32 q3, r1 ; CHECK-NEXT: vshl.s32 q1, r1 +; CHECK-NEXT: vaddva.u32 r6, q3 ; CHECK-NEXT: vaddva.u32 r6, q1 -; CHECK-NEXT: vmul.i32 q2, q3, q2 -; CHECK-NEXT: vshl.s32 q2, r1 -; CHECK-NEXT: vaddva.u32 r6, q2 ; CHECK-NEXT: le lr, .LBB4_8 ; CHECK-NEXT: @ %bb.9: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB4_4 Depth=1