Index: llvm/lib/Target/ARM/ARMBaseInstrInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -838,8 +838,8 @@
   }
 }
 
-// Return true if the given intrinsic is a gather or scatter
-inline bool isGatherScatter(IntrinsicInst *IntInst) {
+// Return true if the given intrinsic is a gather
+inline bool isGather(IntrinsicInst *IntInst) {
   if (IntInst == nullptr)
     return false;
   unsigned IntrinsicID = IntInst->getIntrinsicID();
@@ -849,8 +849,15 @@
           IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
           IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
           IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
-          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
-          IntrinsicID == Intrinsic::masked_scatter ||
+          IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated);
+}
+
+// Return true if the given intrinsic is a scatter
+inline bool isScatter(IntrinsicInst *IntInst) {
+  if (IntInst == nullptr)
+    return false;
+  unsigned IntrinsicID = IntInst->getIntrinsicID();
+  return (IntrinsicID == Intrinsic::masked_scatter ||
           IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
           IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
           IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
@@ -859,6 +866,13 @@
           IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
 }
 
+// Return true if the given intrinsic is a gather or scatter
+inline bool isGatherScatter(IntrinsicInst *IntInst) {
+  if (IntInst == nullptr)
+    return false;
+  return isGather(IntInst) || isScatter(IntInst);
+}
+
 } // end namespace llvm
 
 #endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
Index: llvm/lib/Target/ARM/MVETailPredication.cpp
===================================================================
--- llvm/lib/Target/ARM/MVETailPredication.cpp
+++ llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -153,8 +153,8 @@
     return false;
 
   Intrinsic::ID ID = Call->getIntrinsicID();
-  // TODO: Support gather/scatter expand/compress operations.
-  return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load;
+  return ID == Intrinsic::masked_store || ID == Intrinsic::masked_load ||
+         isGatherScatter(Call);
 }
 
 bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
@@ -233,9 +233,19 @@
 }
 
 static FixedVectorType *getVectorType(IntrinsicInst *I) {
-  unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
-  auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
-  auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType());
+  unsigned ID = I->getIntrinsicID();
+  FixedVectorType *VecTy;
+  if (ID == Intrinsic::masked_load || isGather(I)) {
+    if (ID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated)
+      // Then the return type is a StructType; use its first contained type.
+      VecTy = dyn_cast<FixedVectorType>(I->getType()->getContainedType(0));
+    else
+      VecTy = dyn_cast<FixedVectorType>(I->getType());
+  } else if (ID == Intrinsic::masked_store) {
+    VecTy = dyn_cast<FixedVectorType>(I->getOperand(0)->getType());
+  } else {
+    VecTy = dyn_cast<FixedVectorType>(I->getOperand(2)->getType());
+  }
   assert(VecTy && "No scalable vectors expected here");
   return VecTy;
 }
@@ -273,7 +283,6 @@
       default:
         break;
       }
-
       if (IsMasked(&I)) {
         auto *VecTy = getVectorType(Int);
         unsigned Lanes = VecTy->getNumElements();
@@ -581,7 +590,8 @@
   // Walk through the masked intrinsics and try to find whether the predicate
   // operand is generated by intrinsic @llvm.get.active.lane.mask().
   for (auto *I : MaskedInsts) {
-    unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
+    unsigned PredOp =
+        (I->getIntrinsicID() == Intrinsic::masked_load || isGather(I)) ? 2 : 3;
     auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
     if (!Predicate || Predicates.count(Predicate))
       continue;
Index: llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-scatter-tailpred.ll
@@ -6,40 +6,25 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11}
-; CHECK-NEXT: vpush {d8, d9, d10, d11}
 ; CHECK-NEXT: adr r4, .LCPI0_0
-; CHECK-NEXT: add.w r0, r0, r3, lsl #2
+; CHECK-NEXT: add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI0_1
-; CHECK-NEXT: add.w r12, r3, #4
-; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: movw lr, #1250
+; CHECK-NEXT: adds r0, r3, #1
+; CHECK-NEXT: vmov.i32 q2, #0x0
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vdup.32 q3, r3
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: adds r1, r3, #4
+; CHECK-NEXT: dlstp.32 lr, r0
 ; CHECK-NEXT: .LBB0_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vadd.i32 q1, q2, r1
-; CHECK-NEXT: vdup.32 q5, r1
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vcmp.u32 hi, q5, q1
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 cs, q3, q1
-; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
-; CHECK-NEXT: vldrwt.u32 q5, [q0, #80]!
-; CHECK-NEXT: vmul.i32 q1, q5, q1
-; CHECK-NEXT: vadd.i32 q1, q4, q1
-; CHECK-NEXT: le lr, .LBB0_1
+; CHECK-NEXT: vldrw.u32 q1, [r12], #16
+; CHECK-NEXT: vldrw.u32 q3, [q0, #80]!
+; CHECK-NEXT: vmul.i32 q1, q3, q1
+; CHECK-NEXT: vadd.i32 q2, q2, q1
+; CHECK-NEXT: letp lr, .LBB0_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q1, q4
+; CHECK-NEXT: vmov q0, q2
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: str.w r0, [r2, r12, lsl #2]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
 ; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.3:
@@ -48,11 +33,6 @@
 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
-; CHECK-NEXT: .LCPI0_1:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry: ; preds = %middle.
 %add.us.us = add i32 4, %n
 %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
@@ -90,46 +70,33 @@
 define dso_local void @mve_gatherscatter_offset(i32* noalias nocapture readonly %A, i32* noalias nocapture readonly %B, i32* noalias nocapture %C, i32 %n, i32 %m, i32 %l) {
 ; CHECK-LABEL: mve_gatherscatter_offset:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: .save {r4, lr}
+; CHECK-NEXT: push {r4, lr}
+; CHECK-NEXT: .vsave {d8, d9}
+; CHECK-NEXT: vpush {d8, d9}
 ; CHECK-NEXT: adr r4, .LCPI1_0
-; CHECK-NEXT: adr r5, .LCPI1_1
+; CHECK-NEXT: add.w r12, r0, r3, lsl #2
+; CHECK-NEXT: adds r0, r3, #1
 ; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: add.w r0, r0, r3, lsl #2
-; CHECK-NEXT: add.w r12, r3, #4
-; CHECK-NEXT: vldrw.u32 q3, [r5]
-; CHECK-NEXT: movw lr, #1250
-; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: adds r3, #4
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: vmov.i32 q0, #0x14
-; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vdup.32 q4, r3
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: dlstp.32 lr, r0
 ; CHECK-NEXT: .LBB1_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q5, q2
-; CHECK-NEXT: vadd.i32 q2, q3, r4
-; CHECK-NEXT: vdup.32 q6, r4
-; CHECK-NEXT: adds r4, #4
-; CHECK-NEXT: vcmp.u32 hi, q6, q2
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpsttt
-; CHECK-NEXT: vcmpt.u32 cs, q4, q2
-; CHECK-NEXT: vldrwt.u32 q2, [r1, q1, uxtw #2]
-; CHECK-NEXT: vldrwt.u32 q6, [r0], #16
-; CHECK-NEXT: vmul.i32 q2, q2, q6
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q2, [r1, q1, uxtw #2]
-; CHECK-NEXT: vadd.i32 q2, q5, q2
+; CHECK-NEXT: vldrw.u32 q2, [r1, q1, uxtw #2]
+; CHECK-NEXT: vldrw.u32 q4, [r12], #16
+; CHECK-NEXT: vmul.i32 q2, q2, q4
+; CHECK-NEXT: vstrw.32 q2, [r1, q1, uxtw #2]
 ; CHECK-NEXT: vadd.i32 q1, q1, q0
-; CHECK-NEXT: le lr, .LBB1_1
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: letp lr, .LBB1_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q2, q5
+; CHECK-NEXT: vmov q0, q3
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: str.w r0, [r2, r12, lsl #2]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: str.w r0, [r2, r3, lsl #2]
+; CHECK-NEXT: vpop {d8, d9}
+; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.3:
 ; CHECK-NEXT: .LCPI1_0:
@@ -137,11 +104,6 @@
 ; CHECK-NEXT: .long 8 @ 0x8
 ; CHECK-NEXT: .long 13 @ 0xd
 ; CHECK-NEXT: .long 18 @ 0x12
-; CHECK-NEXT: .LCPI1_1:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry: ; preds = %middle.
 %add.us.us = add i32 4, %n
 %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
@@ -181,42 +143,26 @@
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: .save {r4, lr}
 ; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
 ; CHECK-NEXT: adr r4, .LCPI2_0
-; CHECK-NEXT: add.w r0, r0, r3, lsl #2
+; CHECK-NEXT: add.w r12, r0, r3, lsl #2
 ; CHECK-NEXT: vldrw.u32 q0, [r4]
-; CHECK-NEXT: adr r4, .LCPI2_1
-; CHECK-NEXT: add.w r12, r3, #4
-; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: movw lr, #1250
+; CHECK-NEXT: adds r0, r3, #1
+; CHECK-NEXT: vmov.i32 q3, #0x0
 ; CHECK-NEXT: vadd.i32 q0, q0, r1
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: movs r1, #0
-; CHECK-NEXT: vdup.32 q3, r3
-; CHECK-NEXT: vmov.i32 q4, #0x3
-; CHECK-NEXT: dls lr, lr
+; CHECK-NEXT: adds r1, r3, #4
+; CHECK-NEXT: vmov.i32 q2, #0x3
+; CHECK-NEXT: dlstp.32 lr, r0
 ; CHECK-NEXT: .LBB2_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vadd.i32 q1, q2, r1
-; CHECK-NEXT: vdup.32 q6, r1
-; CHECK-NEXT: adds r1, #4
-; CHECK-NEXT: vcmp.u32 hi, q6, q1
-; CHECK-NEXT: vpnot
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vcmpt.u32 cs, q3, q1
-; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
-; CHECK-NEXT: vmul.i32 q1, q1, q4
-; CHECK-NEXT: vpst
-; CHECK-NEXT: vstrwt.32 q1, [q0, #80]!
-; CHECK-NEXT: vadd.i32 q1, q5, q1
-; CHECK-NEXT: le lr, .LBB2_1
+; CHECK-NEXT: vldrw.u32 q1, [r12], #16
+; CHECK-NEXT: vmul.i32 q1, q1, q2
+; CHECK-NEXT: vstrw.32 q1, [q0, #80]!
+; CHECK-NEXT: vadd.i32 q3, q3, q1
+; CHECK-NEXT: letp lr, .LBB2_1
 ; CHECK-NEXT: @ %bb.2: @ %middle.block
-; CHECK-NEXT: vpsel q0, q1, q5
+; CHECK-NEXT: vmov q0, q3
 ; CHECK-NEXT: vaddv.u32 r0, q0
-; CHECK-NEXT: str.w r0, [r2, r12, lsl #2]
-; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
+; CHECK-NEXT: str.w r0, [r2, r1, lsl #2]
 ; CHECK-NEXT: pop {r4, pc}
 ; CHECK-NEXT: .p2align 4
 ; CHECK-NEXT: @ %bb.3:
@@ -225,11 +171,6 @@
 ; CHECK-NEXT: .long 4294967248 @ 0xffffffd0
 ; CHECK-NEXT: .long 4294967268 @ 0xffffffe4
 ; CHECK-NEXT: .long 4294967288 @ 0xfffffff8
-; CHECK-NEXT: .LCPI2_1:
-; CHECK-NEXT: .long 0 @ 0x0
-; CHECK-NEXT: .long 1 @ 0x1
-; CHECK-NEXT: .long 2 @ 0x2
-; CHECK-NEXT: .long 3 @ 0x3
 entry: ; preds = %middle.
 %add.us.us = add i32 4, %n
 %arrayidx.us.us = getelementptr inbounds i32, i32* %C, i32 %add.us.us
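
Reviewer note, not part of the patch: a minimal sketch of how the split predicates compose at a call site. classifyMemOp is a hypothetical helper invented here purely for illustration; only isGather, isScatter, and isGatherScatter (and their tolerance of a null IntrinsicInst) come from the header change above.

// Hypothetical usage sketch; assumes it is compiled inside llvm/lib/Target/ARM
// so that the private header ARMBaseInstrInfo.h is on the include path.
#include "ARMBaseInstrInfo.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/IntrinsicInst.h"

using namespace llvm;

enum class MemOpKind { Gather, Scatter, Other };

// Classify an arbitrary instruction. Both predicates accept a null
// IntrinsicInst and return false for it, so the dyn_cast result can be
// forwarded without a separate null check.
static MemOpKind classifyMemOp(Instruction &I) {
  auto *Int = dyn_cast<IntrinsicInst>(&I);
  if (isGather(Int))
    return MemOpKind::Gather;
  if (isScatter(Int))
    return MemOpKind::Scatter;
  return MemOpKind::Other;
}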