Index: llvm/include/llvm/Analysis/TargetTransformInfo.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfo.h +++ llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -606,6 +606,10 @@ /// Return true if the target supports masked expand load. bool isLegalMaskedExpandLoad(Type *DataType) const; + /// Give the target a chance to custom legalise a masked gather whilst it is + /// still IR. Return true if legalised. + bool customLegalizeMaskedGather(IntrinsicInst *I, bool &ModifiedDT) const; + /// Return true if the target has a unified operation to calculate division /// and remainder. If so, the additional implicit multiplication and /// subtraction required to calculate a remainder from division are free. This @@ -1241,6 +1245,7 @@ virtual bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) = 0; virtual bool isLegalMaskedCompressStore(Type *DataType) = 0; virtual bool isLegalMaskedExpandLoad(Type *DataType) = 0; + virtual bool customLegalizeMaskedGather(IntrinsicInst *I, bool &ModifiedDT) = 0; virtual bool hasDivRemOp(Type *DataType, bool IsSigned) = 0; virtual bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) = 0; virtual bool prefersVectorizedAddressing() = 0; @@ -1549,6 +1554,9 @@ bool isLegalMaskedExpandLoad(Type *DataType) override { return Impl.isLegalMaskedExpandLoad(DataType); } + bool customLegalizeMaskedGather(IntrinsicInst *I, bool &ModifiedDT) override { + return Impl.customLegalizeMaskedGather(I, ModifiedDT); + } bool hasDivRemOp(Type *DataType, bool IsSigned) override { return Impl.hasDivRemOp(DataType, IsSigned); } Index: llvm/include/llvm/Analysis/TargetTransformInfoImpl.h =================================================================== --- llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -284,6 +284,8 @@ bool isLegalMaskedExpandLoad(Type *DataType) { return false; } + bool customLegalizeMaskedGather(IntrinsicInst *I, bool &ModifiedDT) { return false; } + bool hasDivRemOp(Type *DataType, bool IsSigned) { return false; } bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) { return false; } Index: llvm/lib/Analysis/TargetTransformInfo.cpp =================================================================== --- llvm/lib/Analysis/TargetTransformInfo.cpp +++ llvm/lib/Analysis/TargetTransformInfo.cpp @@ -333,6 +333,11 @@ return TTIImpl->isLegalMaskedExpandLoad(DataType); } +bool TargetTransformInfo::customLegalizeMaskedGather(IntrinsicInst *I, + bool &ModifiedDT) const { + return TTIImpl->customLegalizeMaskedGather(I, ModifiedDT); +} + bool TargetTransformInfo::hasDivRemOp(Type *DataType, bool IsSigned) const { return TTIImpl->hasDivRemOp(DataType, IsSigned); } Index: llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp =================================================================== --- llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp +++ llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp @@ -871,6 +871,8 @@ } case Intrinsic::masked_gather: Alignment = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue(); + if (TTI->customLegalizeMaskedGather(II, ModifiedDT)) + return true; if (TTI->isLegalMaskedGather(CI->getType(), MaybeAlign(Alignment))) return false; scalarizeMaskedGather(CI, ModifiedDT); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -159,10 +159,12 @@ return
isLegalMaskedLoad(DataTy, Alignment); } - bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) { return false; } + bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment); bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; } + bool customLegalizeMaskedGather(IntrinsicInst *I, bool &ModifiedDT) const; + int getMemcpyCost(const Instruction *I); int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp); Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp =================================================================== --- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -22,6 +22,8 @@ #include "llvm/IR/Instruction.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsARM.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/Type.h" #include "llvm/MC/SubtargetFeature.h" #include "llvm/Support/Casting.h" @@ -40,6 +42,10 @@ "enable-arm-maskedldst", cl::Hidden, cl::init(true), cl::desc("Enable the generation of masked loads and stores")); +static cl::opt<bool> EnableMaskedGatherScatters( + "enable-arm-maskedgatscat", cl::Hidden, cl::init(false), + cl::desc("Enable the generation of masked gathers and scatters")); + static cl::opt<bool> DisableLowOverheadLoops( "disable-arm-loloops", cl::Hidden, cl::init(false), cl::desc("Disable the generation of low-overhead loops")); @@ -514,6 +520,102 @@ (EltWidth == 8); } +bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) { + if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) + return false; + + // This method is called from 2 places: + // - once from the vectorizer with a scalar type, in which case we need to + // get this as good as we can with the limited info we have (and rely on + // the cost model for the rest), + // - and also from the masked intrinsic lowering pass with the actual vector type. + // For MVE, we want to custom legalise any gathers that we can into MVE + // intrinsics, and expand all the rest. So if we are here with a vector type, + // we know we want to expand. + if (isa<VectorType>(Ty)) + return false; + + unsigned EltWidth = Ty->getScalarSizeInBits(); + return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) || + (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8); +} + +static bool isLegalAlignment(unsigned NumElements, unsigned ElemSize, + unsigned Alignment) { + // Only allow non-extending v4i32 gathers for now. + return NumElements == 4 && ElemSize == 32 && Alignment >= 4; +} + +bool ARMTTIImpl::customLegalizeMaskedGather(IntrinsicInst *I, + bool &ModifiedDT) const { + if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps()) + return false; + using namespace PatternMatch; + LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n"); + + // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0) + // Attempt to turn the masked gather in I into an MVE intrinsic, + // potentially optimising the addressing modes as we do so.
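+ // For example (illustrative IR only - the exact intrinsic name mangling
+ // may differ), a gather with an all-true mask such as
+ //   %g = call <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(
+ //            <4 x i32*> %ptrs, i32 4,
+ //            <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x i32> undef)
+ // is expected to become roughly
+ //   %g = call <4 x i32> @llvm.arm.mve.vldr.gather.base.v4i32.v4p0i32(
+ //            <4 x i32*> %ptrs, i32 0)
+ // whereas a non-trivial mask selects the predicated variant, and a
+ // non-trivial passthru additionally becomes a select on the mask.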
+ Type *Ty = I->getType(); + Value *Ptr = I->getArgOperand(0); + unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue(); + Value *Mask = I->getArgOperand(2); + Value *PassThru = I->getArgOperand(3); + + // Check this is a valid gather with correct alignment + if (!isLegalAlignment(Ty->getVectorNumElements(), + Ty->getScalarSizeInBits(), Alignment)) { + LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid " + << "alignment or vector type\n"); + return false; + } + + IRBuilder<> Builder(I->getContext()); + Builder.SetInsertPoint(I); + Builder.SetCurrentDebugLocation(I->getDebugLoc()); + + Value *Load = nullptr; + // Look through a bitcast instruction if the number of elements is the same + if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) { + Type *BCTy = BitCast->getType(); + Type *BCSrcTy = BitCast->getOperand(0)->getType(); + if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) { + LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n"); + Ptr = BitCast->getOperand(0); + } + } + assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type"); + + if (Ty->getVectorNumElements() != 4) + // Can't build an intrinsic for this + return false; + if (match(Mask, m_One())) + Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base, + {Ty, Ptr->getType()}, + {Ptr, Builder.getInt32(0)}); + else + Load = Builder.CreateIntrinsic( + Intrinsic::arm_mve_vldr_gather_base_predicated, + {Ty, Ptr->getType(), Mask->getType()}, + {Ptr, Builder.getInt32(0), Mask}); + + if (Load && !isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) { + LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - " + << "creating select\n"); + Load = Builder.CreateSelect(Mask, Load, PassThru); + } + + if (Load) { + LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n"); + I->replaceAllUsesWith(Load); + I->eraseFromParent(); + return true; + } + LLVM_DEBUG(dbgs() << "masked gathers: could not build masked gather\n"); + return false; +} + int ARMTTIImpl::getMemcpyCost(const Instruction *I) { const MemCpyInst *MI = dyn_cast<MemCpyInst>(I); assert(MI && "MemcpyInst expected"); Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll @@ -62,19 +62,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -91,14 +80,7 @@ ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -113,19 +95,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vshl.i32
q0, q0, #2 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -140,19 +111,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -168,14 +128,7 @@ ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -192,14 +145,7 @@ ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -331,19 +277,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -358,19 +293,8 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -386,14 +310,7 @@ ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov 
r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -410,14 +327,7 @@ ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vshl.i32 q0, q0, #2 ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll @@ -117,19 +117,8 @@ ; CHECK-LABEL: unscaled_i32_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -144,14 +133,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -165,19 +147,8 @@ ; CHECK-LABEL: unsigned_unscaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -192,19 +163,8 @@ ; CHECK-LABEL: signed_unscaled_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -220,14 +180,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; 
CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -243,14 +196,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -497,19 +443,8 @@ ; CHECK-LABEL: unsigned_unscaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -524,19 +459,8 @@ ; CHECK-LABEL: signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q0, [r1] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vadd.i32 q1, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -552,14 +476,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -575,14 +492,7 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q0, [r1] ; CHECK-NEXT: vadd.i32 q1, q0, r0 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll @@ -21,19 +21,8 @@ define arm_aapcs_vfpcc <4 x i32> @ptr_v4i32(<4 x i32*>* %offptr) { ; CHECK-LABEL: ptr_v4i32: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrw.u32 q0, [r0] -; CHECK-NEXT: vmov r0, s0 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: ldr r0, [r0] -; CHECK-NEXT: ldr r3, [r3] -; 
CHECK-NEXT: vmov.32 q0[0], r0 -; CHECK-NEXT: ldr r1, [r1] -; CHECK-NEXT: vmov.32 q0[1], r3 -; CHECK-NEXT: ldr r2, [r2] -; CHECK-NEXT: vmov.32 q0[2], r1 -; CHECK-NEXT: vmov.32 q0[3], r2 +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4 @@ -162,14 +151,7 @@ ; CHECK-LABEL: ptr_v4f32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vldr s3, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vldr s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vldr s1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vldr s0, [r0] +; CHECK-NEXT: vldrw.u32 q0, [q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x float*>, <4 x float*>* %offptr, align 4 @@ -759,62 +741,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: bic r2, r2, #15 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB22_3 -; CHECK-NEXT: @ %bb.1: @ %vector.body.preheader +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: sub.w r12, r1, #16 +; CHECK-NEXT: subs r1, #16 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB22_2: @ %vector.body +; CHECK-NEXT: .LBB22_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r12, #16] -; CHECK-NEXT: @ implicit-def: $q1 -; CHECK-NEXT: add.w r12, r12, #16 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r1, #0, #1 -; CHECK-NEXT: ubfx r1, r3, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #1, #1 -; CHECK-NEXT: ubfx r1, r3, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #2, #1 -; CHECK-NEXT: ubfx r1, r3, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #3, #1 -; CHECK-NEXT: lsls r1, r2, #31 -; CHECK-NEXT: ittt ne -; CHECK-NEXT: vmovne r1, s0 -; CHECK-NEXT: ldrne r1, [r1] -; CHECK-NEXT: vmovne.32 q1[0], r1 -; CHECK-NEXT: lsls r1, r2, #30 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s1 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[1], r1 -; CHECK-NEXT: lsls r1, r2, #29 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s2 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[2], r1 -; CHECK-NEXT: lsls r1, r2, #28 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[3], r1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vldrw.u32 q0, [r1, #16]! 
+; CHECK-NEXT: vptt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB22_2 -; CHECK-NEXT: .LBB22_3: @ %for.end -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: le lr, .LBB22_1 +; CHECK-NEXT: @ %bb.2: @ %for.end ; CHECK-NEXT: pop {r7, pc} entry: %and = and i32 %n, -16 @@ -844,62 +787,23 @@ ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r7, lr} ; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #4 -; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: bic r2, r2, #15 ; CHECK-NEXT: cmp r2, #1 -; CHECK-NEXT: blt .LBB23_3 -; CHECK-NEXT: @ %bb.1: @ %vector.body.preheader +; CHECK-NEXT: it lt +; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: subs r2, #4 ; CHECK-NEXT: movs r3, #1 -; CHECK-NEXT: sub.w r12, r1, #16 +; CHECK-NEXT: subs r1, #16 ; CHECK-NEXT: add.w lr, r3, r2, lsr #2 ; CHECK-NEXT: dls lr, lr -; CHECK-NEXT: .LBB23_2: @ %vector.body +; CHECK-NEXT: .LBB23_1: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [r12, #16] -; CHECK-NEXT: @ implicit-def: $q1 -; CHECK-NEXT: add.w r12, r12, #16 -; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vmrs r3, p0 -; CHECK-NEXT: and r2, r3, #1 -; CHECK-NEXT: rsbs r1, r2, #0 -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: bfi r2, r1, #0, #1 -; CHECK-NEXT: ubfx r1, r3, #4, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #1, #1 -; CHECK-NEXT: ubfx r1, r3, #8, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #2, #1 -; CHECK-NEXT: ubfx r1, r3, #12, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #3, #1 -; CHECK-NEXT: lsls r1, r2, #31 -; CHECK-NEXT: ittt ne -; CHECK-NEXT: vmovne r1, s0 -; CHECK-NEXT: ldrne r1, [r1] -; CHECK-NEXT: vmovne.32 q1[0], r1 -; CHECK-NEXT: lsls r1, r2, #30 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s1 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[1], r1 -; CHECK-NEXT: lsls r1, r2, #29 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s2 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[2], r1 -; CHECK-NEXT: lsls r1, r2, #28 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r1, s3 -; CHECK-NEXT: ldrmi r1, [r1] -; CHECK-NEXT: vmovmi.32 q1[3], r1 -; CHECK-NEXT: vpst +; CHECK-NEXT: vldrw.u32 q0, [r1, #16]! +; CHECK-NEXT: vptt.i32 ne, q0, zr +; CHECK-NEXT: vldrwt.u32 q1, [q0] ; CHECK-NEXT: vstrwt.32 q1, [r0], #16 -; CHECK-NEXT: le lr, .LBB23_2 -; CHECK-NEXT: .LBB23_3: @ %for.end -; CHECK-NEXT: add sp, #4 +; CHECK-NEXT: le lr, .LBB23_1 +; CHECK-NEXT: @ %bb.2: @ %for.end ; CHECK-NEXT: pop {r7, pc} entry: %and = and i32 %n, -16