diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -81,6 +81,12 @@
                                Align Alignment);
   // Check whether Ptr is hidden behind a bitcast and look through it
   void lookThroughBitcast(Value *&Ptr);
+  // Decompose a ptr into Base and Offsets, potentially using a GEP to return a
+  // scalar base and vector offsets, or else fallback to using a base of 0 and
+  // offset of Ptr where possible.
+  Value *decomposePtr(Value *Ptr, Value *&Offsets, int &Scale,
+                      FixedVectorType *Ty, Type *MemoryTy,
+                      IRBuilder<> &Builder);
   // Check for a getelementptr and deduce base and offsets from it, on success
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
@@ -213,6 +219,33 @@
   return true;
 }
 
+Value *MVEGatherScatterLowering::decomposePtr(Value *Ptr, Value *&Offsets,
+                                              int &Scale, FixedVectorType *Ty,
+                                              Type *MemoryTy,
+                                              IRBuilder<> &Builder) {
+  if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+    if (Value *V = decomposeGEP(Offsets, Ty, GEP, Builder)) {
+      Scale =
+          computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
+                       MemoryTy->getScalarSizeInBits());
+      return Scale == -1 ? nullptr : V;
+    }
+  }
+
+  // If we couldn't use the GEP (or it doesn't exist), attempt to use a
+  // BasePtr of 0 with Ptr as the Offsets, so long as there are only 4
+  // elements.
+  FixedVectorType *PtrTy = cast<FixedVectorType>(Ptr->getType());
+  if (PtrTy->getNumElements() != 4 || MemoryTy->getScalarSizeInBits() == 32)
+    return nullptr;
+  Value *Zero = ConstantInt::get(Builder.getInt32Ty(), 0);
+  Value *BasePtr = Builder.CreateIntToPtr(Zero, Builder.getInt8PtrTy());
+  Offsets = Builder.CreatePtrToInt(
+      Ptr, FixedVectorType::get(Builder.getInt32Ty(), 4));
+  Scale = 0;
+  return BasePtr;
+}
+
 Value *MVEGatherScatterLowering::decomposeGEP(Value *&Offsets,
                                               FixedVectorType *Ty,
                                               GetElementPtrInst *GEP,
                                               IRBuilder<> &Builder) {
@@ -446,14 +479,14 @@
     IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) {
   using namespace PatternMatch;
 
-  Type *OriginalTy = I->getType();
-  Type *ResultTy = OriginalTy;
+  Type *MemoryTy = I->getType();
+  Type *ResultTy = MemoryTy;
 
   unsigned Unsigned = 1;
   // The size of the gather was already checked in isLegalTypeAndAlignment;
   // if it was not a full vector width an appropriate extend should follow.
   auto *Extend = Root;
-  if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+  if (MemoryTy->getPrimitiveSizeInBits() < 128) {
     // Only transform gathers with exactly one use
     if (!I->hasOneUse())
       return nullptr;
@@ -478,32 +511,26 @@
     }
   }
 
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr =
-      decomposeGEP(Offsets, cast<FixedVectorType>(ResultTy), GEP, Builder);
+  int Scale;
+  Value *BasePtr = decomposePtr(
+      Ptr, Offsets, Scale, cast<FixedVectorType>(ResultTy), MemoryTy, Builder);
   if (!BasePtr)
     return nullptr;
 
-  int Scale =
-      computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
-                   OriginalTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
   Root = Extend;
-
   Value *Mask = I->getArgOperand(2);
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
         {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
-        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+        {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
          Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
   else
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset,
         {ResultTy, BasePtr->getType(), Offsets->getType()},
-        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+        {BasePtr, Offsets, Builder.getInt32(MemoryTy->getScalarSizeInBits()),
          Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
@@ -617,19 +644,13 @@
     return nullptr;
   }
 
-  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   Value *Offsets;
-  Value *BasePtr =
-      decomposeGEP(Offsets, cast<FixedVectorType>(InputTy), GEP, Builder);
+  int Scale;
+  Value *BasePtr = decomposePtr(
+      Ptr, Offsets, Scale, cast<FixedVectorType>(InputTy), MemoryTy, Builder);
   if (!BasePtr)
     return nullptr;
 
-  int Scale =
-      computeScale(GEP->getSourceElementType()->getPrimitiveSizeInBits(),
-                   MemoryTy->getScalarSizeInBits());
-  if (Scale == -1)
-    return nullptr;
-
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vstr_scatter_offset_predicated,
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -234,16 +234,9 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrh.s32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -255,16 +248,9 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrh.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -574,17 +560,9 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q0[2], q0[0], r2, r0
-; CHECK-NEXT:    vmov q0[3], q0[1], r3, r1
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrb.s32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -596,17 +574,9 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_zext32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r0]
-; CHECK-NEXT:    vmov r0, r1, d1
-; CHECK-NEXT:    vmov r2, r3, d0
-; CHECK-NEXT:    vmov.i32 q0, #0xff
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov q1[2], q1[0], r2, r0
-; CHECK-NEXT:    vmov q1[3], q1[1], r3, r1
-; CHECK-NEXT:    vand q0, q1, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r0]
+; CHECK-NEXT:    movs r1, #0
+; CHECK-NEXT:    vldrb.u32 q0, [r1, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll
@@ -236,22 +236,14 @@
 define arm_aapcs_vfpcc void @ext_scaled_i16_i32_2gep(i16* %base, <4 x i32>* %offptr, <4 x i32> %input) {
 ; CHECK-LABEL: ext_scaled_i16_i32_2gep:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q2, [r1]
 ; CHECK-NEXT:    vmov.i32 q1, #0xa
-; CHECK-NEXT:    vmov r1, r3, d0
+; CHECK-NEXT:    movs r2, #0
 ; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vmov r4, r5, d1
 ; CHECK-NEXT:    vadd.i32 q2, q2, r0
 ; CHECK-NEXT:    vadd.i32 q1, q2, q1
-; CHECK-NEXT:    vmov r0, r12, d2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    strh.w r3, [r12]
-; CHECK-NEXT:    strh r4, [r2]
-; CHECK-NEXT:    strh.w r5, [lr]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vstrh.32 q0, [r2, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
   %ptrs = getelementptr inbounds i16, i16* %base, <4 x i32> %offs
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -228,22 +228,13 @@
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @ptr_v4i16_trunc(<4 x i32> %v, <4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_trunc:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, r3, d0
-; CHECK-NEXT:    vmov r4, r5, d1
-; CHECK-NEXT:    vmov r0, r12, d2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    strh.w r3, [r12]
-; CHECK-NEXT:    strh r4, [r2]
-; CHECK-NEXT:    strh.w r5, [lr]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vstrh.32 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i16>
@@ -463,22 +454,13 @@
   ret void
 }
 
-; Expand
 define arm_aapcs_vfpcc void @ptr_v4i8_trunc32(<4 x i32> %v, <4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_trunc32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
 ; CHECK-NEXT:    vldrw.u32 q1, [r0]
-; CHECK-NEXT:    vmov r1, r3, d0
-; CHECK-NEXT:    vmov r4, r5, d1
-; CHECK-NEXT:    vmov r0, r12, d2
-; CHECK-NEXT:    vmov r2, lr, d3
-; CHECK-NEXT:    strb r1, [r0]
-; CHECK-NEXT:    strb.w r3, [r12]
-; CHECK-NEXT:    strb r4, [r2]
-; CHECK-NEXT:    strb.w r5, [lr]
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    movs r0, #0
+; CHECK-NEXT:    vstrb.32 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
   %ext = trunc <4 x i32> %v to <4 x i8>
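
Note: as an illustration of the new fallback path (a hand-written sketch, not compiler output), consider the updated ptr_v4i16_zext test. The gather there is fed directly by a vector of pointers rather than a getelementptr, so decomposePtr now produces a base of constant 0 and reuses the pointer vector itself, cast to <4 x i32>, as the offsets, with Scale = 0, a 16-bit memory size and Unsigned = 1. Assuming the usual MVE intrinsic name mangling, and with an illustrative function name and value names, the rewritten IR would look roughly like this:

define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext_sketch(<4 x i16*>* %offptr) {
entry:
  %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
  ; base 0 materialized as an i8*, offsets = the pointers reinterpreted as i32s
  %base = inttoptr i32 0 to i8*
  %offsets = ptrtoint <4 x i16*> %offs to <4 x i32>
  ; operands: base, offsets, memory element size in bits (16), scale (0), unsigned (1)
  %load = call <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8* %base, <4 x i32> %offsets, i32 16, i32 0, i32 1)
  ret <4 x i32> %load
}

declare <4 x i32> @llvm.arm.mve.vldr.gather.offset.v4i32.p0i8.v4i32(i8*, <4 x i32>, i32, i32, i32)

This is the form that selects to the new CHECK lines above: the zero base becomes movs r1, #0 and the offset gather becomes vldrh.u32 q0, [r1, q1].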