diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,7 +84,7 @@
   bool lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
   Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
-                                     IRBuilder<> Builder);
+                                     Instruction *&Root, IRBuilder<> Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
                                    IRBuilder<> Builder);
@@ -104,9 +104,9 @@
 bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
                                                        unsigned ElemSize,
                                                        unsigned Alignment) {
-  // Do only allow non-extending gathers for now
-  if (((NumElements == 4 && ElemSize == 32) ||
-       (NumElements == 8 && ElemSize == 16) ||
+  if (((NumElements == 4 &&
+        (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+       (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
       (NumElements == 16 && ElemSize == 8)) &&
      ElemSize / 8 <= Alignment)
    return true;
@@ -126,9 +126,6 @@
                     << " from base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
   if (GEPPtr->getType()->isVectorTy()) {
-    LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
-                      << " hidden behind a getelementptr currently not"
-                      << " supported. Expanding.\n");
     return nullptr;
   }
   if (GEP->getNumOperands() != 2) {
@@ -194,7 +191,10 @@
   IRBuilder<> Builder(I->getContext());
   Builder.SetInsertPoint(I);
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
-  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+
+  Instruction *Root = I;
+
+  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
@@ -206,18 +206,24 @@
     Load = Builder.CreateSelect(Mask, Load, PassThru);
   }
 
+  Root->replaceAllUsesWith(Load);
+  Root->eraseFromParent();
+  if (Root != I)
+    // If this was an extending gather, we need to get rid of the
+    // sext/zext as well as of the gather itself
+    I->eraseFromParent();
   LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
-  I->replaceAllUsesWith(Load);
-  I->eraseFromParent();
   return true;
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
     IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
   using namespace PatternMatch;
-  LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+
   Type *Ty = I->getType();
-  if (Ty->getVectorNumElements() != 4)
+
+  LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+  if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     // Can't build an intrinsic for this
     return nullptr;
   Value *Mask = I->getArgOperand(2);
@@ -233,23 +239,55 @@
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+    IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> Builder) {
   using namespace PatternMatch;
-  Type *Ty = I->getType();
+
+  Type *OriginalTy = I->getType();
+  Type *ResultTy = OriginalTy;
+
+  unsigned Unsigned = 1;
+  // The size of the gather was already checked in isLegalTypeAndAlignment;
+  // if it was not a full vector width an appropriate extend should follow.
+  auto *Extend = Root;
+  if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+    // Only transform gathers with exactly one use
+    if (!I->hasOneUse())
+      return nullptr;
+
+    // The correct root to replace is not the CallInst itself, but the
+    // instruction which extends it
+    Extend = cast<Instruction>(*I->users().begin());
+    if (isa<SExtInst>(Extend)) {
+      Unsigned = 0;
+    } else if (!isa<ZExtInst>(Extend)) {
+      LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
+                        << "Expanding\n");
+      return nullptr;
+    }
+    LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+    ResultTy = Extend->getType();
+    // The final size of the gather must be a full vector width
+    if (ResultTy->getPrimitiveSizeInBits() != 128) {
+      LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
+                        << "Expanding\n");
+      return nullptr;
+    }
+  }
+
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+  Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder);
   if (!BasePtr)
     return nullptr;
 
   unsigned Scale;
   int GEPElemSize =
       BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
-  int ResultElemSize = Ty->getScalarSizeInBits();
+  int MemoryElemSize = OriginalTy->getScalarSizeInBits();
   // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
   // 8bit, 16bit or 32bit load scaled by 1
-  if (GEPElemSize == 32 && ResultElemSize == 32) {
+  if (GEPElemSize == 32 && MemoryElemSize == 32) {
     Scale = 2;
-  } else if (GEPElemSize == 16 && ResultElemSize == 16) {
+  } else if (GEPElemSize == 16 && MemoryElemSize == 16) {
     Scale = 1;
   } else if (GEPElemSize == 8) {
     Scale = 0;
@@ -258,20 +296,21 @@
                       << " create masked gather\n");
     return nullptr;
   }
+  Root = Extend;
 
   Value *Mask = I->getArgOperand(2);
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
-        {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
-        {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+        {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
   else
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset,
-        {Ty, BasePtr->getType(), Offsets->getType()},
-        {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Builder.getInt32(1)});
+        {ResultTy, BasePtr->getType(), Offsets->getType()},
+        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
@@ -4,38 +4,9 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[0], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -48,38 +19,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[0], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -122,38 +64,9 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[0], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmovlb.u8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -166,38 +79,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    .save {r4, r5, r7, lr}
-; CHECK-NEXT:    push {r4, r5, r7, lr}
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r5, s0
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov r1, s5
-; CHECK-NEXT:    vmov r4, s7
-; CHECK-NEXT:    ldrb.w r12, [r2]
-; CHECK-NEXT:    vmov r2, s2
-; CHECK-NEXT:    ldrb.w lr, [r3]
-; CHECK-NEXT:    vmov r3, s6
-; CHECK-NEXT:    ldrb r5, [r5]
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.16 q0[0], r5
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.16 q0[1], lr
-; CHECK-NEXT:    ldrb r4, [r4]
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.16 q0[2], r2
-; CHECK-NEXT:    vmov.16 q0[3], r12
-; CHECK-NEXT:    vmov.16 q0[4], r0
-; CHECK-NEXT:    vmov.16 q0[5], r1
-; CHECK-NEXT:    vmov.16 q0[6], r3
-; CHECK-NEXT:    vmov.16 q0[7], r4
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    pop {r4, r5, r7, pc}
+; CHECK-NEXT:    vldrb.u16 q1, [r1]
+; CHECK-NEXT:    vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT:    bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@@ -4,22 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_scaled_i16_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_scaled_i16_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -147,22 +119,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_scaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -176,22 +134,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_scaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -205,22 +149,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -234,22 +164,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -321,22 +237,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_scaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -350,22 +252,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_scaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -379,22 +267,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -408,22 +282,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vshl.i32 q0, q0, #1
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -4,22 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -60,21 +32,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i16_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -88,21 +47,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i16_i32:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrw.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrw.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -204,21 +150,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -233,21 +166,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -262,21 +182,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -291,21 +198,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -320,22 +214,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -349,22 +229,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -378,22 +244,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -407,22 +259,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i16:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrh.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -496,21 +334,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -525,21 +350,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -554,21 +366,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.u16 q0, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -583,21 +382,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r0, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r1, s3
-; CHECK-NEXT:    ldrh r2, [r2]
-; CHECK-NEXT:    ldrh r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrh r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r0
-; CHECK-NEXT:    ldrh r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r1
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -612,22 +398,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -641,22 +413,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.s32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.s32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -670,22 +428,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vmov.i32 q1, #0xff
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r2, s0
-; CHECK-NEXT:    vmov r1, s1
-; CHECK-NEXT:    vmov r3, s2
-; CHECK-NEXT:    vmov r0, s3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[0], r2
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[1], r1
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    vmov.32 q0[2], r3
-; CHECK-NEXT:    vmov.32 q0[3], r0
-; CHECK-NEXT:    vand q0, q0, q1
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -699,22 +443,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrb.u32 q0, [r1]
-; CHECK-NEXT:    vadd.i32 q0, q0, r0
-; CHECK-NEXT:    vmov r0, s0
-; CHECK-NEXT:    vmov r3, s1
-; CHECK-NEXT:    vmov r1, s2
-; CHECK-NEXT:    vmov r2, s3
-; CHECK-NEXT:    ldrb r0, [r0]
-; CHECK-NEXT:    ldrb r3, [r3]
-; CHECK-NEXT:    vmov.32 q0[0], r0
-; CHECK-NEXT:    ldrb r1, [r1]
-; CHECK-NEXT:    vmov.32 q0[1], r3
-; CHECK-NEXT:    ldrb r2, [r2]
-; CHECK-NEXT:    vmov.32 q0[2], r1
-; CHECK-NEXT:    vmov.32 q0[3], r2
-; CHECK-NEXT:    vmovlb.s8 q0, q0
-; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    vldrb.u32 q1, [r1]
+; CHECK-NEXT:    vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT:    bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
--- a/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -842,6 +842,53 @@
   ret <4 x i32> %gather
 }
 
+define arm_aapcs_vfpcc <8 x i32> @sext_unsigned_unscaled_i8_i8_toi64(i8* %base, <8 x i8>* %offptr) {
+; CHECK-LABEL: sext_unsigned_unscaled_i8_i8_toi64:
+; CHECK:       @ %bb.0: @ %entry
+; CHECK-NEXT:    .save {r4, r5, r7, lr}
+; CHECK-NEXT:    push {r4, r5, r7, lr}
+; CHECK-NEXT:    vldrb.u32 q0, [r1]
+; CHECK-NEXT:    vldrb.u32 q1, [r1, #4]
+; CHECK-NEXT:    vadd.i32 q0, q0, r0
+; CHECK-NEXT:    vadd.i32 q1, q1, r0
+; CHECK-NEXT:    vmov r2, s2
+; CHECK-NEXT:    vmov r3, s3
+; CHECK-NEXT:    vmov r0, s4
+; CHECK-NEXT:    vmov r1, s5
+; CHECK-NEXT:    vmov r5, s1
+; CHECK-NEXT:    vmov r4, s7
+; CHECK-NEXT:    ldrb.w r12, [r2]
+; CHECK-NEXT:    vmov r2, s0
+; CHECK-NEXT:    ldrb.w lr, [r3]
+; CHECK-NEXT:    vmov r3, s6
+; CHECK-NEXT:    ldrb r0, [r0]
+; CHECK-NEXT:    ldrb r1, [r1]
+; CHECK-NEXT:    ldrb r5, [r5]
+; CHECK-NEXT:    vmov.32 q1[0], r0
+; CHECK-NEXT:    ldrb r4, [r4]
+; CHECK-NEXT:    vmov.32 q1[1], r1
+; CHECK-NEXT:    ldrb r2, [r2]
+; CHECK-NEXT:    ldrb r3, [r3]
+; CHECK-NEXT:    vmov.32 q0[0], r2
+; CHECK-NEXT:    vmov.32 q0[1], r5
+; CHECK-NEXT:    vmov.32 q1[2], r3
+; CHECK-NEXT:    vmov.32 q0[2], r12
+; CHECK-NEXT:    vmov.32 q1[3], r4
+; CHECK-NEXT:    vmov.32 q0[3], lr
+; CHECK-NEXT:    vmovlb.s8 q1, q1
+; CHECK-NEXT:    vmovlb.s8 q0, q0
+; CHECK-NEXT:    vmovlb.s16 q1, q1
+; CHECK-NEXT:    vmovlb.s16 q0, q0
+; CHECK-NEXT:    pop {r4, r5, r7, pc}
+entry:
+  %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
+  %offs.zext = zext <8 x i8> %offs to <8 x i32>
+  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
+  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
+  %gather.sext = sext <8 x i8> %gather to <8 x i32>
+  ret <8 x i32> %gather.sext
+}
+
 declare <2 x i32> @llvm.masked.gather.v2i32.v2p0i32(<2 x i32*>, i32, <2 x i1>, <2 x i32>)
 declare <4 x i32> @llvm.masked.gather.v4i32.v4p0i32(<4 x i32*>, i32, <4 x i1>, <4 x i32>)
 declare <8 x i32> @llvm.masked.gather.v8i32.v8p0i32(<8 x i32*>, i32, <8 x i1>, <8 x i32>)