Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -83,7 +83,7 @@
   bool lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
-  Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
+  Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr, Instruction *&Root,
                                      IRBuilder<> Builder);
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
@@ -104,9 +104,9 @@
 bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
                                                        unsigned ElemSize,
                                                        unsigned Alignment) {
-  // Do only allow non-extending gathers for now
-  if (((NumElements == 4 && ElemSize == 32) ||
-       (NumElements == 8 && ElemSize == 16) ||
+  if (((NumElements == 4 &&
+        (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+       (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
        (NumElements == 16 && ElemSize == 8)) &&
       ElemSize / 8 <= Alignment)
     return true;
@@ -126,9 +126,9 @@
                     << " from base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
   if (GEPPtr->getType()->isVectorTy()) {
-    LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
-                      << " hidden behind a getelementptr currently not"
-                      << " supported. Expanding.\n");
+    //LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
+    //                  << " hidden behind a getelementptr currently not"
+    //                  << " supported. Expanding.\n");
     return nullptr;
   }
   if (GEP->getNumOperands() != 2) {
@@ -194,7 +194,10 @@
   IRBuilder<> Builder(I->getContext());
   Builder.SetInsertPoint(I);
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
-  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+
+  Instruction *Root = I;
+
+  Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
   if (!Load)
@@ -206,18 +209,24 @@
     Load = Builder.CreateSelect(Mask, Load, PassThru);
   }
 
+  Root->replaceAllUsesWith(Load);
+  Root->eraseFromParent();
+  if (Root != I)
+    // If this was an extending gather, we need to get rid of the sext/zext
+    // as well as of the gather itself
+    I->eraseFromParent();
   LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
-  I->replaceAllUsesWith(Load);
-  I->eraseFromParent();
   return true;
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
     IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
   using namespace PatternMatch;
-  LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+
   Type *Ty = I->getType();
-  if (Ty->getVectorNumElements() != 4)
+
+  LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
+  if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     // Can't build an intrinsic for this
     return nullptr;
   Value *Mask = I->getArgOperand(2);
@@ -233,18 +242,57 @@
 }
 
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
-    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+    IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> Builder) {
   using namespace PatternMatch;
-  Type *Ty = I->getType();
+
+  Type *OriginalTy = I->getType();
+  Type *ResultTy = OriginalTy;
+
+  LLVMContext &C = I->getContext();
+  unsigned Unsigned = 1;
+  // If this is an extending gather, a SExt or ZExt must be following
+  if (OriginalTy->getScalarSizeInBits() * OriginalTy->getVectorNumElements() <
+      128) {
+    bool FoundExtend = false;
+    if (OriginalTy->getVectorNumElements() == 4)
+      ResultTy = VectorType::get(Type::getInt32Ty(C), 4);
+    else if (OriginalTy->getVectorNumElements() == 8)
+      ResultTy = VectorType::get(Type::getInt16Ty(C), 8);
+    // The correct root to replace is not the CallInst itself, but the
+    // instruction which extends it
+    Instruction *Parent = nullptr;
+    for (User *u : I->users()) {
+      // Only do this to gathers with exactly one use
+      if (Parent || !(Parent = dyn_cast<Instruction>(u)))
+        return nullptr;
+    }
+    if (Parent) {
+      LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+      if (isa<ZExtInst>(Parent)) {
+        Root = Parent;
+        FoundExtend = true;
+      } else if (isa<SExtInst>(Parent)) {
+        Root = Parent;
+        Unsigned = 0;
+        FoundExtend = true;
+      }
+    }
+    if (!FoundExtend) {
+      LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
+                        << "Expanding\n");
+      return nullptr;
+    }
+  }
+
   Value *Offsets;
-  Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+  Value *BasePtr = checkGEP(Offsets, ResultTy, Ptr, Builder);
   if (!BasePtr)
     return nullptr;
   unsigned Scale;
   int GEPElemSize =
       BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
-  int ResultElemSize = Ty->getScalarSizeInBits();
+  int ResultElemSize = OriginalTy->getScalarSizeInBits();
   // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
   // 8bit, 16bit or 32bit load scaled by 1
   if (GEPElemSize == 32 && ResultElemSize == 32) {
@@ -254,7 +302,7 @@
   } else if (GEPElemSize == 8) {
     Scale = 0;
   } else {
-    LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
+    LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't "
                       << "create masked gather\n");
     return nullptr;
   }
@@ -263,15 +311,15 @@
   if (!match(Mask, m_One()))
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset_predicated,
-        {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
-        {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+        {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
   else
     return Builder.CreateIntrinsic(
         Intrinsic::arm_mve_vldr_gather_offset,
-        {Ty, BasePtr->getType(), Offsets->getType()},
-        {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-         Builder.getInt32(Scale), Builder.getInt32(1)});
+        {ResultTy, BasePtr->getType(), Offsets->getType()},
+        {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
@@ -295,7 +343,7 @@
     return false;
 
   for (IntrinsicInst *I : Gathers)
-    LowerGather(I);
+    lowerGather(I);
 
   return true;
 }
Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll
@@ -4,38 +4,9 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -48,38 +19,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -122,38 +64,9 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -166,38 +79,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-scaled.ll
@@ -4,22 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_scaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_scaled_i16_i32(i16* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_scaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -147,22 +119,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_scaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -176,22 +134,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_scaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -205,22 +149,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_scaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -234,22 +164,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i16(i16* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_scaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -321,22 +237,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_scaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -350,22 +252,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_scaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -379,22 +267,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_scaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -408,22 +282,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_scaled_i16_i8(i16* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_scaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vshl.i32 q0, q0, #1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1, uxtw #1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -4,22 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -60,21 +32,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -88,21 +47,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -204,21 +150,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -233,21 +166,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -262,21 +182,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -291,21 +198,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -320,22 +214,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -349,22 +229,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -378,22 +244,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -407,22 +259,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -496,21 +334,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -525,21 +350,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -554,21 +366,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -583,21 +382,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -612,22 +398,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -641,22 +413,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -670,22 +428,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -699,22 +443,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1