Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -84,9 +84,9 @@
 static bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
                                     unsigned Alignment) {
-  // Do only allow non-extending gathers for now
-  return ((NumElements == 4 && ElemSize == 32) ||
-          (NumElements == 8 && ElemSize == 16) ||
+  return ((NumElements == 4 &&
+           (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+          (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
           (NumElements == 16 && ElemSize == 8)) &&
          ElemSize / 8 <= Alignment;
 }

@@ -98,15 +98,17 @@
   // @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
   // Attempt to turn the masked gather in I into a MVE intrinsic
   // Potentially optimising the addressing modes as we do so.
-  Type *Ty = I->getType();
+  Type *OriginalTy = I->getType();
+  Type *ResultTy = OriginalTy;
   Value *Ptr = I->getArgOperand(0);
   unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue();
   Value *Mask = I->getArgOperand(2);
   Value *PassThru = I->getArgOperand(3);
+  Instruction *Root = I;
   // Check this is a valid gather with correct alignment
-  if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
-                               Ty->getScalarSizeInBits(), Alignment)) {
+  if (!isLegalTypeAndAlignment(OriginalTy->getVectorNumElements(),
+                               OriginalTy->getScalarSizeInBits(), Alignment)) {
     LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
                       << "alignment or vector type \n");
     return false;
   }
@@ -128,10 +130,46 @@
   }
   assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
 
+  // If this is an extending gather, a SExt or ZExt must be following
+  LLVMContext &C = I->getContext();
+  unsigned Unsigned = 1;
+  if (OriginalTy->getScalarSizeInBits() * OriginalTy->getVectorNumElements() <
+      128) {
+    bool FoundExtend = false;
+    if (OriginalTy->getVectorNumElements() == 4)
+      ResultTy = VectorType::get(OriginalTy->getInt32Ty(C), 4);
+    else if (OriginalTy->getVectorNumElements() == 8)
+      ResultTy = VectorType::get(OriginalTy->getInt16Ty(C), 8);
+    // The correct root to replace is not the CallInst itself, but the
+    // instruction which extends it
+    Instruction *Parent = nullptr;
+    for (User *u : I->users()) {
+      // Only do this to gathers with exactly one use
+      if (Parent || !(Parent = dyn_cast<Instruction>(u)))
+        return false;
+    }
+    if (Parent) {
+      LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+      if (isa<ZExtInst>(Parent)) {
+        Root = Parent;
+        FoundExtend = true;
+      } else if (isa<SExtInst>(Parent)) {
+        Root = Parent;
+        Unsigned = 0;
+        FoundExtend = true;
+      }
+    }
+    if (!FoundExtend) {
+      LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
                        << "Expanding\n");
+      return false;
+    }
+  }
+
   Value *BasePtr = Ptr;
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP) {
-    if (Ty->getVectorNumElements() != 4)
+    if (ResultTy->getVectorNumElements() != 4)
       // Can't build an intrinsic for this
       return false;
@@ -139,12 +177,12 @@
                       << " from vector of pointers\n");
     if (match(Mask, m_One()))
       Load = Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
-                                     {Ty, Ptr->getType()},
+                                     {ResultTy, Ptr->getType()},
                                      {Ptr, Builder.getInt32(0)});
     else
       Load = Builder.CreateIntrinsic(
           Intrinsic::arm_mve_vldr_gather_base_predicated,
-          {Ty, Ptr->getType(), Mask->getType()},
+          {ResultTy, Ptr->getType(), Mask->getType()},
          {Ptr, Builder.getInt32(0), Mask});
   } else {
     LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
@@ -156,7 +194,7 @@
     // we therefore can't fold them
     if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
       Offsets = ZextOffs->getOperand(0);
-    Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
+    Type *OffsType = VectorType::getInteger(cast<VectorType>(ResultTy));
     // If the offset we found does not have the type the intrinsic expects,
     // i.e., the same type as the gather itself, we need to convert it (only i
     // types) or fall back to expanding the gather
@@ -189,7 +227,7 @@
     unsigned Scale;
     int GEPElemSize =
         GEP->getResultElementType()->getPrimitiveSizeInBits();
-    int ResultElemSize = Ty->getScalarSizeInBits();
+    int ResultElemSize = ResultTy->getScalarSizeInBits();
     // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
     // 8bit, 16bit or 32bit load scaled by 1
     if (GEPElemSize == 32 && ResultElemSize == 32) {
@@ -207,15 +245,15 @@
     if (!match(Mask, m_One()))
       Load = Builder.CreateIntrinsic(
          Intrinsic::arm_mve_vldr_gather_offset_predicated,
-          {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
-          {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-           Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+          {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+          {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+           Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
     else
       Load = Builder.CreateIntrinsic(
          Intrinsic::arm_mve_vldr_gather_offset,
-          {Ty, BasePtr->getType(), Offsets->getType()},
-          {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
-           Builder.getInt32(Scale), Builder.getInt32(1)});
+          {ResultTy, BasePtr->getType(), Offsets->getType()},
+          {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+           Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
   }
 
   if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
@@ -223,10 +261,13 @@
                       << "creating select\n");
     Load = Builder.CreateSelect(Mask, Load, PassThru);
   }
-
+  Root->replaceAllUsesWith(Load);
+  Root->eraseFromParent();
+  if (Root != I)
+    // If this was an extending gather, also get rid of the gather instruction
+    // itself, to avoid reevaluation
+    I->eraseFromParent();
   LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
-  I->replaceAllUsesWith(Load);
-  I->eraseFromParent();
   return true;
 }
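For reference, the IR shape this change targets is a masked gather whose result is narrower than 128 bits and whose single user is a sext or zext, as exercised by the tests below. A minimal sketch of such an input follows; the function and value names here are illustrative, not copied from the tests.

; An <8 x i8> gather whose only user is a zext to <8 x i16>. The pass now
; roots the rewrite at the zext and emits the arm_mve_vldr_gather_offset
; intrinsic with the "unsigned" operand set to 1 (0 for a sext user),
; instead of falling back to scalarising the gather.
define arm_aapcs_vfpcc <8 x i16> @extending_gather_sketch(i8* %base, <8 x i16>* %offptr) {
entry:
  %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
  %offs.zext = zext <8 x i16> %offs to <8 x i32>
  %ptrs = getelementptr inbounds i8, i8* %base, <8 x i32> %offs.zext
  %gather = call <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*> %ptrs, i32 1, <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i8> undef)
  %ext = zext <8 x i8> %gather to <8 x i16>
  ret <8 x i16> %ext
}
declare <8 x i8> @llvm.masked.gather.v8i8.v8p0i8(<8 x i8*>, i32, <8 x i1>, <8 x i8>)

With this patch such a function should lower to the two-instruction sequence checked in zext_unscaled_i8_i16 below: a vldrh.u16 to load the offsets, then an extending vldrb.u16 gather.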
Loading" @@ -156,7 +194,7 @@ // we therefore can't fold them if (ZExtInst *ZextOffs = dyn_cast(Offsets)) Offsets = ZextOffs->getOperand(0); - Type *OffsType = VectorType::getInteger(cast(Ty)); + Type *OffsType = VectorType::getInteger(cast(ResultTy)); // If the offset we found does not have the type the intrinsic expects, // i.e., the same type as the gather itself, we need to convert it (only i // types) or fall back to expanding the gather @@ -189,7 +227,7 @@ unsigned Scale; int GEPElemSize = GEP->getResultElementType()->getPrimitiveSizeInBits(); - int ResultElemSize = Ty->getScalarSizeInBits(); + int ResultElemSize = ResultTy->getScalarSizeInBits(); // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a // 8bit, 16bit or 32bit load scaled by 1 if (GEPElemSize == 32 && ResultElemSize == 32) { @@ -207,15 +245,15 @@ if (!match(Mask, m_One())) Load = Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_offset_predicated, - {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()}, - {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), - Builder.getInt32(Scale), Builder.getInt32(1), Mask}); + {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()}, + {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask}); else Load = Builder.CreateIntrinsic( Intrinsic::arm_mve_vldr_gather_offset, - {Ty, BasePtr->getType(), Offsets->getType()}, - {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()), - Builder.getInt32(Scale), Builder.getInt32(1)}); + {ResultTy, BasePtr->getType(), Offsets->getType()}, + {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()), + Builder.getInt32(Scale), Builder.getInt32(Unsigned)}); } if (!isa(PassThru) && !match(PassThru, m_Zero())) { @@ -223,10 +261,13 @@ << "creating select\n"); Load = Builder.CreateSelect(Mask, Load, PassThru); } - + Root->replaceAllUsesWith(Load); + Root->eraseFromParent(); + if (Root != I) + // If this was an extending gather, also get rid of the gather instruction + // itself, to avoid reevaluation + I->eraseFromParent(); LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n"); - I->replaceAllUsesWith(Load); - I->eraseFromParent(); return true; } Index: llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-gather-ind16-unscaled.ll @@ -4,38 +4,9 @@ define arm_aapcs_vfpcc <8 x i16> @zext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) { ; CHECK-LABEL: zext_unscaled_i8_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r7, lr} -; CHECK-NEXT: push {r4, r5, r7, lr} -; CHECK-NEXT: vldrh.u32 q0, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r2, s3 -; CHECK-NEXT: vmov r3, s1 -; CHECK-NEXT: vmov r5, s0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: vmov r4, s7 -; CHECK-NEXT: ldrb.w r12, [r2] -; CHECK-NEXT: vmov r2, s2 -; CHECK-NEXT: ldrb.w lr, [r3] -; CHECK-NEXT: vmov r3, s6 -; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: ldrb r0, [r0] -; CHECK-NEXT: vmov.16 q0[0], r5 -; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.16 q0[1], lr -; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: ldrb r2, [r2] -; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.16 q0[2], r2 -; CHECK-NEXT: vmov.16 q0[3], r12 -; CHECK-NEXT: vmov.16 q0[4], r0 -; CHECK-NEXT: 
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -48,38 +19,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrh.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i16>, <8 x i16>* %offptr, align 2
   %offs.zext = zext <8 x i16> %offs to <8 x i32>
@@ -122,38 +64,9 @@
 define arm_aapcs_vfpcc <8 x i16> @zext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.u8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.u16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
@@ -166,38 +79,9 @@
 define arm_aapcs_vfpcc <8 x i16> @sext_unsigned_unscaled_i8_i8(i8* %base, <8 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r7, lr}
-; CHECK-NEXT: push {r4, r5, r7, lr}
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vldrb.u32 q1, [r1, #4]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r5, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov r4, s7
-; CHECK-NEXT: ldrb.w r12, [r2]
-; CHECK-NEXT: vmov r2, s2
-; CHECK-NEXT: ldrb.w lr, [r3]
-; CHECK-NEXT: vmov r3, s6
-; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.16 q0[0], r5
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.16 q0[1], lr
-; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov.16 q0[3], r12
-; CHECK-NEXT: vmov.16 q0[4], r0
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov.16 q0[6], r3
-; CHECK-NEXT: vmov.16 q0[7], r4
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: pop {r4, r5, r7, pc}
+; CHECK-NEXT: vldrb.u16 q1, [r1]
+; CHECK-NEXT: vldrb.s16 q0, [r0, q1]
+; CHECK-NEXT: bx lr
 entry:
   %offs = load <8 x i8>, <8 x i8>* %offptr, align 1
   %offs.zext = zext <8 x i8> %offs to <8 x i32>
Index: llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ind32-unscaled.ll
@@ -4,22 +4,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i8_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -32,22 +18,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i8_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i8_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -60,21 +32,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: zext_unscaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -88,21 +47,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unscaled_i16_i32(i8* %base, <4 x i32>* %offptr) {
 ; CHECK-LABEL: sext_unscaled_i16_i32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i32>, <4 x i32>* %offptr, align 4
@@ -204,21 +150,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -233,21 +166,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -262,21 +182,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -291,21 +198,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -320,22 +214,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -349,22 +229,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -378,22 +244,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -407,22 +259,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i16(i8* %base, <4 x i16>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i16:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrh.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16>, <4 x i16>* %offptr, align 2
@@ -496,21 +334,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -525,21 +350,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -554,21 +366,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -583,21 +382,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i16_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i16_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrh.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -612,22 +398,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_signed_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -641,22 +413,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_signed_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_signed_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.s32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -670,22 +428,8 @@
 define arm_aapcs_vfpcc <4 x i32> @zext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: zext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
@@ -699,22 +443,8 @@
 define arm_aapcs_vfpcc <4 x i32> @sext_unsigned_unscaled_i8_i8(i8* %base, <4 x i8>* %offptr) {
 ; CHECK-LABEL: sext_unsigned_unscaled_i8_i8:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u32 q0, [r1]
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q0, [r0, q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8>, <4 x i8>* %offptr, align 1
Index: llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
+++ llvm/test/CodeGen/Thumb2/mve-gather-ptrs.ll
@@ -276,20 +276,8 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_sext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_sext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -301,20 +289,8 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i16_zext(<4 x i16*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i16_zext:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r0, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: ldrh r2, [r2]
-; CHECK-NEXT: ldrh r0, [r0]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrh r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r0
-; CHECK-NEXT: ldrh r1, [r1]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i16*>, <4 x i16*>* %offptr, align 4
@@ -600,21 +576,8 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_sext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_sext32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov r0, s0
-; CHECK-NEXT: vmov r3, s1
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: vmov r2, s3
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[0], r0
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[1], r3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: vmov.32 q0[2], r1
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmovlb.s8 q0, q0
-; CHECK-NEXT: vmovlb.s16 q0, q0
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4
@@ -626,21 +589,8 @@
 define arm_aapcs_vfpcc <4 x i32> @ptr_v4i8_zext32(<4 x i8*>* %offptr) {
 ; CHECK-LABEL: ptr_v4i8_zext32:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q0, [r0]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vmov r2, s0
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: vmov r3, s2
-; CHECK-NEXT: vmov r0, s3
-; CHECK-NEXT: ldrb r2, [r2]
-; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.32 q0[1], r1
-; CHECK-NEXT: ldrb r0, [r0]
-; CHECK-NEXT: vmov.32 q0[2], r3
-; CHECK-NEXT: vmov.32 q0[3], r0
-; CHECK-NEXT: vand q0, q0, q1
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vldrw.u32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
   %offs = load <4 x i8*>, <4 x i8*>* %offptr, align 4