Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.h
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -161,7 +161,7 @@
   bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
 
-  bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+  bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment);
 
   int getMemcpyCost(const Instruction *I);
Index: llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -538,6 +538,18 @@
           (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
 }
 
+bool ARMTTIImpl::isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) {
+  if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
+    return false;
+
+  if (isa<VectorType>(Ty))
+    return false;
+
+  unsigned EltWidth = Ty->getScalarSizeInBits();
+  return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
+          (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+}
+
 int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
   const MemCpyInst *MI = dyn_cast<MemCpyInst>(I);
   assert(MI && "MemcpyInst expected");
Index: llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -80,6 +80,8 @@
   // returning the base directly and the offsets indirectly using the Offsets
   // argument
   Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+  // Compute the scale of this gather/scatter instruction
+  int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
 
   bool lowerGather(IntrinsicInst *I);
   // Create a gather from a base + vector of offsets
@@ -88,6 +90,14 @@
   // Create a gather from a vector of pointers
   Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
                                    IRBuilder<> Builder);
+
+  bool lowerScatter(IntrinsicInst *I);
+  // Create a scatter to a base + vector of offsets
+  Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Ptr,
+                                      IRBuilder<> Builder);
+  // Create a scatter to a vector of pointers
+  Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+                                    IRBuilder<> Builder);
 };
 
 } // end anonymous namespace
@@ -110,8 +120,8 @@
        (NumElements == 16 && ElemSize == 8)) &&
       ElemSize / 8 <= Alignment)
     return true;
-  LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
-                    << "alignment or vector type \n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have "
+                    << "valid alignment or vector type \n");
   return false;
 }
 
@@ -119,17 +129,18 @@
                                           IRBuilder<> Builder) {
   GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
   if (!GEP) {
-    LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+    LLVM_DEBUG(
+        dbgs() << "masked gathers/scatters: no getelementpointer found\n");
     return nullptr;
   }
-  LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
-                    << " from base + vector of offsets\n");
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
+                    << " Looking at intrinsic for base + vector of offsets\n");
   Value *GEPPtr = GEP->getPointerOperand();
   if (GEPPtr->getType()->isVectorTy()) {
     return nullptr;
   }
   if (GEP->getNumOperands() != 2) {
-    LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+    LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
                       << " operands. Expanding.\n");
     return nullptr;
   }
@@ -140,16 +151,16 @@
     Offsets = ZextOffs->getOperand(0);
   Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
   // If the offset we found does not have the type the intrinsic expects,
-  // i.e., the same type as the gather itself, we need to convert it (only i
-  // types) or fall back to expanding the gather
+  // i.e., the same type as the gather (or scatter input) itself, we need to
+  // convert it (only i types) or fall back to expanding the gather
   if (OffsType != Offsets->getType()) {
     if (OffsType->getScalarSizeInBits() >
         Offsets->getType()->getScalarSizeInBits()) {
-      LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
+      LLVM_DEBUG(dbgs() << "masked gathers/scatters: extending offsets\n");
       Offsets = Builder.CreateZExt(Offsets, OffsType, "");
     } else {
-      LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
-                        << " create masked gather\n");
+      LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
+                        << " Can't create intrinsic.\n");
       return nullptr;
     }
   }
@@ -163,12 +174,28 @@
     Type *BCTy = BitCast->getType();
     Type *BCSrcTy = BitCast->getOperand(0)->getType();
     if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
-      LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+      LLVM_DEBUG(
+          dbgs() << "masked gathers/scatters: looking through bitcast\n");
       Ptr = BitCast->getOperand(0);
     }
   }
 }
 
+int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
+                                           unsigned MemoryElemSize) {
+  // This can be a 32bit load/store scaled by 4, a 16bit load/store scaled by 2,
+  // or an 8bit, 16bit or 32bit load/store scaled by 1
+  if (GEPElemSize == 32 && MemoryElemSize == 32)
+    return 2;
+  else if (GEPElemSize == 16 && MemoryElemSize == 16)
+    return 1;
+  else if (GEPElemSize == 8)
+    return 0;
+  LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't "
+                    << "create intrinsic\n");
+  return -1;
+}
+
 bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
   using namespace PatternMatch;
   LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
@@ -193,7 +220,6 @@
   Builder.SetCurrentDebugLocation(I->getDebugLoc());
 
   Instruction *Root = I;
-
   Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
   if (!Load)
     Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
@@ -219,9 +245,7 @@
 Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
     IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
   using namespace PatternMatch;
-
   Type *Ty = I->getType();
-
   LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
   if (Ty->getVectorNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
     // Can't build an intrinsic for this
@@ -265,7 +289,7 @@
     return nullptr;
   }
   LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
-  ResultTy = Extend->getType();
+  ResultTy = Root->getType();
   // The final size of the gather must be a full vector width
   if (ResultTy->getPrimitiveSizeInBits() != 128) {
     LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
@@ -279,23 +303,11 @@
   if (!BasePtr)
     return nullptr;
 
-  unsigned Scale;
-  int GEPElemSize =
-      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
-  int MemoryElemSize = OriginalTy->getScalarSizeInBits();
-  // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
-  // 8bit, 16bit or 32bit load scaled by 1
-  if (GEPElemSize == 32 && MemoryElemSize == 32) {
-    Scale = 2;
-  } else if (GEPElemSize == 16 && MemoryElemSize == 16) {
-    Scale = 1;
-  } else if (GEPElemSize == 8) {
-    Scale = 0;
-  } else {
-    LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
-                      << " create masked gather\n");
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      OriginalTy->getScalarSizeInBits());
+  if (Scale == -1)
     return nullptr;
-  }
 
   Root = Extend;
   Value *Mask = I->getArgOperand(2);
@@ -313,6 +325,117 @@
         Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
 }
 
+bool MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+  using namespace PatternMatch;
+  LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+  // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+  // Attempt to turn the masked scatter in I into an MVE intrinsic,
+  // potentially optimising the addressing modes as we do so.
+  Value *Input = I->getArgOperand(0);
+  Value *Ptr = I->getArgOperand(1);
+  unsigned Alignment = cast<ConstantInt>(I->getArgOperand(2))->getZExtValue();
+  Type *Ty = Input->getType();
+
+  if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
+                               Ty->getScalarSizeInBits(), Alignment))
+    return false;
+  lookThroughBitcast(Ptr);
+  assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+  IRBuilder<> Builder(I->getContext());
+  Builder.SetInsertPoint(I);
+  Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+  Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+  if (!Store)
+    Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+  if (!Store)
+    return false;
+
+  LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+  I->replaceAllUsesWith(Store);
+  I->eraseFromParent();
+  return true;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *Ty = Input->getType();
+  // Only QR variants allow truncating
+  if (!(Ty->getVectorNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+    // Can't build an intrinsic for this
+    return nullptr;
+  }
+  // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+  LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+  if (match(Mask, m_One()))
+    return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+                                   {Ptr->getType(), Input->getType()},
+                                   {Ptr, Builder.getInt32(0), Input});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_base_predicated,
+        {Ptr->getType(), Input->getType(), Mask->getType()},
+        {Ptr, Builder.getInt32(0), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+    IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+  using namespace PatternMatch;
+  Value *Input = I->getArgOperand(0);
+  Value *Mask = I->getArgOperand(3);
+  Type *InputTy = Input->getType();
+  Type *MemoryTy = InputTy;
+  LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+                    << " to base + vector of offsets\n");
+  // If the input has been truncated, try to integrate that trunc into the
+  // scatter instruction (we don't care about alignment here)
+  if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+    Value *PreTrunc = Trunc->getOperand(0);
+    Type *PreTruncTy = PreTrunc->getType();
+    if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+      Input = PreTrunc;
+      InputTy = PreTruncTy;
+    }
+  }
+  if (InputTy->getPrimitiveSizeInBits() != 128) {
+    LLVM_DEBUG(
+        dbgs() << "masked scatters: cannot create scatters for non-standard"
+               << " input types. Expanding.\n");
+    return nullptr;
+  }
+
+  Value *Offsets;
+  Value *BasePtr = checkGEP(Offsets, InputTy, Ptr, Builder);
+  if (!BasePtr)
+    return nullptr;
+  int Scale = computeScale(
+      BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+      MemoryTy->getScalarSizeInBits());
+  if (Scale == -1)
+    return nullptr;
+
+  if (!match(Mask, m_One()))
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+        {BasePtr->getType(), Offsets->getType(), Input->getType(),
+         Mask->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale), Mask});
+  else
+    return Builder.CreateIntrinsic(
+        Intrinsic::arm_mve_vstr_scatter_offset,
+        {BasePtr->getType(), Offsets->getType(), Input->getType()},
+        {BasePtr, Offsets, Input,
+         Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+         Builder.getInt32(Scale)});
+}
+
 bool MVEGatherScatterLowering::runOnFunction(Function &F) {
   if (!EnableMaskedGatherScatters)
     return false;
@@ -322,19 +445,24 @@
   if (!ST->hasMVEIntegerOps())
     return false;
   SmallVector<IntrinsicInst *, 4> Gathers;
+  SmallVector<IntrinsicInst *, 4> Scatters;
   for (BasicBlock &BB : F) {
     for (Instruction &I : BB) {
      IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
      if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
        Gathers.push_back(II);
+      else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter)
+        Scatters.push_back(II);
    }
  }
 
-  if (Gathers.empty())
+  if (Gathers.empty() && Scatters.empty())
    return false;
 
  for (IntrinsicInst *I : Gathers)
    lowerGather(I);
+  for (IntrinsicInst *I : Scatters)
+    lowerScatter(I);
 
  return true;
 }
Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
+++ llvm/test/CodeGen/Thumb2/mve-scatter-ind16-scaled.ll
@@ -5,36 +5,8 @@
 define arm_aapcs_vfpcc void @scaled_v8i16_i16(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) {
; CHECK-LABEL: scaled_v8i16_i16:
; CHECK:       @ %bb.0: @ %entry
-; CHECK-NEXT:    vldrh.u32 q2, [r1]
-; CHECK-NEXT:    vldrh.u32 q1, [r1, #8]
-; CHECK-NEXT:    vmov.u16 r1, q0[0]
-; CHECK-NEXT:    vshl.i32 q2, q2, #1
-; CHECK-NEXT:    vshl.i32 q1, q1, #1
-; CHECK-NEXT:    vadd.i32 q2, q2, r0
-; CHECK-NEXT:    vadd.i32 q1, q1, r0
-; CHECK-NEXT:    vmov r0, s8
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s9
-; CHECK-NEXT:    vmov.u16 r1, q0[1]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s10
-; CHECK-NEXT:    vmov.u16 r1, q0[2]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s11
-; CHECK-NEXT:    vmov.u16 r1, q0[3]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s4
-; CHECK-NEXT:    vmov.u16 r1, q0[4]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s5
-; CHECK-NEXT:    vmov.u16 r1, q0[5]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s6
-; CHECK-NEXT:    vmov.u16 r1, q0[6]
-; CHECK-NEXT:    strh r1, [r0]
-; CHECK-NEXT:    vmov r0, s7
-; CHECK-NEXT:    vmov.u16 r1, q0[7]
-; CHECK-NEXT:    strh r1, [r0]
+; CHECK-NEXT:    vldrh.u16 q1, [r1]
+; 
CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -48,32 +20,8 @@ define arm_aapcs_vfpcc void @scaled_v8f16_i16(i16* %base, <8 x i16>* %offptr, <8 x half> %input) { ; CHECK-LABEL: scaled_v8f16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vshl.i32 q2, q1, #1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -88,32 +36,8 @@ define arm_aapcs_vfpcc void @scaled_v8f16_half(half* %base, <8 x i16>* %offptr, <8 x half> %input) { ; CHECK-LABEL: scaled_v8f16_half: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vshl.i32 q2, q1, #1 -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -210,36 +134,8 @@ define arm_aapcs_vfpcc void @unsigned_scaled_v8i16_i8(i16* %base, <8 x i8>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: unsigned_scaled_v8i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q2, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, 
[r0] +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -253,32 +149,8 @@ define arm_aapcs_vfpcc void @unsigned_scaled_v8f16_i8(i16* %base, <8 x i8>* %offptr, <8 x half> %input) { ; CHECK-LABEL: unsigned_scaled_v8f16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vshl.i32 q2, q1, #1 -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -292,103 +164,10 @@ define arm_aapcs_vfpcc void @scaled_v8i16_i16_passthru_icmp0(i16* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: scaled_v8i16_i16_passthru_icmp0: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r7, lr} -; CHECK-NEXT: push {r7, lr} -; CHECK-NEXT: .pad #8 -; CHECK-NEXT: sub sp, #8 -; CHECK-NEXT: vldrh.u16 q2, [r1] -; CHECK-NEXT: movs r2, #0 -; CHECK-NEXT: vcmp.s16 gt, q2, zr -; CHECK-NEXT: vmov.u16 r3, q2[4] -; CHECK-NEXT: vmrs r1, p0 -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q2[5] -; CHECK-NEXT: vmov.u16 r12, q2[7] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q2[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q2[0] -; CHECK-NEXT: vmov.32 q3[0], r3 -; CHECK-NEXT: vmov.u16 r3, q2[1] -; CHECK-NEXT: vmov.32 q3[1], r3 -; CHECK-NEXT: vmov.u16 r3, q2[2] -; CHECK-NEXT: vmov.32 q3[2], r3 -; CHECK-NEXT: vmov.u16 r3, q2[3] -; CHECK-NEXT: vmov.32 q3[3], r3 -; CHECK-NEXT: vmov.32 q1[3], r12 -; CHECK-NEXT: vmovlb.u16 q2, q3 -; CHECK-NEXT: vmovlb.u16 q1, q1 -; CHECK-NEXT: vshl.i32 q2, q2, #1 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: and lr, r1, #1 -; CHECK-NEXT: rsb.w r3, lr, #0 -; CHECK-NEXT: bfi r2, r3, #0, #1 -; CHECK-NEXT: ubfx r3, r1, #2, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #1, #1 -; CHECK-NEXT: ubfx r3, r1, #4, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #2, #1 -; CHECK-NEXT: ubfx r3, r1, #6, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #3, #1 -; CHECK-NEXT: ubfx r3, r1, #8, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #4, #1 -; CHECK-NEXT: ubfx r3, r1, #10, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #5, #1 -; CHECK-NEXT: ubfx r3, r1, #12, #1 -; CHECK-NEXT: ubfx r1, r1, #14, #1 -; CHECK-NEXT: rsbs r3, r3, #0 -; CHECK-NEXT: bfi r2, r3, #6, #1 -; CHECK-NEXT: rsbs r1, r1, #0 -; CHECK-NEXT: bfi r2, r1, #7, #1 -; CHECK-NEXT: uxtb r1, r2 -; CHECK-NEXT: lsls r2, r2, #31 -; CHECK-NEXT: ittt ne -; CHECK-NEXT: vmovne r2, s8 -; CHECK-NEXT: vmovne.u16 r3, q0[0] -; CHECK-NEXT: strhne r3, [r2] -; CHECK-NEXT: lsls r2, r1, #30 -; CHECK-NEXT: ittt mi -; 
CHECK-NEXT: vmovmi r2, s9 -; CHECK-NEXT: vmovmi.u16 r3, q0[1] -; CHECK-NEXT: strhmi r3, [r2] -; CHECK-NEXT: lsls r2, r1, #29 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s10 -; CHECK-NEXT: vmovmi.u16 r3, q0[2] -; CHECK-NEXT: strhmi r3, [r2] -; CHECK-NEXT: lsls r2, r1, #28 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi r2, s11 -; CHECK-NEXT: vmovmi.u16 r3, q0[3] -; CHECK-NEXT: strhmi r3, [r2] -; CHECK-NEXT: lsls r0, r1, #27 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi.u16 r0, q0[4] -; CHECK-NEXT: vmovmi r2, s4 -; CHECK-NEXT: strhmi r0, [r2] -; CHECK-NEXT: lsls r0, r1, #26 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi.u16 r0, q0[5] -; CHECK-NEXT: vmovmi r2, s5 -; CHECK-NEXT: strhmi r0, [r2] -; CHECK-NEXT: lsls r0, r1, #25 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi.u16 r0, q0[6] -; CHECK-NEXT: vmovmi r2, s6 -; CHECK-NEXT: strhmi r0, [r2] -; CHECK-NEXT: lsls r0, r1, #24 -; CHECK-NEXT: ittt mi -; CHECK-NEXT: vmovmi.u16 r0, q0[7] -; CHECK-NEXT: vmovmi r1, s7 -; CHECK-NEXT: strhmi r0, [r1] -; CHECK-NEXT: add sp, #8 -; CHECK-NEXT: pop {r7, pc} +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vpt.s16 gt, q1, zr +; CHECK-NEXT: vstrht.16 q0, [r0, q1, uxtw #1] +; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 %offs.zext = zext <8 x i16> %offs to <8 x i32> Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind16-unscaled.ll @@ -5,34 +5,8 @@ define arm_aapcs_vfpcc void @ext_unscaled_i8_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: ext_unscaled_i8_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q2, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -47,34 +21,8 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q2, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: 
vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrb.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -89,34 +37,8 @@ define arm_aapcs_vfpcc void @unscaled_i16_i16(i8* %base, <8 x i16>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: unscaled_i16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q2, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -131,30 +53,8 @@ define arm_aapcs_vfpcc void @unscaled_v8f16_i16(i8* %base, <8 x i16>* %offptr, <8 x half> %input) { ; CHECK-LABEL: unscaled_v8f16_i16: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrh.u32 q2, [r1] -; CHECK-NEXT: vldrh.u32 q1, [r1, #8] -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vldrh.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i16>, <8 x i16>* %offptr, align 2 @@ -327,34 +227,8 @@ define arm_aapcs_vfpcc void @unsigned_unscaled_i16_i8(i8* %base, <8 x i8>* %offptr, <8 x i16> %input) { ; CHECK-LABEL: unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q2, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strh r1, [r0] -; 
CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -369,30 +243,8 @@ define arm_aapcs_vfpcc void @unsigned_unscaled_f16_i8(i8* %base, <8 x i8>* %offptr, <8 x half> %input) { ; CHECK-LABEL: unsigned_unscaled_f16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q2, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #4] -; CHECK-NEXT: vmovx.f16 s12, s0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: vstr.16 s0, [r1] -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: vstr.16 s12, [r1] -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: vstr.16 s1, [r1] -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: vmovx.f16 s8, s1 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vstr.16 s8, [r1] -; CHECK-NEXT: vstr.16 s2, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmovx.f16 s8, s2 -; CHECK-NEXT: vstr.16 s8, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vstr.16 s3, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmovx.f16 s0, s3 -; CHECK-NEXT: vstr.16 s0, [r0] +; CHECK-NEXT: vldrb.u16 q1, [r1] +; CHECK-NEXT: vstrh.16 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -453,37 +305,27 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <8 x i8>* %offptr, <8 x i64> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u32 q5, [r1] -; CHECK-NEXT: vldrb.u32 q4, [r1, #4] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q4[0], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.16 q4[1], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q4[2], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.16 q4[3], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.16 q4[4], r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.16 q4[5], r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov r2, s14 +; CHECK-NEXT: vmov.16 q4[6], r3 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vmov.16 q4[7], r2 +; CHECK-NEXT: vstrh.16 q4, [r0, q0] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 @@ -542,34 +384,24 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <8 x i8>* %offptr, <8 x i32> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vldrb.u32 q3, [r1] -; CHECK-NEXT: vldrb.u32 q2, [r1, #4] 
-; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.16 q2[0], r3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov.16 q2[1], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.16 q2[2], r3 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.16 q2[3], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.16 q2[4], r3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov.16 q2[5], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov r2, s7 +; CHECK-NEXT: vmov.16 q2[6], r3 +; CHECK-NEXT: vldrb.u16 q0, [r1] +; CHECK-NEXT: vmov.16 q2[7], r2 +; CHECK-NEXT: vstrh.16 q2, [r0, q0] ; CHECK-NEXT: bx lr entry: %offs = load <8 x i8>, <8 x i8>* %offptr, align 1 Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind32-scaled.ll @@ -6,20 +6,7 @@ ; CHECK-LABEL: ext_scaled_i16_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -34,20 +21,7 @@ ; CHECK-LABEL: scaled_i32_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -61,16 +35,7 @@ ; CHECK-LABEL: scaled_f32_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -85,20 +50,7 @@ ; CHECK-LABEL: unsigned_scaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmov 
r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -113,20 +65,7 @@ ; CHECK-LABEL: signed_scaled_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -141,16 +80,7 @@ ; CHECK-LABEL: a_unsigned_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -166,16 +96,7 @@ ; CHECK-LABEL: b_signed_scaled_f32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -191,20 +112,7 @@ ; CHECK-LABEL: ext_signed_scaled_i16_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -220,20 +128,7 @@ ; CHECK-LABEL: ext_unsigned_scaled_i16_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -249,20 +144,7 @@ ; 
CHECK-LABEL: unsigned_scaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -277,20 +159,7 @@ ; CHECK-LABEL: signed_scaled_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -305,16 +174,7 @@ ; CHECK-LABEL: a_unsigned_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -330,16 +190,7 @@ ; CHECK-LABEL: b_signed_scaled_f32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vshl.i32 q1, q1, #2 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1, uxtw #2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -355,20 +206,7 @@ ; CHECK-LABEL: ext_signed_scaled_i16_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -384,20 +222,7 @@ ; CHECK-LABEL: ext_unsigned_scaled_i16_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vshl.i32 q1, q1, #1 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1, uxtw #1] 
; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind32-unscaled.ll @@ -6,19 +6,7 @@ ; CHECK-LABEL: ext_unscaled_i8_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -33,19 +21,7 @@ ; CHECK-LABEL: ext_unscaled_i16_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -61,19 +37,7 @@ ; CHECK-LABEL: unscaled_i32_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -88,15 +52,7 @@ ; CHECK-LABEL: unscaled_f32_i32: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i32>, <4 x i32>* %offptr, align 4 @@ -111,19 +67,7 @@ ; CHECK-LABEL: unsigned_unscaled_b_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -139,19 +83,7 @@ ; CHECK-LABEL: signed_unscaled_i32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; 
CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -167,15 +99,7 @@ ; CHECK-LABEL: a_unsigned_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -191,15 +115,7 @@ ; CHECK-LABEL: b_signed_unscaled_f32_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -215,19 +131,7 @@ ; CHECK-LABEL: ext_signed_unscaled_i16_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -244,19 +148,7 @@ ; CHECK-LABEL: ext_unsigned_unscaled_i16_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -273,19 +165,7 @@ ; CHECK-LABEL: ext_signed_unscaled_i8_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -301,19 +181,7 @@ ; CHECK-LABEL: ext_unsigned_unscaled_i8_i16: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrh.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: 
strb r1, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i16>, <4 x i16>* %offptr, align 2 @@ -329,19 +197,7 @@ ; CHECK-LABEL: unsigned_unscaled_b_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -357,19 +213,7 @@ ; CHECK-LABEL: signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -385,15 +229,7 @@ ; CHECK-LABEL: a_unsigned_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -409,15 +245,7 @@ ; CHECK-LABEL: b_signed_unscaled_f32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r3, s4 -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: vmov r2, s5 -; CHECK-NEXT: vstr s0, [r3] -; CHECK-NEXT: vstr s1, [r2] -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: vstr s3, [r0] +; CHECK-NEXT: vstrw.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -433,19 +261,7 @@ ; CHECK-LABEL: ext_signed_unscaled_i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -461,19 +277,7 @@ ; CHECK-LABEL: ext_unsigned_unscaled_i8_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] +; CHECK-NEXT: vstrb.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -487,20 +291,11 @@ 
define arm_aapcs_vfpcc void @trunc_signed_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_signed_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.s32 q2, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0, q2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -515,20 +310,11 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <4 x i8>* %offptr, <4 x i64> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: vmov.f32 s1, s2 ; CHECK-NEXT: vldrb.u32 q2, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: str r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: str r1, [r0] +; CHECK-NEXT: vmov.f32 s2, s4 +; CHECK-NEXT: vmov.f32 s3, s6 +; CHECK-NEXT: vstrw.32 q0, [r0, q2] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -544,19 +330,7 @@ ; CHECK-LABEL: trunc_signed_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.s32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 @@ -572,19 +346,7 @@ ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: vldrb.u32 q1, [r1] -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strh r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strh r1, [r0] +; CHECK-NEXT: vstrh.32 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <4 x i8>, <4 x i8>* %offptr, align 1 Index: llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll +++ llvm/test/CodeGen/Thumb2/mve-scatter-ind8-unscaled.ll @@ -5,65 +5,8 @@ define arm_aapcs_vfpcc void @unscaled_v16i8_i8(i8* %base, <16 x i8>* %offptr, <16 x i8> %input) { ; CHECK-LABEL: unscaled_v16i8_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9} -; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: vldrb.u32 q4, [r1] -; CHECK-NEXT: vldrb.u32 q1, [r1, #12] -; CHECK-NEXT: vldrb.u32 q2, [r1, #8] -; CHECK-NEXT: vldrb.u32 q3, [r1, #4] -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; 
CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u8 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u8 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u8 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u8 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u8 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u8 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u8 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u8 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u8 r1, q0[8] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u8 r1, q0[9] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 -; CHECK-NEXT: vmov.u8 r1, q0[10] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s11 -; CHECK-NEXT: vmov.u8 r1, q0[11] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s4 -; CHECK-NEXT: vmov.u8 r1, q0[12] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s5 -; CHECK-NEXT: vmov.u8 r1, q0[13] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s6 -; CHECK-NEXT: vmov.u8 r1, q0[14] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s7 -; CHECK-NEXT: vmov.u8 r1, q0[15] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9} +; CHECK-NEXT: vldrb.u8 q1, [r1] +; CHECK-NEXT: vstrb.8 q0, [r0, q1] ; CHECK-NEXT: bx lr entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 @@ -435,74 +378,54 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i64_i8(i8* %base, <16 x i8>* %offptr, <16 x i64> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i64_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrb.u32 q7, [r1] -; CHECK-NEXT: vldrb.u32 q4, [r1, #12] -; CHECK-NEXT: vldrb.u32 q5, [r1, #8] -; CHECK-NEXT: vldrb.u32 q6, [r1, #4] -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: vadd.i32 q6, q6, r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: add r1, sp, #64 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: add r1, sp, #80 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: add r1, sp, #96 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, 
[r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: add r1, sp, #112 -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: bx lr +; CHECK-NEXT: .save {r4, lr} +; CHECK-NEXT: push {r4, lr} +; CHECK-NEXT: .vsave {d8, d9, d10, d11} +; CHECK-NEXT: vpush {d8, d9, d10, d11} +; CHECK-NEXT: vmov r4, s0 +; CHECK-NEXT: add r3, sp, #40 +; CHECK-NEXT: vmov.8 q5[0], r4 +; CHECK-NEXT: vmov r4, s2 +; CHECK-NEXT: vmov.8 q5[1], r4 +; CHECK-NEXT: vmov r4, s4 +; CHECK-NEXT: vmov.8 q5[2], r4 +; CHECK-NEXT: vmov r4, s6 +; CHECK-NEXT: vmov.8 q5[3], r4 +; CHECK-NEXT: vmov r4, s8 +; CHECK-NEXT: vmov.8 q5[4], r4 +; CHECK-NEXT: vmov r4, s10 +; CHECK-NEXT: vldrw.u32 q0, [r3] +; CHECK-NEXT: vmov.8 q5[5], r4 +; CHECK-NEXT: vmov r4, s12 +; CHECK-NEXT: add.w lr, sp, #56 +; CHECK-NEXT: vmov.8 q5[6], r4 +; CHECK-NEXT: vmov r4, s14 +; CHECK-NEXT: vmov.8 q5[7], r4 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q5[8], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vldrw.u32 q0, [lr] +; CHECK-NEXT: vmov.8 q5[9], r3 +; CHECK-NEXT: add.w r12, sp, #72 +; CHECK-NEXT: add r2, sp, #88 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vldrw.u32 q4, [r2] +; CHECK-NEXT: vmov.8 q5[10], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vldrw.u32 q0, [r12] +; CHECK-NEXT: vmov.8 q5[11], r3 +; CHECK-NEXT: vmov r2, s18 +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q5[12], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.8 q5[13], r3 +; CHECK-NEXT: vmov r3, s16 +; CHECK-NEXT: vmov.8 q5[14], r3 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vmov.8 q5[15], r2 +; CHECK-NEXT: vstrb.8 q5, [r0, q0] +; CHECK-NEXT: vpop {d8, d9, d10, d11} +; CHECK-NEXT: pop {r4, pc} entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 %offs.zext = zext <16 x i8> %offs to <16 x i32> @@ -516,65 +439,43 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i32_i8(i8* %base, <16 x i8>* %offptr, <16 x i32> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i32_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: vldrb.u32 q7, [r1] -; CHECK-NEXT: vldrb.u32 q4, [r1, #12] -; CHECK-NEXT: vldrb.u32 q5, [r1, #8] -; CHECK-NEXT: vldrb.u32 q6, [r1, #4] -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: vadd.i32 q6, q6, r0 -; CHECK-NEXT: vmov r0, s28 -; CHECK-NEXT: vmov r1, s0 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s29 -; CHECK-NEXT: vmov r1, s1 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s30 -; CHECK-NEXT: vmov r1, s2 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s31 -; CHECK-NEXT: vmov r1, s3 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s24 -; CHECK-NEXT: vmov r1, s4 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s25 -; CHECK-NEXT: vmov r1, s5 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s26 -; CHECK-NEXT: vmov r1, s6 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s27 -; CHECK-NEXT: vmov r1, s7 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov r1, s8 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov r1, s9 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov r1, s10 -; CHECK-NEXT: strb r1, [r0] -; 
CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov r1, s11 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov r1, s12 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov r1, s13 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov r1, s14 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov r1, s15 -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} +; CHECK-NEXT: .vsave {d8, d9} +; CHECK-NEXT: vpush {d8, d9} +; CHECK-NEXT: vmov r3, s0 +; CHECK-NEXT: vmov.8 q4[0], r3 +; CHECK-NEXT: vmov r3, s1 +; CHECK-NEXT: vmov.8 q4[1], r3 +; CHECK-NEXT: vmov r3, s2 +; CHECK-NEXT: vmov.8 q4[2], r3 +; CHECK-NEXT: vmov r3, s3 +; CHECK-NEXT: vmov.8 q4[3], r3 +; CHECK-NEXT: vmov r3, s4 +; CHECK-NEXT: vmov.8 q4[4], r3 +; CHECK-NEXT: vmov r3, s5 +; CHECK-NEXT: vmov.8 q4[5], r3 +; CHECK-NEXT: vmov r3, s6 +; CHECK-NEXT: vmov.8 q4[6], r3 +; CHECK-NEXT: vmov r3, s7 +; CHECK-NEXT: vmov.8 q4[7], r3 +; CHECK-NEXT: vmov r3, s8 +; CHECK-NEXT: vmov.8 q4[8], r3 +; CHECK-NEXT: vmov r3, s9 +; CHECK-NEXT: vmov.8 q4[9], r3 +; CHECK-NEXT: vmov r3, s10 +; CHECK-NEXT: vmov.8 q4[10], r3 +; CHECK-NEXT: vmov r3, s11 +; CHECK-NEXT: vmov.8 q4[11], r3 +; CHECK-NEXT: vmov r3, s12 +; CHECK-NEXT: vmov.8 q4[12], r3 +; CHECK-NEXT: vmov r3, s13 +; CHECK-NEXT: vmov.8 q4[13], r3 +; CHECK-NEXT: vmov r3, s14 +; CHECK-NEXT: vmov r2, s15 +; CHECK-NEXT: vmov.8 q4[14], r3 +; CHECK-NEXT: vldrb.u8 q0, [r1] +; CHECK-NEXT: vmov.8 q4[15], r2 +; CHECK-NEXT: vstrb.8 q4, [r0, q0] +; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: bx lr entry: %offs = load <16 x i8>, <16 x i8>* %offptr, align 1 @@ -589,65 +490,40 @@ define arm_aapcs_vfpcc void @trunc_unsigned_unscaled_i16_i8(i8* %base, <16 x i8>* %offptr, <16 x i16> %input) { ; CHECK-LABEL: trunc_unsigned_unscaled_i16_i8: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .vsave {d8, d9, d10, d11} -; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: vldrb.u32 q5, [r1] -; CHECK-NEXT: vldrb.u32 q2, [r1, #12] -; CHECK-NEXT: vldrb.u32 q3, [r1, #8] -; CHECK-NEXT: vldrb.u32 q4, [r1, #4] -; CHECK-NEXT: vadd.i32 q5, q5, r0 -; CHECK-NEXT: vadd.i32 q2, q2, r0 -; CHECK-NEXT: vadd.i32 q3, q3, r0 -; CHECK-NEXT: vadd.i32 q4, q4, r0 -; CHECK-NEXT: vmov r0, s20 -; CHECK-NEXT: vmov.u16 r1, q0[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s21 -; CHECK-NEXT: vmov.u16 r1, q0[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s22 -; CHECK-NEXT: vmov.u16 r1, q0[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s23 -; CHECK-NEXT: vmov.u16 r1, q0[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s16 -; CHECK-NEXT: vmov.u16 r1, q0[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s17 -; CHECK-NEXT: vmov.u16 r1, q0[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s18 -; CHECK-NEXT: vmov.u16 r1, q0[6] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s19 -; CHECK-NEXT: vmov.u16 r1, q0[7] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s12 -; CHECK-NEXT: vmov.u16 r1, q1[0] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s13 -; CHECK-NEXT: vmov.u16 r1, q1[1] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s14 -; CHECK-NEXT: vmov.u16 r1, q1[2] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s15 -; CHECK-NEXT: vmov.u16 r1, q1[3] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s8 -; CHECK-NEXT: vmov.u16 r1, q1[4] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s9 -; CHECK-NEXT: vmov.u16 r1, q1[5] -; CHECK-NEXT: strb r1, [r0] -; CHECK-NEXT: vmov r0, s10 
-; CHECK-NEXT: vmov.u16 r1, q1[6]
-; CHECK-NEXT: strb r1, [r0]
-; CHECK-NEXT: vmov r0, s11
-; CHECK-NEXT: vmov.u16 r1, q1[7]
-; CHECK-NEXT: strb r1, [r0]
-; CHECK-NEXT: vpop {d8, d9, d10, d11}
+; CHECK-NEXT: vmov.u16 r3, q0[0]
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vmov.8 q2[0], r3
+; CHECK-NEXT: vmov.u16 r3, q0[1]
+; CHECK-NEXT: vmov.8 q2[1], r3
+; CHECK-NEXT: vmov.u16 r3, q0[2]
+; CHECK-NEXT: vmov.8 q2[2], r3
+; CHECK-NEXT: vmov.u16 r3, q0[3]
+; CHECK-NEXT: vmov.8 q2[3], r3
+; CHECK-NEXT: vmov.u16 r3, q0[4]
+; CHECK-NEXT: vmov.8 q2[4], r3
+; CHECK-NEXT: vmov.u16 r3, q0[5]
+; CHECK-NEXT: vmov.8 q2[5], r3
+; CHECK-NEXT: vmov.u16 r3, q0[6]
+; CHECK-NEXT: vmov.8 q2[6], r3
+; CHECK-NEXT: vmov.u16 r3, q0[7]
+; CHECK-NEXT: vmov.8 q2[7], r3
+; CHECK-NEXT: vmov.u16 r3, q1[0]
+; CHECK-NEXT: vmov.8 q2[8], r3
+; CHECK-NEXT: vmov.u16 r3, q1[1]
+; CHECK-NEXT: vmov.8 q2[9], r3
+; CHECK-NEXT: vmov.u16 r3, q1[2]
+; CHECK-NEXT: vmov.8 q2[10], r3
+; CHECK-NEXT: vmov.u16 r3, q1[3]
+; CHECK-NEXT: vmov.8 q2[11], r3
+; CHECK-NEXT: vmov.u16 r3, q1[4]
+; CHECK-NEXT: vmov.8 q2[12], r3
+; CHECK-NEXT: vmov.u16 r3, q1[5]
+; CHECK-NEXT: vmov.8 q2[13], r3
+; CHECK-NEXT: vmov.u16 r3, q1[6]
+; CHECK-NEXT: vmov.8 q2[14], r3
+; CHECK-NEXT: vldrb.u8 q0, [r1]
+; CHECK-NEXT: vmov.8 q2[15], r2
+; CHECK-NEXT: vstrb.8 q2, [r0, q0]
 ; CHECK-NEXT: bx lr
 entry:
 %offs = load <16 x i8>, <16 x i8>* %offptr, align 1
Index: llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
+++ llvm/test/CodeGen/Thumb2/mve-scatter-ptrs.ll
@@ -24,18 +24,7 @@
 ; CHECK-LABEL: ptr_v4i32:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov r1, s0
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
 %offs = load <4 x i32*>, <4 x i32*>* %offptr, align 4
@@ -167,14 +156,7 @@
 ; CHECK-LABEL: ptr_v4f32:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmov r3, s4
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vstr s0, [r3]
-; CHECK-NEXT: vstr s1, [r2]
-; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: vstr s3, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
 %offs = load <4 x float*>, <4 x float*>* %offptr, align 4
@@ -541,61 +523,21 @@
 define void @foo_ptr_p_int32_t(i32* %dest, i32** %src, i32 %n) {
 ; CHECK-LABEL: foo_ptr_p_int32_t:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: bic r3, r2, #15
 ; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: blt .LBB16_3
-; CHECK-NEXT: @ %bb.1: @ %vector.body.preheader
-; CHECK-NEXT: sub.w r12, r1, #16
-; CHECK-NEXT: .LBB16_2: @ %vector.body
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: subs r1, #16
+; CHECK-NEXT: .LBB16_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r12, #16]
-; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q1, [r0]
-; CHECK-NEXT: vmrs r4, p0
-; CHECK-NEXT: add.w r12, r12, #16
-; CHECK-NEXT: adds r0, #16
-; CHECK-NEXT: and r3, r4, #1
-; CHECK-NEXT: ubfx r1, r4, #4, #1
-; CHECK-NEXT: rsb.w lr, r3, #0
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, lr, #0, #1
-; CHECK-NEXT: bfi r3, r1, #1, #1
-; CHECK-NEXT: ubfx r1, r4, #8, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #2, #1
-; CHECK-NEXT: ubfx r1, r4, #12, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #3, #1
-; CHECK-NEXT: lsls r1, r3, #31
-; CHECK-NEXT: ittt ne
-; CHECK-NEXT: vmovne r1, s0
-; CHECK-NEXT: vmovne r4, s4
-; CHECK-NEXT: strne r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #30
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s1
-; CHECK-NEXT: vmovmi r4, s5
-; CHECK-NEXT: strmi r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #29
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s2
-; CHECK-NEXT: vmovmi r4, s6
-; CHECK-NEXT: strmi r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #28
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s3
-; CHECK-NEXT: vmovmi r3, s7
-; CHECK-NEXT: strmi r3, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r1, #16]!
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: bne .LBB16_2
-; CHECK-NEXT: .LBB16_3: @ %for.end
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: vptt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q1, [q0]
+; CHECK-NEXT: bne .LBB16_1
+; CHECK-NEXT: @ %bb.2: @ %for.end
+; CHECK-NEXT: bx lr
 entry:
 %and = and i32 %n, -16
 %cmp11 = icmp sgt i32 %and, 0
@@ -622,61 +564,21 @@
 define void @foo_ptr_p_float(float* %dest, float** %src, i32 %n) {
 ; CHECK-LABEL: foo_ptr_p_float:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, lr}
-; CHECK-NEXT: push {r4, lr}
-; CHECK-NEXT: .pad #4
-; CHECK-NEXT: sub sp, #4
 ; CHECK-NEXT: bic r3, r2, #15
 ; CHECK-NEXT: cmp r3, #1
-; CHECK-NEXT: blt .LBB17_3
-; CHECK-NEXT: @ %bb.1: @ %vector.body.preheader
-; CHECK-NEXT: sub.w r12, r1, #16
-; CHECK-NEXT: .LBB17_2: @ %vector.body
+; CHECK-NEXT: it lt
+; CHECK-NEXT: bxlt lr
+; CHECK-NEXT: subs r1, #16
+; CHECK-NEXT: .LBB17_1: @ %vector.body
 ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [r12, #16]
-; CHECK-NEXT: vpt.i32 ne, q0, zr
-; CHECK-NEXT: vldrwt.u32 q1, [r0]
-; CHECK-NEXT: vmrs r4, p0
-; CHECK-NEXT: add.w r12, r12, #16
-; CHECK-NEXT: adds r0, #16
-; CHECK-NEXT: and r3, r4, #1
-; CHECK-NEXT: ubfx r1, r4, #4, #1
-; CHECK-NEXT: rsb.w lr, r3, #0
-; CHECK-NEXT: movs r3, #0
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, lr, #0, #1
-; CHECK-NEXT: bfi r3, r1, #1, #1
-; CHECK-NEXT: ubfx r1, r4, #8, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #2, #1
-; CHECK-NEXT: ubfx r1, r4, #12, #1
-; CHECK-NEXT: rsbs r1, r1, #0
-; CHECK-NEXT: bfi r3, r1, #3, #1
-; CHECK-NEXT: lsls r1, r3, #31
-; CHECK-NEXT: ittt ne
-; CHECK-NEXT: vmovne r1, s0
-; CHECK-NEXT: vmovne r4, s4
-; CHECK-NEXT: strne r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #30
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s1
-; CHECK-NEXT: vmovmi r4, s5
-; CHECK-NEXT: strmi r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #29
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s2
-; CHECK-NEXT: vmovmi r4, s6
-; CHECK-NEXT: strmi r4, [r1]
-; CHECK-NEXT: lsls r1, r3, #28
-; CHECK-NEXT: ittt mi
-; CHECK-NEXT: vmovmi r1, s3
-; CHECK-NEXT: vmovmi r3, s7
-; CHECK-NEXT: strmi r3, [r1]
+; CHECK-NEXT: vldrw.u32 q0, [r1, #16]!
 ; CHECK-NEXT: subs r2, #4
-; CHECK-NEXT: bne .LBB17_2
-; CHECK-NEXT: .LBB17_3: @ %for.end
-; CHECK-NEXT: add sp, #4
-; CHECK-NEXT: pop {r4, pc}
+; CHECK-NEXT: vptt.i32 ne, q0, zr
+; CHECK-NEXT: vldrwt.u32 q1, [r0], #16
+; CHECK-NEXT: vstrwt.32 q1, [q0]
+; CHECK-NEXT: bne .LBB17_1
+; CHECK-NEXT: @ %bb.2: @ %for.end
+; CHECK-NEXT: bx lr
 entry:
 %and = and i32 %n, -16
 %cmp11 = icmp sgt i32 %and, 0
@@ -706,19 +608,8 @@
 ; CHECK-LABEL: qi4:
 ; CHECK: @ %bb.0: @ %entry
 ; CHECK-NEXT: vmov.i32 q2, #0x10
-; CHECK-NEXT: vmov r1, s0
 ; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vmov r0, s4
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s5
-; CHECK-NEXT: vmov r1, s1
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s6
-; CHECK-NEXT: vmov r1, s2
-; CHECK-NEXT: str r1, [r0]
-; CHECK-NEXT: vmov r0, s7
-; CHECK-NEXT: vmov r1, s3
-; CHECK-NEXT: str r1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [q1]
 ; CHECK-NEXT: bx lr
 entry:
 %g = getelementptr inbounds i32, <4 x i32*> %p, i32 4