Index: llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/trunk/lib/Target/ARM/ARMISelLowering.cpp
@@ -898,9 +898,6 @@
     setTargetDAGCombine(ISD::SHL);
     setTargetDAGCombine(ISD::SRL);
     setTargetDAGCombine(ISD::SRA);
-    setTargetDAGCombine(ISD::SIGN_EXTEND);
-    setTargetDAGCombine(ISD::ZERO_EXTEND);
-    setTargetDAGCombine(ISD::ANY_EXTEND);
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
@@ -922,6 +919,9 @@
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
     setTargetDAGCombine(ISD::STORE);
+    setTargetDAGCombine(ISD::SIGN_EXTEND);
+    setTargetDAGCombine(ISD::ZERO_EXTEND);
+    setTargetDAGCombine(ISD::ANY_EXTEND);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -13694,6 +13694,71 @@
   return SDValue();
 }
 
+// Look for a sign/zero extend of a larger than legal load. This can be split
+// into two extending loads, which are simpler to deal with than an arbitrary
+// sign extend.
+SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
+  SDValue N0 = N->getOperand(0);
+  if (N0.getOpcode() != ISD::LOAD)
+    return SDValue();
+  LoadSDNode *LD = cast<LoadSDNode>(N0.getNode());
+  if (!LD->isSimple() || !N0.hasOneUse() || LD->isIndexed() ||
+      LD->getExtensionType() != ISD::NON_EXTLOAD)
+    return SDValue();
+  EVT FromVT = LD->getValueType(0);
+  EVT ToVT = N->getValueType(0);
+  if (!ToVT.isVector())
+    return SDValue();
+  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+  EVT ToEltVT = ToVT.getVectorElementType();
+  EVT FromEltVT = FromVT.getVectorElementType();
+
+  unsigned NumElements = 0;
+  if (ToEltVT == MVT::i32 && (FromEltVT == MVT::i16 || FromEltVT == MVT::i8))
+    NumElements = 4;
+  if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
+    NumElements = 8;
+  if (NumElements == 0 ||
+      FromVT.getVectorNumElements() == NumElements ||
+      FromVT.getVectorNumElements() % NumElements != 0 ||
+      !isPowerOf2_32(NumElements))
+    return SDValue();
+
+  SDLoc DL(LD);
+  // Details about the old load
+  SDValue Ch = LD->getChain();
+  SDValue BasePtr = LD->getBasePtr();
+  unsigned Alignment = LD->getOriginalAlignment();
+  MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = LD->getAAInfo();
+
+  ISD::LoadExtType NewExtType =
+      N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
+  SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
+  EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
+  unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
+  SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+  // Split the load in half, each side of which is extended separately. This
+  // is good enough, as legalisation will take it from there. They are either
+  // already legal or they will be split further into something that is
+  // legal.
+  SDValue NewLoad1 =
+      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
+                  LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
+  SDValue NewLoad2 =
+      DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+                  LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+                  Alignment, MMOFlags, AAInfo);
+
+  SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+                                 SDValue(NewLoad1.getNode(), 1),
+                                 SDValue(NewLoad2.getNode(), 1));
+  DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
+  return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+}
+
 /// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
 /// ISD::ZERO_EXTEND, and ISD::ANY_EXTEND.
 static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
@@ -13731,6 +13796,10 @@
     }
   }
 
+  if (ST->hasMVEIntegerOps())
+    if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+      return NewLoad;
+
   return SDValue();
 }
 
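For reference, a minimal LLVM IR sketch of the pattern this combine targets (the function name is illustrative; it mirrors the foo_int32_int16_double test updated below). The sign extend of the wide load produces a larger-than-legal <8 x i32>, so the combine splits it into two v4i16 -> v4i32 extending loads, with NewOffset = 64 bits / 8 = 8 bytes, matching the vldrh.s32 loads at [r1] and [r1, #8] in the new checks:

  define void @sext_example(<8 x i32>* %dest, <8 x i16>* %src) {
  entry:
    ; <8 x i32> is 256 bits, twice the 128-bit MVE vector width, so the
    ; sign extend of this load is larger than legal and gets split.
    %wide.load = load <8 x i16>, <8 x i16>* %src, align 2
    %ext = sext <8 x i16> %wide.load to <8 x i32>
    store <8 x i32> %ext, <8 x i32>* %dest, align 4
    ret void
  }
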
Index: llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
===================================================================
--- llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ llvm/trunk/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -134,51 +134,14 @@
 define void @foo_int32_int8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int8_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[8]
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vstrw.32 q1, [r0, #48]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vstrw.32 q1, [r0, #32]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmovlb.s8 q0, q1
-; CHECK-NEXT: vmovlb.s16 q0, q0
-; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vldrb.s32 q0, [r1, #4]
+; CHECK-NEXT: vldrb.s32 q1, [r1]
+; CHECK-NEXT: vldrb.s32 q2, [r1, #12]
+; CHECK-NEXT: vldrb.s32 q3, [r1, #8]
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-NEXT: vstrw.32 q3, [r0, #32]
+; CHECK-NEXT: vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
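As a rough illustration of the recursive case above: the <16 x i8> to <16 x i32> extend is first split by the combine into two v8i8 -> v8i32 extending loads at [r1] and [r1, #8] (NewOffset = 64 bits / 8 = 8 bytes); each v8i32 half is still 256 bits, so legalisation splits again, ending at the four legal v4i8 -> v4i32 widening loads at [r1], [r1, #4], [r1, #8] and [r1, #12] seen in the new checks.
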
@@ -190,42 +153,9 @@
 define void @foo_int16_int8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int16_int8_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.u8 r1, q0[8]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.s8 q1, q1
+; CHECK-NEXT: vldrb.s16 q0, [r1]
+; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
 ; CHECK-NEXT: vstrh.16 q1, [r0, #16]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmovlb.s8 q0, q1
 ; CHECK-NEXT: vstrh.16 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
@@ -238,26 +168,9 @@
 define void @foo_int32_int16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int16_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmovlb.s16 q1, q1
+; CHECK-NEXT: vldrh.s32 q0, [r1]
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmovlb.s16 q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
@@ -311,48 +224,14 @@
 define void @foo_uint32_uint8_double(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint8_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.i32 q1, #0xff
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.32 q2[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: vmov.32 q2[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.32 q2[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[8]
-; CHECK-NEXT: vand q2, q2, q1
+; CHECK-NEXT: vldrb.u32 q0, [r1, #4]
+; CHECK-NEXT: vldrb.u32 q1, [r1]
+; CHECK-NEXT: vldrb.u32 q2, [r1, #12]
+; CHECK-NEXT: vldrb.u32 q3, [r1, #8]
+; CHECK-NEXT: vstrw.32 q1, [r0]
+; CHECK-NEXT: vstrw.32 q0, [r0, #16]
+; CHECK-NEXT: vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT: vstrw.32 q2, [r0, #48]
-; CHECK-NEXT: vmov.32 q2[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.32 q2[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: vmov.32 q2[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vstrw.32 q2, [r0, #32]
-; CHECK-NEXT: vmov.32 q2[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.32 q2[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.32 q2[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vand q2, q2, q1
-; CHECK-NEXT: vstrw.32 q2, [r0, #16]
-; CHECK-NEXT: vmov.32 q2[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.32 q2[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.32 q2[3], r1
-; CHECK-NEXT: vand q0, q2, q1
-; CHECK-NEXT: vstrw.32 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <16 x i8>, <16 x i8>* %src, align 1
@@ -364,42 +243,9 @@
 define void @foo_uint16_uint8_double(<16 x i16>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint16_uint8_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.u8 r1, q0[8]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmov.u8 r1, q0[0]
-; CHECK-NEXT: vmovlb.u8 q1, q1
+; CHECK-NEXT: vldrb.u16 q0, [r1]
+; CHECK-NEXT: vldrb.u16 q1, [r1, #8]
 ; CHECK-NEXT: vstrh.16 q1, [r0, #16]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmovlb.u8 q0, q1
 ; CHECK-NEXT: vstrh.16 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
@@ -412,26 +258,9 @@
 define void @foo_uint32_uint16_double(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1]
-; CHECK-NEXT: vmov.u16 r1, q0[4]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u16 r1, q0[5]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[6]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u16 r1, q0[7]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmov.u16 r1, q0[0]
-; CHECK-NEXT: vmovlb.u16 q1, q1
+; CHECK-NEXT: vldrh.u32 q0, [r1]
+; CHECK-NEXT: vldrh.u32 q1, [r1, #8]
 ; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r1
-; CHECK-NEXT: vmov.u16 r1, q0[1]
-; CHECK-NEXT: vmov.32 q1[1], r1
-; CHECK-NEXT: vmov.u16 r1, q0[2]
-; CHECK-NEXT: vmov.32 q1[2], r1
-; CHECK-NEXT: vmov.u16 r1, q0[3]
-; CHECK-NEXT: vmov.32 q1[3], r1
-; CHECK-NEXT: vmovlb.u16 q0, q1
 ; CHECK-NEXT: vstrw.32 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
@@ -445,34 +274,18 @@
 define void @foo_int32_int8_both(<16 x i32>* %dest, <16 x i8>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int32_int8_both:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrb.u8 q0, [r1]
-; CHECK-NEXT: vmov.u8 r1, q0[8]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[9]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[10]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[11]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[12]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[13]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[14]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[15]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmovlb.s8 q1, q1
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov.32 q2[0], r1
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov.32 q2[1], r1
-; CHECK-NEXT: vmov.u16 r1, q1[6]
-; CHECK-NEXT: vmov.32 q2[2], r1
-; CHECK-NEXT: vmov.u16 r1, q1[7]
-; CHECK-NEXT: vmov.32 q2[3], r1
+; CHECK-NEXT: vldrb.s16 q1, [r1, #8]
+; CHECK-NEXT: vmov.u16 r2, q1[4]
+; CHECK-NEXT: vmov.32 q0[0], r2
+; CHECK-NEXT: vmov.u16 r2, q1[5]
+; CHECK-NEXT: vmov.32 q0[1], r2
+; CHECK-NEXT: vmov.u16 r2, q1[6]
+; CHECK-NEXT: vmov.32 q0[2], r2
+; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vmov.32 q0[3], r2
+; CHECK-NEXT: vmovlb.u16 q2, q0
+; CHECK-NEXT: vldrb.s16 q0, [r1]
 ; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmovlb.u16 q2, q2
 ; CHECK-NEXT: vstrw.32 q2, [r0, #48]
 ; CHECK-NEXT: vmov.32 q2[0], r1
 ; CHECK-NEXT: vmov.u16 r1, q1[1]
@@ -481,26 +294,9 @@
 ; CHECK-NEXT: vmov.32 q2[2], r1
 ; CHECK-NEXT: vmov.u16 r1, q1[3]
 ; CHECK-NEXT: vmov.32 q2[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[0]
+; CHECK-NEXT: vmov.u16 r1, q0[4]
 ; CHECK-NEXT: vmovlb.u16 q1, q2
 ; CHECK-NEXT: vstrw.32 q1, [r0, #32]
-; CHECK-NEXT: vmov.16 q1[0], r1
-; CHECK-NEXT: vmov.u8 r1, q0[1]
-; CHECK-NEXT: vmov.16 q1[1], r1
-; CHECK-NEXT: vmov.u8 r1, q0[2]
-; CHECK-NEXT: vmov.16 q1[2], r1
-; CHECK-NEXT: vmov.u8 r1, q0[3]
-; CHECK-NEXT: vmov.16 q1[3], r1
-; CHECK-NEXT: vmov.u8 r1, q0[4]
-; CHECK-NEXT: vmov.16 q1[4], r1
-; CHECK-NEXT: vmov.u8 r1, q0[5]
-; CHECK-NEXT: vmov.16 q1[5], r1
-; CHECK-NEXT: vmov.u8 r1, q0[6]
-; CHECK-NEXT: vmov.16 q1[6], r1
-; CHECK-NEXT: vmov.u8 r1, q0[7]
-; CHECK-NEXT: vmov.16 q1[7], r1
-; CHECK-NEXT: vmovlb.s8 q0, q1
-; CHECK-NEXT: vmov.u16 r1, q0[4]
 ; CHECK-NEXT: vmov.32 q1[0], r1
 ; CHECK-NEXT: vmov.u16 r1, q0[5]
 ; CHECK-NEXT: vmov.32 q1[1], r1
@@ -532,27 +328,10 @@
 define <8 x i16>* @foo_uint32_uint16_double_offset(<8 x i32>* %dest, <8 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_double_offset:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q0, [r1, #16]!
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vstrw.32 q1, [r0, #16]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmovlb.s16 q0, q1
+; CHECK-NEXT: vldrh.s32 q0, [r1, #16]!
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
 ; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q1, [r0, #16]
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry:
@@ -566,48 +345,14 @@
 define <16 x i16>* @foo_uint32_uint16_quad_offset(<16 x i32>* %dest, <16 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_uint32_uint16_quad_offset:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q1, [r1, #32]!
-; CHECK-NEXT: vmov.u16 r2, q1[4]
-; CHECK-NEXT: vmov.32 q0[0], r2
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.32 q0[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[6]
-; CHECK-NEXT: vmov.32 q0[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
-; CHECK-NEXT: vmov.32 q0[3], r2
-; CHECK-NEXT: vmov.u16 r2, q1[0]
-; CHECK-NEXT: vmovlb.s16 q2, q0
-; CHECK-NEXT: vldrh.u16 q0, [r1, #16]
-; CHECK-NEXT: vstrw.32 q2, [r0, #16]
-; CHECK-NEXT: vmov.32 q2[0], r2
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov.32 q2[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.32 q2[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[3]
-; CHECK-NEXT: vmov.32 q2[3], r2
-; CHECK-NEXT: vmov.u16 r2, q0[4]
-; CHECK-NEXT: vmovlb.s16 q1, q2
-; CHECK-NEXT: vstrw.32 q1, [r0]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.u16 r2, q0[5]
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[6]
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[7]
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmov.u16 r2, q0[0]
-; CHECK-NEXT: vmovlb.s16 q1, q1
-; CHECK-NEXT: vstrw.32 q1, [r0, #48]
-; CHECK-NEXT: vmov.32 q1[0], r2
-; CHECK-NEXT: vmov.u16 r2, q0[1]
-; CHECK-NEXT: vmov.32 q1[1], r2
-; CHECK-NEXT: vmov.u16 r2, q0[2]
-; CHECK-NEXT: vmov.32 q1[2], r2
-; CHECK-NEXT: vmov.u16 r2, q0[3]
-; CHECK-NEXT: vmov.32 q1[3], r2
-; CHECK-NEXT: vmovlb.s16 q0, q1
-; CHECK-NEXT: vstrw.32 q0, [r0, #32]
+; CHECK-NEXT: vldrh.s32 q0, [r1, #32]!
+; CHECK-NEXT: vldrh.s32 q1, [r1, #8]
+; CHECK-NEXT: vldrh.s32 q2, [r1, #24]
+; CHECK-NEXT: vldrh.s32 q3, [r1, #16]
+; CHECK-NEXT: vstrw.32 q0, [r0]
+; CHECK-NEXT: vstrw.32 q2, [r0, #48]
+; CHECK-NEXT: vstrw.32 q1, [r0, #16]
+; CHECK-NEXT: vstrw.32 q3, [r0, #32]
 ; CHECK-NEXT: mov r0, r1
 ; CHECK-NEXT: bx lr
 entry: