Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -901,7 +901,6 @@
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
-    setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
@@ -922,6 +921,7 @@
     setTargetDAGCombine(ISD::BUILD_VECTOR);
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
   }
 
   if (!Subtarget->hasFP64()) {
@@ -13164,95 +13164,161 @@
   return SDValue();
 }
 
-/// PerformSTORECombine - Target-specific dag combine xforms for
-/// ISD::STORE.
-static SDValue PerformSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  StoreSDNode *St = cast<StoreSDNode>(N);
-  if (St->isVolatile())
-    return SDValue();
-
-  // Optimize trunc store (of multiple scalars) to shuffle and store.  First,
-  // pack all of the elements in one place.  Next, store to memory in fewer
-  // chunks.
+// Optimize trunc store (of multiple scalars) to shuffle and store. First,
+// pack all of the elements in one place. Next, store to memory in fewer
+// chunks.
+static SDValue PerformTruncatingStoreCombine(StoreSDNode *St,
+                                             SelectionDAG &DAG) {
   SDValue StVal = St->getValue();
   EVT VT = StVal.getValueType();
-  if (St->isTruncatingStore() && VT.isVector()) {
-    SelectionDAG &DAG = DCI.DAG;
-    const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-    EVT StVT = St->getMemoryVT();
-    unsigned NumElems = VT.getVectorNumElements();
-    assert(StVT != VT && "Cannot truncate to the same type");
-    unsigned FromEltSz = VT.getScalarSizeInBits();
-    unsigned ToEltSz = StVT.getScalarSizeInBits();
+  if (!St->isTruncatingStore() || !VT.isVector())
+    return SDValue();
+  const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+  EVT StVT = St->getMemoryVT();
+  unsigned NumElems = VT.getVectorNumElements();
+  assert(StVT != VT && "Cannot truncate to the same type");
+  unsigned FromEltSz = VT.getScalarSizeInBits();
+  unsigned ToEltSz = StVT.getScalarSizeInBits();
+
+  // From, To sizes and ElemCount must be pow of two
+  if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz))
+    return SDValue();
 
-    // From, To sizes and ElemCount must be pow of two
-    if (!isPowerOf2_32(NumElems * FromEltSz * ToEltSz)) return SDValue();
+  // We are going to use the original vector elt for storing.
+  // Accumulated smaller vector elements must be a multiple of the store size.
+  if (0 != (NumElems * FromEltSz) % ToEltSz)
+    return SDValue();
 
-    // We are going to use the original vector elt for storing.
-    // Accumulated smaller vector elements must be a multiple of the store size.
-    if (0 != (NumElems * FromEltSz) % ToEltSz) return SDValue();
+  unsigned SizeRatio = FromEltSz / ToEltSz;
+  assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
 
-    unsigned SizeRatio = FromEltSz / ToEltSz;
-    assert(SizeRatio * NumElems * ToEltSz == VT.getSizeInBits());
+  // Create a type on which we perform the shuffle.
+  EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
+                                   NumElems * SizeRatio);
+  assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
 
-    // Create a type on which we perform the shuffle.
-    EVT WideVecVT = EVT::getVectorVT(*DAG.getContext(), StVT.getScalarType(),
-                                     NumElems*SizeRatio);
-    assert(WideVecVT.getSizeInBits() == VT.getSizeInBits());
+  SDLoc DL(St);
+  SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
+  SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
+  for (unsigned i = 0; i < NumElems; ++i)
+    ShuffleVec[i] = DAG.getDataLayout().isBigEndian() ? (i + 1) * SizeRatio - 1
+                                                      : i * SizeRatio;
 
-    SDLoc DL(St);
-    SDValue WideVec = DAG.getNode(ISD::BITCAST, DL, WideVecVT, StVal);
-    SmallVector<int, 8> ShuffleVec(NumElems * SizeRatio, -1);
-    for (unsigned i = 0; i < NumElems; ++i)
-      ShuffleVec[i] = DAG.getDataLayout().isBigEndian()
-                          ? (i + 1) * SizeRatio - 1
-                          : i * SizeRatio;
-
-    // Can't shuffle using an illegal type.
-    if (!TLI.isTypeLegal(WideVecVT)) return SDValue();
-
-    SDValue Shuff = DAG.getVectorShuffle(WideVecVT, DL, WideVec,
-                                         DAG.getUNDEF(WideVec.getValueType()),
-                                         ShuffleVec);
-    // At this point all of the data is stored at the bottom of the
-    // register. We now need to save it to mem.
-
-    // Find the largest store unit
-    MVT StoreType = MVT::i8;
-    for (MVT Tp : MVT::integer_valuetypes()) {
-      if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
-        StoreType = Tp;
-    }
-    // Didn't find a legal store type.
-    if (!TLI.isTypeLegal(StoreType))
-      return SDValue();
+  // Can't shuffle using an illegal type.
+  if (!TLI.isTypeLegal(WideVecVT))
+    return SDValue();
 
-    // Bitcast the original vector into a vector of store-size units
-    EVT StoreVecVT = EVT::getVectorVT(*DAG.getContext(),
-            StoreType, VT.getSizeInBits()/EVT(StoreType).getSizeInBits());
-    assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
-    SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
-    SmallVector<SDValue, 4> Chains;
-    SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
-                                        TLI.getPointerTy(DAG.getDataLayout()));
-    SDValue BasePtr = St->getBasePtr();
+  SDValue Shuff = DAG.getVectorShuffle(
+      WideVecVT, DL, WideVec, DAG.getUNDEF(WideVec.getValueType()), ShuffleVec);
+  // At this point all of the data is stored at the bottom of the
+  // register. We now need to save it to mem.
 
-    // Perform one or more big stores into memory.
-    unsigned E = (ToEltSz*NumElems)/StoreType.getSizeInBits();
-    for (unsigned I = 0; I < E; I++) {
-      SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
-                                   StoreType, ShuffWide,
-                                   DAG.getIntPtrConstant(I, DL));
-      SDValue Ch = DAG.getStore(St->getChain(), DL, SubVec, BasePtr,
-                                St->getPointerInfo(), St->getAlignment(),
-                                St->getMemOperand()->getFlags());
-      BasePtr = DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr,
-                            Increment);
-      Chains.push_back(Ch);
-    }
-    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+  // Find the largest store unit
+  MVT StoreType = MVT::i8;
+  for (MVT Tp : MVT::integer_valuetypes()) {
+    if (TLI.isTypeLegal(Tp) && Tp.getSizeInBits() <= NumElems * ToEltSz)
+      StoreType = Tp;
+  }
+  // Didn't find a legal store type.
+  if (!TLI.isTypeLegal(StoreType))
+    return SDValue();
+
+  // Bitcast the original vector into a vector of store-size units
+  EVT StoreVecVT =
+      EVT::getVectorVT(*DAG.getContext(), StoreType,
+                       VT.getSizeInBits() / EVT(StoreType).getSizeInBits());
+  assert(StoreVecVT.getSizeInBits() == VT.getSizeInBits());
+  SDValue ShuffWide = DAG.getNode(ISD::BITCAST, DL, StoreVecVT, Shuff);
+  SmallVector<SDValue, 4> Chains;
+  SDValue Increment = DAG.getConstant(StoreType.getSizeInBits() / 8, DL,
+                                      TLI.getPointerTy(DAG.getDataLayout()));
+  SDValue BasePtr = St->getBasePtr();
+
+  // Perform one or more big stores into memory.
+  unsigned E = (ToEltSz * NumElems) / StoreType.getSizeInBits();
+  for (unsigned I = 0; I < E; I++) {
+    SDValue SubVec = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, StoreType,
+                                 ShuffWide, DAG.getIntPtrConstant(I, DL));
+    SDValue Ch =
+        DAG.getStore(St->getChain(), DL, SubVec, BasePtr, St->getPointerInfo(),
+                     St->getAlignment(), St->getMemOperand()->getFlags());
+    BasePtr =
+        DAG.getNode(ISD::ADD, DL, BasePtr.getValueType(), BasePtr, Increment);
+    Chains.push_back(Ch);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
+}
+
+// Try taking a single vector store from a truncate (which would otherwise
+// turn into an expensive buildvector) and splitting it into a series of
+// narrowing stores.
+static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
+                                                 SelectionDAG &DAG) {
+  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+    return SDValue();
+  SDValue Trunc = St->getValue();
+  if (Trunc->getOpcode() != ISD::TRUNCATE)
+    return SDValue();
+  EVT FromVT = Trunc->getOperand(0).getValueType();
+  EVT ToVT = Trunc.getValueType();
+  if (!ToVT.isVector())
+    return SDValue();
+  assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+  EVT ToEltVT = ToVT.getVectorElementType();
+  EVT FromEltVT = FromVT.getVectorElementType();
+
+  unsigned NumElements = 0;
+  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
+    NumElements = 4;
+  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
+    NumElements = 8;
+  if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+      FromVT.getVectorNumElements() % NumElements != 0)
+    return SDValue();
+
+  SDLoc DL(St);
+  // Details about the old store
+  SDValue Ch = St->getChain();
+  SDValue BasePtr = St->getBasePtr();
+  unsigned Alignment = St->getOriginalAlignment();
+  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = St->getAAInfo();
+
+  EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
+  EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+    unsigned NewOffset = i * NumElements * ToEltVT.getSizeInBits() / 8;
+    SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+    SDValue Extract =
+        DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
+                    DAG.getConstant(i * NumElements, DL, MVT::i32));
+    SDValue Store = DAG.getTruncStore(
+        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+        NewToVT, Alignment, MMOFlags, AAInfo);
+    Stores.push_back(Store);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
+/// PerformSTORECombine - Target-specific dag combine xforms for
+/// ISD::STORE.
+static SDValue PerformSTORECombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const ARMSubtarget *Subtarget) {
+  StoreSDNode *St = cast<StoreSDNode>(N);
+  if (St->isVolatile())
+    return SDValue();
+  SDValue StVal = St->getValue();
+  EVT VT = StVal.getValueType();
+
+  if (Subtarget->hasNEON())
+    if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
+      return Store;
+
+  if (Subtarget->hasMVEIntegerOps())
+    if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
+      return NewToken;
 
   if (!ISD::isNormalStore(St))
     return SDValue();
@@ -13304,7 +13370,7 @@
   }
 
   // If this is a legal vector store, try to combine it into a VST1_UPD.
-  if (ISD::isNormalStore(N) && VT.isVector() &&
+  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return CombineBaseUpdate(N, DCI);
 
@@ -14214,7 +14280,7 @@
   case ARMISD::BFI: return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
-  case ISD::STORE: return PerformSTORECombine(N, DCI);
+  case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
Index: llvm/test/CodeGen/Thumb2/float-ops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/float-ops.ll
+++ llvm/test/CodeGen/Thumb2/float-ops.ll
@@ -130,7 +130,7 @@
 entry:
 ; CHECK-LABEL: store_d:
 ; NOREGS: strd r2, r3, [r0]
-; ONLYREGS: vstr d0, [r0]
+; ONLYREGS: strd r2, r3, [r0]
 ; HARD: vstr d0, [r0]
   store double %b, double* %a, align 8
   ret void
Index: llvm/test/CodeGen/Thumb2/mve-masked-store.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -521,15 +521,11 @@
 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1
 ; CHECK-LE-NEXT: and r1, r3, #3
 ; CHECK-LE-NEXT: lsls r2, r3, #31
-; CHECK-LE-NEXT: ittt ne
-; CHECK-LE-NEXT: vmovne r2, s1
-; CHECK-LE-NEXT: vmovne r3, s0
-; CHECK-LE-NEXT: strdne r3, r2, [r0]
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: vstrne d0, [r0]
 ; CHECK-LE-NEXT: lsls r1, r1, #30
-; CHECK-LE-NEXT: ittt mi
-; CHECK-LE-NEXT: vmovmi r1, s3
-; CHECK-LE-NEXT: vmovmi r2, s2
-; CHECK-LE-NEXT: strdmi r2, r1, [r0, #8]
+; CHECK-LE-NEXT: it mi
+; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT: add sp, #4
 ; CHECK-LE-NEXT: bx lr
 ;
@@ -558,25 +554,11 @@
 ; CHECK-BE-NEXT: bfi r3, r1, #0, #1
 ; CHECK-BE-NEXT: and r1, r3, #3
 ; CHECK-BE-NEXT: lsls r2, r3, #31
-; CHECK-BE-NEXT: bne .LBB19_3
-; CHECK-BE-NEXT: @ %bb.1: @ %else
-; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bmi .LBB19_4
-; CHECK-BE-NEXT: .LBB19_2: @ %else2
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .LBB19_3: @ %cond.store
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r2, s5
-; CHECK-BE-NEXT: vmov r3, s4
-; CHECK-BE-NEXT: strd r3, r2, [r0]
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: vstrne d0, [r0]
 ; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bpl .LBB19_2
-; CHECK-BE-NEXT: .LBB19_4: @ %cond.store1
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r1, s7
-; CHECK-BE-NEXT: vmov r2, s6
-; CHECK-BE-NEXT: strd r2, r1, [r0, #8]
+; CHECK-BE-NEXT: it mi
+; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
 ; CHECK-BE-NEXT: add sp, #4
 ; CHECK-BE-NEXT: bx lr
 entry:
Index: llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -45,43 +45,14 @@
 define void @foo_int8_int32_double(<16 x i8>* %dest, <16 x i32>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int8_int32_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[0], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[1], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[2], r2
-; CHECK-NEXT: vmov r2, s7
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmov.8 q0[3], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[4], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[5], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[6], r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
-; CHECK-NEXT: vmov.8 q0[7], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[8], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[9], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[10], r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT: vmov.8 q0[11], r2
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vmov.8 q0[12], r1
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov.8 q0[13], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov.8 q0[14], r1
-; CHECK-NEXT: vmov r1, s7
-; CHECK-NEXT: vmov.8 q0[15], r1
-; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r1, #48]
+; CHECK-NEXT: vstrb.32 q1, [r0, #4]
+; CHECK-NEXT: vstrb.32 q0, [r0]
+; CHECK-NEXT: vstrb.32 q3, [r0, #12]
+; CHECK-NEXT: vstrb.32 q2, [r0, #8]
 ; CHECK-NEXT: bx lr
 entry:
 %wide.load = load <16 x i32>, <16 x i32>* %src, align 4
@@ -93,25 +64,10 @@
 define void @foo_int16_int32_double(<8 x i16>* %dest, <8 x i32>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int16_int32_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.16 q0[1], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov r2, s7
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmov.16 q0[3], r2
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vmov.16 q0[4], r1
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov.16 q0[6], r1
-; CHECK-NEXT: vmov r1, s7
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vstrh.16 q0, [r0]
+; CHECK-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-NEXT: vstrh.32 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
 %wide.load = load <8 x i32>, <8 x i32>* %src, align 4
@@ -123,41 +79,10 @@
 define void @foo_int8_int16_double(<16 x i8>* %dest, <16 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int8_int16_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q1, [r1]
-; CHECK-NEXT: vmov.u16 r2, q1[0]
-; CHECK-NEXT: vmov.8 q0[0], r2
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[3]
-; CHECK-NEXT: vmov.8 q0[3], r2
-; CHECK-NEXT: vmov.u16 r2, q1[4]
-; CHECK-NEXT: vmov.8 q0[4], r2
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.8 q0[5], r2
-; CHECK-NEXT: vmov.u16 r2, q1[6]
-; CHECK-NEXT: vmov.8 q0[6], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
-; CHECK-NEXT: vmov.8 q0[7], r2
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r1
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r1
-; CHECK-NEXT: vmov.u16 r1, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r1
-; CHECK-NEXT: vmov.u16 r1, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r1
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r1
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r1
-; CHECK-NEXT: vmov.u16 r1, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r1
-; CHECK-NEXT: vmov.u16 r1, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r1
-; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: vstrb.16 q1, [r0, #8]
+; CHECK-NEXT: vstrb.16 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
 %wide.load = load <16 x i16>, <16 x i16>* %src, align 2
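
Note (illustrative only, not part of the patch): the MVE combine added above
fires on a plain vector trunc feeding a normal store. A minimal sketch of the
kind of input IR it targets, using a hypothetical function name in the same
style as the mve-widen-narrow.ll tests:

define void @example_int16_int32_quad(<16 x i16>* %dest, <16 x i32>* readonly %src) {
entry:
  %wide.load = load <16 x i32>, <16 x i32>* %src, align 4
  %trunc = trunc <16 x i32> %wide.load to <16 x i16>
  store <16 x i16> %trunc, <16 x i16>* %dest, align 2
  ret void
}

Here FromEltVT is i32 and ToEltVT is i16, so NumElements is 4 and
PerformSplittingToNarrowingStores should emit four EXTRACT_SUBVECTOR plus
truncating-store pairs (<4 x i32> to <4 x i16>, 8 bytes apart), which select
to four vstrh.32 instructions rather than a per-element buildvector.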