Index: llvm/lib/Target/ARM/ARMISelLowering.h
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.h
+++ llvm/lib/Target/ARM/ARMISelLowering.h
@@ -137,6 +137,8 @@
     PREDICATE_CAST,  // Predicate cast for MVE i1 types
     VECTOR_REG_CAST, // Reinterpret the current contents of a vector register

+    MVETRUNC, // Legalization aid for truncating two vectors into one.
+
     VCMP,  // Vector compare.
     VCMPZ, // Vector compare to zero.
     VTST,  // Vector test bits.
@@ -381,6 +383,7 @@
     SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
     SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
+    SDValue PerformMVETruncCombine(SDNode *N, DAGCombinerInfo &DCI) const;
     SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;

     bool SimplifyDemandedBitsForTargetNode(SDValue Op,
Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -444,6 +444,8 @@
     setOperationAction(ISD::LOAD, VT, Custom);
     setOperationAction(ISD::STORE, VT, Custom);
   }
+  setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
+  setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
 }

 ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
@@ -1679,6 +1681,7 @@
   case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
   case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
+  case ARMISD::MVETRUNC: return "ARMISD::MVETRUNC";
   case ARMISD::VCMP: return "ARMISD::VCMP";
   case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
   case ARMISD::VTST: return "ARMISD::VTST";
@@ -7268,6 +7271,28 @@
   return true;
 }

+static bool isVMOVNTruncMask(ArrayRef<int> M, EVT ToVT, bool rev) {
+  unsigned NumElts = ToVT.getVectorNumElements();
+  if (NumElts != M.size())
+    return false;
+
+  // Test if the Trunc can be converted to a VMOVN with this shuffle. We are
+  // looking for patterns of:
+  // !rev: 0 N/2 1 N/2+1 2 N/2+2 ...
+  // rev: N/2 0 N/2+1 1 N/2+2 2 ...
+
+  unsigned Off0 = rev ? NumElts / 2 : 0;
+  unsigned Off1 = rev ? 0 : NumElts / 2;
+  for (unsigned i = 0; i < NumElts; i += 2) {
+    if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+      return false;
+    if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+      return false;
+  }
+
+  return true;
+}
+
 // Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
 // from a pair of inputs. For example:
 // BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
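The interleaving patterns that isVMOVNTruncMask accepts are easier to see at a fixed width. The following standalone sketch (illustrative only, plain C++ rather than LLVM code; isInterleavedTruncMask and the variable names are invented here) checks the two combined 8-element masks that let an MVETRUNC of two shuffles over a shared v4i32 source pair become a single VMOVN: <0,4,1,5,2,6,3,7> for the !rev pattern and <4,0,5,1,6,2,7,3> for the rev pattern.

// Standalone illustration of the mask shapes isVMOVNTruncMask accepts.
// This only mirrors the check above on plain ints; it is not LLVM code.
#include <cstdio>
#include <vector>

static bool isInterleavedTruncMask(const std::vector<int> &M, bool Rev) {
  unsigned NumElts = M.size();
  unsigned Off0 = Rev ? NumElts / 2 : 0;
  unsigned Off1 = Rev ? 0 : NumElts / 2;
  for (unsigned i = 0; i < NumElts; i += 2) {
    if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
      return false;
    if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
      return false;
  }
  return true;
}

int main() {
  // Lanes of the first source interleaved with lanes of the second:
  // a0 b0 a1 b1 a2 b2 a3 b3.
  std::vector<int> NonRev = {0, 4, 1, 5, 2, 6, 3, 7};
  // Reversed interleave: b0 a0 b1 a1 b2 a2 b3 a3.
  std::vector<int> Reversed = {4, 0, 5, 1, 6, 2, 7, 3};
  std::printf("non-reversed mask matches !rev pattern: %d\n",
              isInterleavedTruncMask(NonRev, /*Rev=*/false));
  std::printf("reversed mask matches rev pattern: %d\n",
              isInterleavedTruncMask(Reversed, /*Rev=*/true));
  return 0;
}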
@@ -9719,6 +9744,25 @@
   return DAG.getMergeValues({Result, Chain}, dl);
 }

+static SDValue LowerTruncate(SDNode *N, SelectionDAG &DAG,
+                             const ARMSubtarget *Subtarget) {
+  if (!Subtarget->hasMVEIntegerOps())
+    return SDValue();
+
+  SDLoc DL(N);
+
+  EVT ToVT = N->getValueType(0);
+  if (ToVT != MVT::v8i16 && ToVT != MVT::v16i8)
+    return SDValue();
+  EVT FromVT = N->getOperand(0).getValueType();
+  if (FromVT != MVT::v8i32 && FromVT != MVT::v16i16)
+    return SDValue();
+
+  SDValue Lo, Hi;
+  std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+  return DAG.getNode(ARMISD::MVETRUNC, DL, ToVT, Lo, Hi);
+}
+
 SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
   switch (Op.getOpcode()) {
@@ -9825,6 +9869,8 @@
   case ISD::STRICT_FSETCC:
   case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
   case ARMISD::WIN__DBZCHK: return SDValue();
+  case ISD::TRUNCATE:
+    return LowerTruncate(Op.getNode(), DAG, Subtarget);
   }
 }

@@ -9913,6 +9959,9 @@
   case ISD::LOAD:
     LowerLOAD(N, Results, DAG);
     break;
+  case ISD::TRUNCATE:
+    Res = LowerTruncate(N, DAG, Subtarget);
+    break;
   }
   if (Res.getNode())
     Results.push_back(Res);
@@ -13865,6 +13914,10 @@
   if (ST->isLittle())
     return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);

+  // VECTOR_REG_CAST undef -> undef
+  if (Op.isUndef())
+    return DCI.DAG.getUNDEF(VT);
+
   // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
   if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
     // If the valuetypes are the same, we can remove the cast entirely.
@@ -13955,6 +14008,15 @@
     return X;
   }

+  // extract (MVETrunc(x)) -> extract x
+  if (Op0->getOpcode() == ARMISD::MVETRUNC) {
+    unsigned Idx = N->getConstantOperandVal(1);
+    unsigned Vec = Idx / Op0->getOperand(0).getValueType().getVectorNumElements();
+    unsigned SubIdx = Idx % Op0->getOperand(0).getValueType().getVectorNumElements();
+    return DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, Op0.getOperand(Vec),
+                           DCI.DAG.getConstant(SubIdx, dl, MVT::i32));
+  }
+
   return SDValue();
 }
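As a mental model for the new node (a sketch only; plain integer arrays stand in for the vector registers and mveTrunc is a made-up name, not LLVM API), ARMISD::MVETRUNC(lo, hi) behaves like a truncate of the concatenation of its operands. That is also why the EXTRACT_VECTOR_ELT combine above can forward element Idx straight to operand Idx / N at sub-index Idx % N, where N is the operand's element count.

// Scalar model of ARMISD::MVETRUNC(lo, hi) for the v8i32 -> v8i16 case.
// Illustrative sketch only; arrays stand in for vector registers.
#include <array>
#include <cassert>
#include <cstdint>

static std::array<uint16_t, 8> mveTrunc(const std::array<uint32_t, 4> &Lo,
                                        const std::array<uint32_t, 4> &Hi) {
  std::array<uint16_t, 8> Out{};
  for (unsigned i = 0; i < 8; i++) {
    uint32_t Src = i < 4 ? Lo[i] : Hi[i - 4]; // concatenate the two halves
    Out[i] = (uint16_t)Src;                   // then truncate each lane
  }
  return Out;
}

int main() {
  std::array<uint32_t, 4> Lo = {0x10001, 0x20002, 0x30003, 0x40004};
  std::array<uint32_t, 4> Hi = {0x50005, 0x60006, 0x70007, 0x80008};
  std::array<uint16_t, 8> R = mveTrunc(Lo, Hi);

  // extract(MVETRUNC(lo, hi), Idx) is the truncated lane Idx % 4 of operand
  // Idx / 4, which is what the EXTRACT_VECTOR_ELT combine relies on.
  unsigned Idx = 6;
  assert(Idx / 4 == 1 && Idx % 4 == 2);
  assert(R[Idx] == (uint16_t)Hi[Idx % 4]);
  return 0;
}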
@@ -14627,7 +14689,7 @@
   return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
 }

-// Try taking a single vector store from an truncate (which would otherwise turn
+// Try taking a single vector store from an fpround (which would otherwise turn
 // into an expensive buildvector) and splitting it into a series of narrowing
 // stores.
 static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
@@ -14635,7 +14697,7 @@
   if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
     return SDValue();
   SDValue Trunc = St->getValue();
-  if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
+  if (Trunc->getOpcode() != ISD::FP_ROUND)
     return SDValue();
   EVT FromVT = Trunc->getOperand(0).getValueType();
   EVT ToVT = Trunc.getValueType();
@@ -14645,16 +14707,11 @@
   EVT ToEltVT = ToVT.getVectorElementType();
   EVT FromEltVT = FromVT.getVectorElementType();

-  unsigned NumElements = 0;
-  if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
-    NumElements = 4;
-  if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
-    NumElements = 8;
-  if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
-    NumElements = 4;
-  if (NumElements == 0 ||
-      (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
-      FromVT.getVectorNumElements() % NumElements != 0)
+  if (FromEltVT != MVT::f32 || ToEltVT != MVT::f16)
+    return SDValue();
+
+  unsigned NumElements = 4;
+  if (FromVT.getVectorNumElements() % NumElements != 0)
     return SDValue();

   // Test if the Trunc will be convertable to a VMOVN with a shuffle, and if so
@@ -14679,14 +14736,6 @@
     return true;
   };

-  // It may be preferable to keep the store unsplit as the trunc may end up
-  // being removed. Check that here.
-  if (Trunc.getOperand(0).getOpcode() == ISD::SMIN) {
-    if (SDValue U = PerformVQDMULHCombine(Trunc.getOperand(0).getNode(), DAG)) {
-      DAG.ReplaceAllUsesWith(Trunc.getOperand(0), U);
-      return SDValue();
-    }
-  }
   if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
     if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
         isVMOVNOriginalMask(Shuffle->getMask(), true))
@@ -14717,13 +14766,52 @@
         DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
                     DAG.getConstant(i * NumElements, DL, MVT::i32));

-    if (ToEltVT == MVT::f16) {
-      SDValue FPTrunc =
-          DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
-                      Extract, DAG.getConstant(0, DL, MVT::i32));
-      Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
-    }
+    SDValue FPTrunc =
+        DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+                    Extract, DAG.getConstant(0, DL, MVT::i32));
+    Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
+
+    SDValue Store = DAG.getTruncStore(
+        Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
+        NewToVT, Alignment.value(), MMOFlags, AAInfo);
+    Stores.push_back(Store);
+  }
+  return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+}
+
+// Try taking a single vector store from an MVETRUNC (which would otherwise turn
+// into an expensive buildvector) and splitting it into a series of narrowing
+// stores.
+static SDValue PerformSplittingMVETruncToNarrowingStores(StoreSDNode *St,
+                                                         SelectionDAG &DAG) {
+  if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+    return SDValue();
+  SDValue Trunc = St->getValue();
+  if (Trunc->getOpcode() != ARMISD::MVETRUNC)
+    return SDValue();
+  EVT FromVT = Trunc->getOperand(0).getValueType();
+  EVT ToVT = Trunc.getValueType();
+
+  LLVMContext &C = *DAG.getContext();
+  SDLoc DL(St);
+  // Details about the old store
+  SDValue Ch = St->getChain();
+  SDValue BasePtr = St->getBasePtr();
+  Align Alignment = St->getOriginalAlign();
+  MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+  AAMDNodes AAInfo = St->getAAInfo();
+
+  EVT NewToVT = EVT::getVectorVT(C, ToVT.getVectorElementType(),
+                                 FromVT.getVectorNumElements());
+
+  SmallVector<SDValue, 4> Stores;
+  for (unsigned i = 0; i < Trunc.getNumOperands(); i++) {
+    unsigned NewOffset =
+        i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8;
+    SDValue NewPtr =
+        DAG.getObjectPtrOffset(DL, BasePtr, TypeSize::Fixed(NewOffset));
+    SDValue Extract = Trunc.getOperand(i);
     SDValue Store = DAG.getTruncStore(
         Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
         NewToVT, Alignment.value(), MMOFlags, AAInfo);
     Stores.push_back(Store);
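For the new MVETRUNC store splitting, a v16i8-typed store of MVETRUNC(v8i16 lo, v8i16 hi) becomes two v8i8 truncating stores at byte offsets 0 and 8, and the v8i16-from-two-v4i32 case becomes two v4i16 stores at the same offsets. A small sketch of just the offset arithmetic (illustrative only; partByteOffset is a made-up helper, not LLVM API):

// Offset arithmetic used when a store of MVETRUNC(part0, part1) is split into
// one narrowing (truncating) store per part. Illustrative sketch only.
#include <cstdio>

static unsigned partByteOffset(unsigned PartIdx, unsigned PartNumElements,
                               unsigned ToScalarSizeInBits) {
  // Mirrors: i * FromVT.getVectorNumElements() * ToVT.getScalarSizeInBits() / 8
  return PartIdx * PartNumElements * ToScalarSizeInBits / 8;
}

int main() {
  // v16i8 store of MVETRUNC(v8i16, v8i16): two v8i8 stores at offsets 0 and 8.
  std::printf("v16i8 case: %u %u\n", partByteOffset(0, 8, 8),
              partByteOffset(1, 8, 8));
  // v8i16 store of MVETRUNC(v4i32, v4i32): two v4i16 stores at offsets 0 and 8.
  std::printf("v8i16 case: %u %u\n", partByteOffset(0, 4, 16),
              partByteOffset(1, 4, 16));
  return 0;
}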
@@ -14747,9 +14835,13 @@
   if (SDValue Store = PerformTruncatingStoreCombine(St, DCI.DAG))
     return Store;

-  if (Subtarget->hasMVEIntegerOps())
+  if (Subtarget->hasMVEIntegerOps()) {
     if (SDValue NewToken = PerformSplittingToNarrowingStores(St, DCI.DAG))
       return NewToken;
+    if (SDValue NewToken =
+            PerformSplittingMVETruncToNarrowingStores(St, DCI.DAG))
+      return NewToken;
+  }

   if (!ISD::isNormalStore(St))
     return SDValue();
@@ -15132,6 +15224,14 @@
     return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
                            Op0, Op1->getOperand(1), N->getOperand(2));

+  // VMOVNb(undef, x) -> x
+  // VMOVNb(x, undef) -> x
+  // VMOVNt(x, undef) -> x
+  if (Op0.isUndef() && !IsTop)
+    return Op1;
+  if (Op1.isUndef())
+    return Op0;
+
   // Only the bottom lanes from Qm (Op1) and either the top or bottom lanes from
   // Qd (Op0) are demanded from a VMOVN, depending on whether we are inserting
   // into the top or bottom lanes.
@@ -16266,6 +16366,83 @@
   return SDValue();
 }

+// Some combines for the MVETrunc truncation legalization helper. Also lowers
+// the node into a buildvector after legalizeOps.
+SDValue ARMTargetLowering::PerformMVETruncCombine(
+    SDNode *N, TargetLowering::DAGCombinerInfo &DCI) const {
+  EVT VT = N->getValueType(0);
+  SDLoc DL(N);
+
+  // MVETrunc(MVETrunc(a, b), MVETrunc(c, d)) -> MVETrunc(a, b, c, d)
+  if (N->getNumOperands() == 2 &&
+      N->getOperand(0).getOpcode() == ARMISD::MVETRUNC &&
+      N->getOperand(1).getOpcode() == ARMISD::MVETRUNC)
+    return DCI.DAG.getNode(
+        ARMISD::MVETRUNC, DL, VT, N->getOperand(0).getOperand(0),
+        N->getOperand(0).getOperand(1), N->getOperand(1).getOperand(0),
+        N->getOperand(1).getOperand(1));
+
+  // MVETrunc(shuffle, shuffle) -> VMOVN
+  if (N->getNumOperands() == 2 &&
+      N->getOperand(0).getOpcode() == ISD::VECTOR_SHUFFLE &&
+      N->getOperand(1).getOpcode() == ISD::VECTOR_SHUFFLE) {
+    auto *S0 = cast<ShuffleVectorSDNode>(N->getOperand(0).getNode());
+    auto *S1 = cast<ShuffleVectorSDNode>(N->getOperand(1).getNode());
+
+    if (S0->getOperand(0) == S1->getOperand(0) &&
+        S0->getOperand(1) == S1->getOperand(1)) {
+      // Construct complete shuffle mask
+      SmallVector<int, 16> Mask(S0->getMask().begin(), S0->getMask().end());
+      Mask.append(S1->getMask().begin(), S1->getMask().end());
+
+      if (isVMOVNTruncMask(Mask, VT, 0))
+        return DCI.DAG.getNode(
+            ARMISD::VMOVN, DL, VT,
+            DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
+            DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
+            DCI.DAG.getConstant(1, DL, MVT::i32));
+      if (isVMOVNTruncMask(Mask, VT, 1))
+        return DCI.DAG.getNode(
+            ARMISD::VMOVN, DL, VT,
+            DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(1)),
+            DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, S0->getOperand(0)),
+            DCI.DAG.getConstant(1, DL, MVT::i32));
+    }
+  }
+
+  auto LowerToBuildVec = [&]() {
+    SmallVector<SDValue, 8> Extracts;
+    for (unsigned Op = 0; Op < N->getNumOperands(); Op++) {
+      SDValue O = N->getOperand(Op);
+      for (unsigned i = 0; i < O.getValueType().getVectorNumElements(); i++) {
+        SDValue Ext = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32, O,
+                                      DCI.DAG.getConstant(i, DL, MVT::i32));
+        Extracts.push_back(Ext);
+      }
+    }
+    return DCI.DAG.getBuildVector(VT, DL, Extracts);
+  };
+
+  // For MVETrunc of a buildvector or shuffle, it can be beneficial to lower the
+  // truncate to a buildvector to allow the generic optimisations to kick in.
+  if (all_of(N->ops(), [](SDValue Op) {
+        return Op.getOpcode() == ISD::BUILD_VECTOR ||
+               Op.getOpcode() == ISD::VECTOR_SHUFFLE ||
+               (Op.getOpcode() == ISD::BITCAST &&
+                Op.getOperand(0).getOpcode() == ISD::BUILD_VECTOR);
+      }))
+    return LowerToBuildVec();
+
+  // If we are late in the legalization process and nothing has optimised
+  // the trunc to anything better, lower it to a series of extracts and a
+  // buildvector.
+ if (DCI.isBeforeLegalizeOps()) + return SDValue(); + + SDValue BuildVec = LowerToBuildVec(); + return LowerBUILD_VECTOR(BuildVec, DCI.DAG, Subtarget); +} + SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const { switch (N->getOpcode()) { @@ -16337,6 +16514,8 @@ return PerformPREDICATE_CASTCombine(N, DCI); case ARMISD::VECTOR_REG_CAST: return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget); + case ARMISD::MVETRUNC: + return PerformMVETruncCombine(N, DCI); case ARMISD::VCMP: return PerformVCMPCombine(N, DCI, Subtarget); case ISD::VECREDUCE_ADD: Index: llvm/test/CodeGen/Thumb2/mve-vhadd.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vhadd.ll +++ llvm/test/CodeGen/Thumb2/mve-vhadd.ll @@ -175,40 +175,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vhadd.s8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vstrb.16 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -251,24 +218,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vhadd.s16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -354,42 +304,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vhadd.u8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, 
q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -432,24 +347,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vhadd.u16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB16_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -535,42 +433,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16 ; CHECK-NEXT: vrhadd.u8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB18_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -614,24 +477,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vrhadd.u16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, 
q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB19_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -719,42 +565,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r1], #16 ; CHECK-NEXT: vldrb.u8 q1, [r0], #16 ; CHECK-NEXT: vrhadd.u8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB21_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -798,24 +609,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r1], #16 ; CHECK-NEXT: vldrh.u16 q1, [r0], #16 ; CHECK-NEXT: vrhadd.u16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB22_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} Index: llvm/test/CodeGen/Thumb2/mve-vmovn.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmovn.ll +++ llvm/test/CodeGen/Thumb2/mve-vmovn.ll @@ -763,13 +763,13 @@ define arm_aapcs_vfpcc <8 x i16> @vmovn32trunct_undef(<4 x i32> %src1) { ; CHECK-LABEL: vmovn32trunct_undef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrev32.16 q0, q0 +; CHECK-NEXT: vmovnt.i32 q0, q0 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn32trunct_undef: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.32 q1, q0 -; 
CHECKBE-NEXT: vrev32.16 q1, q1 +; CHECKBE-NEXT: vmovnt.i32 q1, q1 ; CHECKBE-NEXT: vrev64.16 q0, q1 ; CHECKBE-NEXT: bx lr entry: @@ -828,13 +828,13 @@ define arm_aapcs_vfpcc <16 x i8> @vmovn16trunct_undef(<8 x i16> %src1) { ; CHECK-LABEL: vmovn16trunct_undef: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: vrev16.8 q0, q0 +; CHECK-NEXT: vmovnt.i16 q0, q0 ; CHECK-NEXT: bx lr ; ; CHECKBE-LABEL: vmovn16trunct_undef: ; CHECKBE: @ %bb.0: @ %entry ; CHECKBE-NEXT: vrev64.16 q1, q0 -; CHECKBE-NEXT: vrev16.8 q1, q1 +; CHECKBE-NEXT: vmovnt.i16 q1, q1 ; CHECKBE-NEXT: vrev64.8 q0, q1 ; CHECKBE-NEXT: bx lr entry: Index: llvm/test/CodeGen/Thumb2/mve-vmulh.ll =================================================================== --- llvm/test/CodeGen/Thumb2/mve-vmulh.ll +++ llvm/test/CodeGen/Thumb2/mve-vmulh.ll @@ -201,42 +201,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vmulh.s8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB12_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -279,24 +244,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vmulh.s16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB13_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -382,42 +330,7 @@ ; CHECK-NEXT: vldrb.u8 q0, [r0], #16 ; CHECK-NEXT: vldrb.u8 q1, [r1], #16 ; CHECK-NEXT: vmulh.u8 q0, q1, q0 -; CHECK-NEXT: vmov.u8 r3, q0[8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[9] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[10] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[11] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: 
vmov.u8 r3, q0[12] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[13] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[14] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[15] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmov.u8 r3, q0[0] -; CHECK-NEXT: vmovlb.u8 q1, q1 -; CHECK-NEXT: vstrb.16 q1, [r2, #8] -; CHECK-NEXT: vmov.16 q1[0], r3 -; CHECK-NEXT: vmov.u8 r3, q0[1] -; CHECK-NEXT: vmov.16 q1[1], r3 -; CHECK-NEXT: vmov.u8 r3, q0[2] -; CHECK-NEXT: vmov.16 q1[2], r3 -; CHECK-NEXT: vmov.u8 r3, q0[3] -; CHECK-NEXT: vmov.16 q1[3], r3 -; CHECK-NEXT: vmov.u8 r3, q0[4] -; CHECK-NEXT: vmov.16 q1[4], r3 -; CHECK-NEXT: vmov.u8 r3, q0[5] -; CHECK-NEXT: vmov.16 q1[5], r3 -; CHECK-NEXT: vmov.u8 r3, q0[6] -; CHECK-NEXT: vmov.16 q1[6], r3 -; CHECK-NEXT: vmov.u8 r3, q0[7] -; CHECK-NEXT: vmov.16 q1[7], r3 -; CHECK-NEXT: vmovlb.u8 q0, q1 -; CHECK-NEXT: vstrb.16 q0, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB15_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -460,24 +373,7 @@ ; CHECK-NEXT: vldrh.u16 q0, [r0], #16 ; CHECK-NEXT: vldrh.u16 q1, [r1], #16 ; CHECK-NEXT: vmulh.u16 q0, q1, q0 -; CHECK-NEXT: vmov.u16 r3, q0[4] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[5] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[6] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[7] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vmov.u16 r3, q0[0] -; CHECK-NEXT: vstrh.32 q1, [r2, #8] -; CHECK-NEXT: vmov.32 q1[0], r3 -; CHECK-NEXT: vmov.u16 r3, q0[1] -; CHECK-NEXT: vmov.32 q1[1], r3 -; CHECK-NEXT: vmov.u16 r3, q0[2] -; CHECK-NEXT: vmov.32 q1[2], r3 -; CHECK-NEXT: vmov.u16 r3, q0[3] -; CHECK-NEXT: vmov.32 q1[3], r3 -; CHECK-NEXT: vstrh.32 q1, [r2], #16 +; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: le lr, .LBB16_1 ; CHECK-NEXT: @ %bb.2: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc}