Index: llvm/lib/Target/ARM/ARMISelLowering.cpp
===================================================================
--- llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -901,7 +901,6 @@
     setTargetDAGCombine(ISD::SIGN_EXTEND);
     setTargetDAGCombine(ISD::ZERO_EXTEND);
     setTargetDAGCombine(ISD::ANY_EXTEND);
-    setTargetDAGCombine(ISD::STORE);
     setTargetDAGCombine(ISD::FP_TO_SINT);
     setTargetDAGCombine(ISD::FP_TO_UINT);
     setTargetDAGCombine(ISD::FDIV);
@@ -922,6 +921,7 @@
     setTargetDAGCombine(ISD::BUILD_VECTOR);
     setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
     setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+    setTargetDAGCombine(ISD::STORE);
   }

   if (!Subtarget->hasFP64()) {
@@ -13155,7 +13155,8 @@
 /// PerformSTORECombine - Target-specific dag combine xforms for
 /// ISD::STORE.
 static SDValue PerformSTORECombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const ARMSubtarget *Subtarget) {
   StoreSDNode *St = cast<StoreSDNode>(N);
   if (St->isVolatile())
     return SDValue();
@@ -13165,7 +13166,7 @@
   // chunks.
   SDValue StVal = St->getValue();
   EVT VT = StVal.getValueType();
-  if (St->isTruncatingStore() && VT.isVector()) {
+  if (Subtarget->hasNEON() && St->isTruncatingStore() && VT.isVector()) {
     SelectionDAG &DAG = DCI.DAG;
     const TargetLowering &TLI = DAG.getTargetLoweringInfo();
     EVT StVT = St->getMemoryVT();
@@ -13241,6 +13242,61 @@
     }
     return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
   }
+  // Try taking a single vector store from an expensive buildvector and
+  // splitting it into a series of narrowing stores.
+  auto trySplittingToNarrowingStores = [](StoreSDNode *St, SelectionDAG &DAG) {
+    if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
+      return SDValue();
+    SDValue Ext = St->getValue();
+    if (Ext->getOpcode() != ISD::TRUNCATE)
+      return SDValue();
+    EVT FromVT = Ext->getOperand(0).getValueType();
+    EVT ToVT = Ext.getValueType();
+    if (!ToVT.isVector())
+      return SDValue();
+    assert(FromVT.getVectorNumElements() == ToVT.getVectorNumElements());
+    EVT ToEltVT = ToVT.getVectorElementType();
+    EVT FromEltVT = FromVT.getVectorElementType();
+
+    unsigned Multiple = 0;
+    if (FromEltVT == MVT::i32 && (ToEltVT == MVT::i16 || ToEltVT == MVT::i8))
+      Multiple = 4;
+    if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
+      Multiple = 8;
+    if (Multiple == 0 || FromVT.getVectorNumElements() == Multiple ||
+        FromVT.getVectorNumElements() % Multiple != 0)
+      return SDValue();
+
+    SDLoc DL(St);
+    // Details about the old store
+    SDValue Ch = St->getChain();
+    SDValue BasePtr = St->getBasePtr();
+    unsigned Alignment = St->getOriginalAlignment();
+    MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
+    AAMDNodes AAInfo = St->getAAInfo();
+
+    EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, Multiple);
+    EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, Multiple);
+
+    SmallVector<SDValue, 4> Stores;
+    for (unsigned i = 0; i < FromVT.getVectorNumElements() / Multiple; i++) {
+      unsigned NewOffset = i * Multiple * ToEltVT.getSizeInBits()/8;
+      SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+      SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT,
+                                    Ext.getOperand(0),
+                                    DAG.getConstant(i*Multiple, DL, MVT::i32));
+      SDValue Store =
+          DAG.getTruncStore(Ch, DL, Extract, NewPtr,
+                            St->getPointerInfo().getWithOffset(NewOffset),
+                            NewToVT, Alignment, MMOFlags, AAInfo);
+      Stores.push_back(Store);
+    }
+    return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
+  };
+  if (Subtarget->hasMVEIntegerOps())
+    if (SDValue NewToken = trySplittingToNarrowingStores(St, DCI.DAG))
+      return NewToken;

   if (!ISD::isNormalStore(St))
     return SDValue();
@@ -13292,7 +13348,7 @@
   }

   // If this is a legal vector store, try to combine it into a VST1_UPD.
-  if (ISD::isNormalStore(N) && VT.isVector() &&
+  if (Subtarget->hasNEON() && ISD::isNormalStore(N) && VT.isVector() &&
       DCI.DAG.getTargetLoweringInfo().isTypeLegal(VT))
     return CombineBaseUpdate(N, DCI);

@@ -14202,7 +14258,7 @@
   case ARMISD::BFI: return PerformBFICombine(N, DCI);
   case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
   case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
-  case ISD::STORE: return PerformSTORECombine(N, DCI);
+  case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
   case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
   case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
   case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
Index: llvm/test/CodeGen/Thumb2/float-ops.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/float-ops.ll
+++ llvm/test/CodeGen/Thumb2/float-ops.ll
@@ -130,7 +130,7 @@
 entry:
 ; CHECK-LABEL: store_d:
 ; NOREGS: strd r2, r3, [r0]
-; ONLYREGS: vstr d0, [r0]
+; ONLYREGS: strd r2, r3, [r0]
 ; HARD: vstr d0, [r0]
   store double %b, double* %a, align 8
   ret void
Index: llvm/test/CodeGen/Thumb2/mve-masked-store.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-masked-store.ll
+++ llvm/test/CodeGen/Thumb2/mve-masked-store.ll
@@ -521,15 +521,11 @@
 ; CHECK-LE-NEXT: bfi r3, r1, #0, #1
 ; CHECK-LE-NEXT: and r1, r3, #3
 ; CHECK-LE-NEXT: lsls r2, r3, #31
-; CHECK-LE-NEXT: ittt ne
-; CHECK-LE-NEXT: vmovne r2, s1
-; CHECK-LE-NEXT: vmovne r3, s0
-; CHECK-LE-NEXT: strdne r3, r2, [r0]
+; CHECK-LE-NEXT: it ne
+; CHECK-LE-NEXT: vstrne d0, [r0]
 ; CHECK-LE-NEXT: lsls r1, r1, #30
-; CHECK-LE-NEXT: ittt mi
-; CHECK-LE-NEXT: vmovmi r1, s3
-; CHECK-LE-NEXT: vmovmi r2, s2
-; CHECK-LE-NEXT: strdmi r2, r1, [r0, #8]
+; CHECK-LE-NEXT: it mi
+; CHECK-LE-NEXT: vstrmi d1, [r0, #8]
 ; CHECK-LE-NEXT: add sp, #4
 ; CHECK-LE-NEXT: bx lr
 ;
@@ -558,25 +554,11 @@
 ; CHECK-BE-NEXT: bfi r3, r1, #0, #1
 ; CHECK-BE-NEXT: and r1, r3, #3
 ; CHECK-BE-NEXT: lsls r2, r3, #31
-; CHECK-BE-NEXT: bne .LBB19_3
-; CHECK-BE-NEXT: @ %bb.1: @ %else
-; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bmi .LBB19_4
-; CHECK-BE-NEXT: .LBB19_2: @ %else2
-; CHECK-BE-NEXT: add sp, #4
-; CHECK-BE-NEXT: bx lr
-; CHECK-BE-NEXT: .LBB19_3: @ %cond.store
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r2, s5
-; CHECK-BE-NEXT: vmov r3, s4
-; CHECK-BE-NEXT: strd r3, r2, [r0]
+; CHECK-BE-NEXT: it ne
+; CHECK-BE-NEXT: vstrne d0, [r0]
 ; CHECK-BE-NEXT: lsls r1, r1, #30
-; CHECK-BE-NEXT: bpl .LBB19_2
-; CHECK-BE-NEXT: .LBB19_4: @ %cond.store1
-; CHECK-BE-NEXT: vrev64.32 q1, q0
-; CHECK-BE-NEXT: vmov r1, s7
-; CHECK-BE-NEXT: vmov r2, s6
-; CHECK-BE-NEXT: strd r2, r1, [r0, #8]
+; CHECK-BE-NEXT: it mi
+; CHECK-BE-NEXT: vstrmi d1, [r0, #8]
 ; CHECK-BE-NEXT: add sp, #4
 ; CHECK-BE-NEXT: bx lr
 entry:
Index: llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
===================================================================
--- llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
+++ llvm/test/CodeGen/Thumb2/mve-widen-narrow.ll
@@ -45,43 +45,14 @@
 define void @foo_int8_int32_double(<16 x i8>* %dest, <16 x i32>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int8_int32_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[0], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[1], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[2], r2
-; CHECK-NEXT: vmov r2, s7
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmov.8 q0[3], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[4], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[5], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[6], r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vldrw.u32 q1, [r1, #32]
-; CHECK-NEXT: vmov.8 q0[7], r2
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.8 q0[8], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.8 q0[9], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.8 q0[10], r2
-; CHECK-NEXT: vmov r2, s7
-; CHECK-NEXT: vldrw.u32 q1, [r1, #48]
-; CHECK-NEXT: vmov.8 q0[11], r2
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vmov.8 q0[12], r1
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov.8 q0[13], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov.8 q0[14], r1
-; CHECK-NEXT: vmov r1, s7
-; CHECK-NEXT: vmov.8 q0[15], r1
-; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: vldrw.u32 q2, [r1, #32]
+; CHECK-NEXT: vldrw.u32 q3, [r1, #48]
+; CHECK-NEXT: vstrb.32 q1, [r0, #4]
+; CHECK-NEXT: vstrb.32 q0, [r0]
+; CHECK-NEXT: vstrb.32 q3, [r0, #12]
+; CHECK-NEXT: vstrb.32 q2, [r0, #8]
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <16 x i32>, <16 x i32>* %src, align 4
@@ -93,25 +64,10 @@
 define void @foo_int16_int32_double(<8 x i16>* %dest, <8 x i32>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int16_int32_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vmov r2, s4
-; CHECK-NEXT: vmov.16 q0[0], r2
-; CHECK-NEXT: vmov r2, s5
-; CHECK-NEXT: vmov.16 q0[1], r2
-; CHECK-NEXT: vmov r2, s6
-; CHECK-NEXT: vmov.16 q0[2], r2
-; CHECK-NEXT: vmov r2, s7
+; CHECK-NEXT: vldrw.u32 q0, [r1]
 ; CHECK-NEXT: vldrw.u32 q1, [r1, #16]
-; CHECK-NEXT: vmov.16 q0[3], r2
-; CHECK-NEXT: vmov r1, s4
-; CHECK-NEXT: vmov.16 q0[4], r1
-; CHECK-NEXT: vmov r1, s5
-; CHECK-NEXT: vmov.16 q0[5], r1
-; CHECK-NEXT: vmov r1, s6
-; CHECK-NEXT: vmov.16 q0[6], r1
-; CHECK-NEXT: vmov r1, s7
-; CHECK-NEXT: vmov.16 q0[7], r1
-; CHECK-NEXT: vstrh.16 q0, [r0]
+; CHECK-NEXT: vstrh.32 q1, [r0, #8]
+; CHECK-NEXT: vstrh.32 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <8 x i32>, <8 x i32>* %src, align 4
@@ -123,41 +79,10 @@
 define void @foo_int8_int16_double(<16 x i8>* %dest, <16 x i16>* readonly %src, i32 %n) {
 ; CHECK-LABEL: foo_int8_int16_double:
 ; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: vldrh.u16 q1, [r1]
-; CHECK-NEXT: vmov.u16 r2, q1[0]
-; CHECK-NEXT: vmov.8 q0[0], r2
-; CHECK-NEXT: vmov.u16 r2, q1[1]
-; CHECK-NEXT: vmov.8 q0[1], r2
-; CHECK-NEXT: vmov.u16 r2, q1[2]
-; CHECK-NEXT: vmov.8 q0[2], r2
-; CHECK-NEXT: vmov.u16 r2, q1[3]
-; CHECK-NEXT: vmov.8 q0[3], r2
-; CHECK-NEXT: vmov.u16 r2, q1[4]
-; CHECK-NEXT: vmov.8 q0[4], r2
-; CHECK-NEXT: vmov.u16 r2, q1[5]
-; CHECK-NEXT: vmov.8 q0[5], r2
-; CHECK-NEXT: vmov.u16 r2, q1[6]
-; CHECK-NEXT: vmov.8 q0[6], r2
-; CHECK-NEXT: vmov.u16 r2, q1[7]
+; CHECK-NEXT: vldrh.u16 q0, [r1]
 ; CHECK-NEXT: vldrh.u16 q1, [r1, #16]
-; CHECK-NEXT: vmov.8 q0[7], r2
-; CHECK-NEXT: vmov.u16 r1, q1[0]
-; CHECK-NEXT: vmov.8 q0[8], r1
-; CHECK-NEXT: vmov.u16 r1, q1[1]
-; CHECK-NEXT: vmov.8 q0[9], r1
-; CHECK-NEXT: vmov.u16 r1, q1[2]
-; CHECK-NEXT: vmov.8 q0[10], r1
-; CHECK-NEXT: vmov.u16 r1, q1[3]
-; CHECK-NEXT: vmov.8 q0[11], r1
-; CHECK-NEXT: vmov.u16 r1, q1[4]
-; CHECK-NEXT: vmov.8 q0[12], r1
-; CHECK-NEXT: vmov.u16 r1, q1[5]
-; CHECK-NEXT: vmov.8 q0[13], r1
-; CHECK-NEXT: vmov.u16 r1, q1[6]
-; CHECK-NEXT: vmov.8 q0[14], r1
-; CHECK-NEXT: vmov.u16 r1, q1[7]
-; CHECK-NEXT: vmov.8 q0[15], r1
-; CHECK-NEXT: vstrb.8 q0, [r0]
+; CHECK-NEXT: vstrb.16 q1, [r0, #8]
+; CHECK-NEXT: vstrb.16 q0, [r0]
 ; CHECK-NEXT: bx lr
 entry:
   %wide.load = load <16 x i16>, <16 x i16>* %src, align 2