Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -742,6 +742,8 @@
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2673,6 +2675,68 @@
   }
 }
 
+// Custom lower trunc store for v4i8 vectors, since it is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(const SDLoc &DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // First extend the promoted v4i16 to v8i16, truncate to v8i8, and extract
+  // the word lane that represents the v4i8 subvector. This optimizes the
+  // store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, default or truncating.
+// Currently only the truncating store from vector v4i16 to v4i8 is custom
+// lowered.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2784,6 +2848,8 @@
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,14 +634,22 @@
     return LT.first * 2 * AmortizationCost;
   }
 
-  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
-    unsigned NumVecElts = Ty->getVectorNumElements();
-    unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
-    // We generate 2 instructions per vector element.
-    return NumVectorizableInstsToAmortize * NumVecElts * 2;
+  if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+    unsigned ProfitableNumElements;
+    if (Opcode == Instruction::Store)
+      // We use a custom trunc store lowering so v.4b should be profitable.
+      ProfitableNumElements = 4;
+    else
+      // We scalarize the loads because there is no v.4b register and we
+      // have to promote the elements to v.4h/v.2s.
+      ProfitableNumElements = 8;
+
+    if (Ty->getVectorNumElements() < ProfitableNumElements) {
+      unsigned NumVecElts = Ty->getVectorNumElements();
+      unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
+      // We generate 2 instructions per vector element.
+      return NumVectorizableInstsToAmortize * NumVecElts * 2;
+    }
   }
 
   return LT.first;
Index: test/Analysis/CostModel/AArch64/store.ll
===================================================================
--- test/Analysis/CostModel/AArch64/store.ll
+++ test/Analysis/CostModel/AArch64/store.ll
@@ -59,7 +59,7 @@
     ; these types (they get extended to v.4h/v.2s).
     ; CHECK: cost of 16 {{.*}} store
     store <2 x i8> undef, <2 x i8> * undef
-    ; CHECK: cost of 64 {{.*}} store
+    ; CHECK: cost of 1 {{.*}} store
     store <4 x i8> undef, <4 x i8> * undef
     ; CHECK: cost of 16 {{.*}} load
     load <2 x i8> , <2 x i8> * undef
Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll
===================================================================
--- test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -20,10 +20,20 @@
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK: {{st1 { [[TMP2]].4h }[0]|str s[0-9]+}}, [x{{[0-9]+|sp}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
-; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+; CHECK-NEXT: str d{{[0-9]+}}, [x{{[0-9]+}}]
   %b = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %b, <8 x i8>* %result
   ret void
Index: test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -15,7 +15,6 @@
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
 ; CHECK: load i8
-; CHECK: load i8
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {