Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -740,6 +740,8 @@
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2651,6 +2653,68 @@
   }
 }
 
+// Custom lower trunc stores of v4i8 vectors, since v4i8 is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
+  // the word lane which represents the v4i8 subvector.  This optimizes the
+  // store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, normal or truncating.
+// Currently we only custom lower truncating stores from v4i16 vectors to
+// v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2760,6 +2824,8 @@
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,10 +634,11 @@
     return LT.first * 2 * AmortizationCost;
   }
 
+  // We use a custom trunc store lowering, so v.4b stores should be profitable.
   if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
+      Ty->getVectorNumElements() < 4) {
+    // We scalarize the loads/stores because there is no v.2b register and we
+    // have to promote the elements to v.2s.
     unsigned NumVecElts = Ty->getVectorNumElements();
     unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
     // We generate 2 instructions per vector element.
Index: test/Analysis/CostModel/AArch64/store.ll
===================================================================
--- test/Analysis/CostModel/AArch64/store.ll
+++ test/Analysis/CostModel/AArch64/store.ll
@@ -59,11 +59,11 @@
   ; these types (they get extended to v.4h/v.2s).
   ; CHECK: cost of 16 {{.*}} store
   store <2 x i8> undef, <2 x i8> * undef
-  ; CHECK: cost of 64 {{.*}} store
+  ; CHECK: cost of 1 {{.*}} store
   store <4 x i8> undef, <4 x i8> * undef
   ; CHECK: cost of 16 {{.*}} load
   load <2 x i8> , <2 x i8> * undef
-  ; CHECK: cost of 64 {{.*}} load
+  ; CHECK: cost of 1 {{.*}} load
   load <4 x i8> , <4 x i8> * undef
 
   ret void
Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll
===================================================================
--- test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -20,10 +20,20 @@
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK: {{st1 { [[TMP2]].4h }[0]|str s[0-9]+}}, [x{{[0-9]+|sp}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
-; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+; CHECK-NEXT: str d{{[0-9]+}}, [x{{[0-9]+}}]
   %b = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %b, <8 x i8>* %result
   ret void
Index: test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -11,11 +11,10 @@
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
-; CHECK: load i8
-; CHECK: load i8
+; CHECK: load <4 x i8>
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {
Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -235,12 +235,8 @@
 ;
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT:    [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; MAX-COST-NEXT:    [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
+; MAX-COST-NEXT:    [[V1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
 ; MAX-COST-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
@@ -252,13 +248,15 @@
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[V2:%.*]] = extractelement <4 x i1> [[V1]], i32 0
+; MAX-COST-NEXT:    [[V3:%.*]] = insertelement <4 x i1> undef, i1 [[V2]], i32 0
+; MAX-COST-NEXT:    [[V4:%.*]] = extractelement <4 x i1> [[V1]], i32 1
+; MAX-COST-NEXT:    [[V5:%.*]] = insertelement <4 x i1> [[V3]], i1 [[V4]], i32 1
+; MAX-COST-NEXT:    [[V6:%.*]] = extractelement <4 x i1> [[V1]], i32 2
+; MAX-COST-NEXT:    [[V7:%.*]] = insertelement <4 x i1> [[V5]], i1 [[V6]], i32 2
+; MAX-COST-NEXT:    [[V8:%.*]] = extractelement <4 x i1> [[V1]], i32 3
+; MAX-COST-NEXT:    [[V9:%.*]] = insertelement <4 x i1> [[V7]], i1 [[V8]], i32 3
+; MAX-COST-NEXT:    [[V10:%.*]] = select <4 x i1> [[V9]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[TMP20:%.*]] = add i32 -5, undef
 ; MAX-COST-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], undef
 ; MAX-COST-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], undef
@@ -266,7 +264,7 @@
 ; MAX-COST-NEXT:    [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
 ; MAX-COST-NEXT:    [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[V10]])
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]]
 ; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]]
 ; MAX-COST-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5
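
Note (not part of the patch): a minimal IR sketch of the path this change targets, with a made-up function name. A v4i16 value truncated and stored as v4i8 is legalized into a v4i16-to-v4i8 truncating store, which now hits the custom lowering above; compiling it with llc for an AArch64 triple should produce the xtn plus single str of an s register shown in the LowerTruncateVectorStore comment instead of a scalarized store.

; Hypothetical reproducer, assuming the store is naturally aligned.
define void @store_trunc_v4i8(<4 x i16> %v, <4 x i8>* %p) {
  %t = trunc <4 x i16> %v to <4 x i8>
  store <4 x i8> %t, <4 x i8>* %p
  ret void
}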