Index: lib/Target/AArch64/AArch64ISelLowering.h
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.h
+++ lib/Target/AArch64/AArch64ISelLowering.h
@@ -524,6 +524,8 @@
                           SmallVectorImpl<SDValue> &InVals, bool isThisReturn,
                           SDValue ThisVal) const;
 
+  SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG) const;
+
   SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
 
   bool isEligibleForTailCallOptimization(
Index: lib/Target/AArch64/AArch64ISelLowering.cpp
===================================================================
--- lib/Target/AArch64/AArch64ISelLowering.cpp
+++ lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -740,6 +740,8 @@
       setOperationAction(ISD::FTRUNC, Ty, Legal);
       setOperationAction(ISD::FROUND, Ty, Legal);
     }
+
+    setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
   }
 
   PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -2651,6 +2653,68 @@
   }
 }
 
+// Custom lower trunc stores of v4i8 vectors, since v4i8 is promoted to v4i16.
+static SDValue LowerTruncateVectorStore(SDLoc DL, StoreSDNode *ST,
+                                        EVT VT, EVT MemVT,
+                                        SelectionDAG &DAG) {
+  assert(VT.isVector() && "VT should be a vector type");
+  assert(MemVT == MVT::v4i8 && VT == MVT::v4i16);
+
+  SDValue Value = ST->getValue();
+
+  // First extend the promoted v4i16 to v8i16, truncate it to v8i8, and extract
+  // the word lane which represents the v4i8 subvector.  This optimizes the
+  // store to:
+  //
+  //   xtn  v0.8b, v0.8h
+  //   str  s0, [x0]
+
+  SDValue Undef = DAG.getUNDEF(MVT::i16);
+  SDValue UndefVec = DAG.getBuildVector(MVT::v4i16, DL,
+                                        {Undef, Undef, Undef, Undef});
+
+  SDValue TruncExt = DAG.getNode(ISD::CONCAT_VECTORS, DL, MVT::v8i16,
+                                 Value, UndefVec);
+  SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i8, TruncExt);
+
+  Trunc = DAG.getNode(ISD::BITCAST, DL, MVT::v2i32, Trunc);
+  SDValue ExtractTrunc = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+                                     Trunc, DAG.getConstant(0, DL, MVT::i64));
+
+  return DAG.getStore(ST->getChain(), DL, ExtractTrunc,
+                      ST->getBasePtr(), ST->getMemOperand());
+}
+
+// Custom lowering for any store, vector or scalar, normal or truncating.
+// Currently we only custom lower truncating stores from v4i16 vectors to
+// v4i8.
+SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
+                                          SelectionDAG &DAG) const {
+  SDLoc Dl(Op);
+  StoreSDNode *StoreNode = cast<StoreSDNode>(Op);
+  assert(StoreNode && "Can only custom lower store nodes");
+
+  SDValue Value = StoreNode->getValue();
+
+  EVT VT = Value.getValueType();
+  EVT MemVT = StoreNode->getMemoryVT();
+
+  assert(VT.isVector() && "Can only custom lower vector store types");
+
+  unsigned AS = StoreNode->getAddressSpace();
+  unsigned Align = StoreNode->getAlignment();
+  if (Align < MemVT.getStoreSize() &&
+      !allowsMisalignedMemoryAccesses(MemVT, AS, Align, nullptr)) {
+    return scalarizeVectorStore(StoreNode, DAG);
+  }
+
+  if (StoreNode->isTruncatingStore()) {
+    return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
+  }
+
+  return SDValue();
+}
+
 SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
                                               SelectionDAG &DAG) const {
   LLVM_DEBUG(dbgs() << "Custom lowering: ");
@@ -2760,6 +2824,8 @@
     return LowerMULH(Op, DAG);
   case ISD::INTRINSIC_WO_CHAIN:
     return LowerINTRINSIC_WO_CHAIN(Op, DAG);
+  case ISD::STORE:
+    return LowerSTORE(Op, DAG);
   case ISD::VECREDUCE_ADD:
   case ISD::VECREDUCE_SMAX:
   case ISD::VECREDUCE_SMIN:
Index: lib/Target/AArch64/AArch64TargetTransformInfo.cpp
===================================================================
--- lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -634,10 +634,11 @@
     return LT.first * 2 * AmortizationCost;
   }
 
+  // We use a custom trunc store lowering, so v.4b stores should be profitable.
   if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8) &&
-      Ty->getVectorNumElements() < 8) {
-    // We scalarize the loads/stores because there is not v.4b register and we
-    // have to promote the elements to v.4h.
+      Ty->getVectorNumElements() < 4) {
+    // We scalarize the loads/stores because there is no v.2b register and we
+    // have to promote the elements to v.2s.
     unsigned NumVecElts = Ty->getVectorNumElements();
     unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
     // We generate 2 instructions per vector element.
Index: test/Analysis/CostModel/AArch64/store.ll
===================================================================
--- test/Analysis/CostModel/AArch64/store.ll
+++ test/Analysis/CostModel/AArch64/store.ll
@@ -59,11 +59,11 @@
   ; these types (they get extended to v.4h/v.2s).
   ; CHECK: cost of 16 {{.*}} store
   store <2 x i8> undef, <2 x i8> * undef
-  ; CHECK: cost of 64 {{.*}} store
+  ; CHECK: cost of 1 {{.*}} store
   store <4 x i8> undef, <4 x i8> * undef
   ; CHECK: cost of 16 {{.*}} load
   load <2 x i8> , <2 x i8> * undef
-  ; CHECK: cost of 64 {{.*}} load
+  ; CHECK: cost of 1 {{.*}} load
   load <4 x i8> , <4 x i8> * undef
 
   ret void
Index: test/CodeGen/AArch64/neon-truncStore-extLoad.ll
===================================================================
--- test/CodeGen/AArch64/neon-truncStore-extLoad.ll
+++ test/CodeGen/AArch64/neon-truncStore-extLoad.ll
@@ -20,10 +20,20 @@
   ret void
 }
 
+define void @truncStore.v4i8(<4 x i32> %a, <4 x i8>* %result) {
+; CHECK-LABEL: truncStore.v4i8:
+; CHECK: xtn [[TMP:(v[0-9]+)]].4h, v{{[0-9]+}}.4s
+; CHECK: xtn [[TMP2:(v[0-9]+)]].8b, [[TMP]].8h
+; CHECK: {{st1 { [[TMP2]].4h }[0]|str s[0-9]+}}, [x{{[0-9]+|sp}}]
+  %b = trunc <4 x i32> %a to <4 x i8>
+  store <4 x i8> %b, <4 x i8>* %result
+  ret void
+}
+
 define void @truncStore.v8i16(<8 x i16> %a, <8 x i8>* %result) {
 ; CHECK-LABEL: truncStore.v8i16:
-; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
-; CHECK: {{st1 { v[0-9]+.8b }|str d[0-9]+}}, [x{{[0-9]+|sp}}]
+; CHECK: xtn v{{[0-9]+}}.8b, v{{[0-9]+}}.8h
+; CHECK-NEXT: str d{{[0-9]+}}, [x{{[0-9]+}}]
   %b = trunc <8 x i16> %a to <8 x i8>
   store <8 x i8> %b, <8 x i8>* %result
   ret void
Index: test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
===================================================================
--- test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
+++ test/Transforms/LoopVectorize/AArch64/interleaved-vs-scalar.ll
@@ -11,11 +11,10 @@
 %pair = type { i8, i8 }
 
 ; CHECK-LABEL: test
-; CHECK: Found an estimated cost of 20 for VF 2 For instruction: {{.*}} load i8
+; CHECK: Found an estimated cost of 16 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: Found an estimated cost of 0 for VF 2 For instruction: {{.*}} load i8
 ; CHECK: vector.body
-; CHECK: load i8
-; CHECK: load i8
+; CHECK: load <4 x i8>
 ; CHECK: br i1 {{.*}}, label %middle.block, label %vector.body
 
 define void @test(%pair* %p, i64 %n) {
Index: test/Transforms/SLPVectorizer/AArch64/gather-root.ll
===================================================================
--- test/Transforms/SLPVectorizer/AArch64/gather-root.ll
+++ test/Transforms/SLPVectorizer/AArch64/gather-root.ll
@@ -235,12 +235,8 @@
 ;
 ; MAX-COST-LABEL: @PR32038(
 ; MAX-COST-NEXT:  entry:
-; MAX-COST-NEXT:    [[TMP0:%.*]] = load <2 x i8>, <2 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <2 x i8>*), align 1
-; MAX-COST-NEXT:    [[TMP1:%.*]] = icmp eq <2 x i8> [[TMP0]], zeroinitializer
-; MAX-COST-NEXT:    [[TMP4:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 3), align 1
-; MAX-COST-NEXT:    [[TMPP5:%.*]] = icmp eq i8 [[TMP4]], 0
-; MAX-COST-NEXT:    [[TMP6:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 4), align 4
-; MAX-COST-NEXT:    [[TMPP7:%.*]] = icmp eq i8 [[TMP6]], 0
+; MAX-COST-NEXT:    [[TMP0:%.*]] = load <4 x i8>, <4 x i8>* bitcast (i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 1) to <4 x i8>*), align 1
+; MAX-COST-NEXT:    [[V1:%.*]] = icmp eq <4 x i8> [[TMP0]], zeroinitializer
 ; MAX-COST-NEXT:    [[TMP8:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 5), align 1
 ; MAX-COST-NEXT:    [[TMP9:%.*]] = icmp eq i8 [[TMP8]], 0
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = load i8, i8* getelementptr inbounds ([80 x i8], [80 x i8]* @a, i64 0, i64 6), align 2
@@ -252,13 +248,15 @@
 ; MAX-COST-NEXT:    br label [[FOR_BODY:%.*]]
 ; MAX-COST:       for.body:
 ; MAX-COST-NEXT:    [[TMP17:%.*]] = phi i32 [ [[TMP34:%.*]], [[FOR_BODY]] ], [ 0, [[ENTRY:%.*]] ]
-; MAX-COST-NEXT:    [[TMP2:%.*]] = extractelement <2 x i1> [[TMP1]], i32 0
-; MAX-COST-NEXT:    [[TMP3:%.*]] = insertelement <4 x i1> undef, i1 [[TMP2]], i32 0
-; MAX-COST-NEXT:    [[TMP4:%.*]] = extractelement <2 x i1> [[TMP1]], i32 1
-; MAX-COST-NEXT:    [[TMP5:%.*]] = insertelement <4 x i1> [[TMP3]], i1 [[TMP4]], i32 1
-; MAX-COST-NEXT:    [[TMP6:%.*]] = insertelement <4 x i1> [[TMP5]], i1 [[TMPP5]], i32 2
-; MAX-COST-NEXT:    [[TMP7:%.*]] = insertelement <4 x i1> [[TMP6]], i1 [[TMPP7]], i32 3
-; MAX-COST-NEXT:    [[TMP8:%.*]] = select <4 x i1> [[TMP7]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
+; MAX-COST-NEXT:    [[V2:%.*]] = extractelement <4 x i1> [[V1]], i32 0
+; MAX-COST-NEXT:    [[V3:%.*]] = insertelement <4 x i1> undef, i1 [[V2]], i32 0
+; MAX-COST-NEXT:    [[V4:%.*]] = extractelement <4 x i1> [[V1]], i32 1
+; MAX-COST-NEXT:    [[V5:%.*]] = insertelement <4 x i1> [[V3]], i1 [[V4]], i32 1
+; MAX-COST-NEXT:    [[V6:%.*]] = extractelement <4 x i1> [[V1]], i32 2
+; MAX-COST-NEXT:    [[V7:%.*]] = insertelement <4 x i1> [[V5]], i1 [[V6]], i32 2
+; MAX-COST-NEXT:    [[V8:%.*]] = extractelement <4 x i1> [[V1]], i32 3
+; MAX-COST-NEXT:    [[V9:%.*]] = insertelement <4 x i1> [[V7]], i1 [[V8]], i32 3
+; MAX-COST-NEXT:    [[V10:%.*]] = select <4 x i1> [[V9]], <4 x i32> <i32 -720, i32 -720, i32 -720, i32 -720>, <4 x i32> <i32 -80, i32 -80, i32 -80, i32 -80>
 ; MAX-COST-NEXT:    [[TMP20:%.*]] = add i32 -5, undef
 ; MAX-COST-NEXT:    [[TMP22:%.*]] = add i32 [[TMP20]], undef
 ; MAX-COST-NEXT:    [[TMP24:%.*]] = add i32 [[TMP22]], undef
@@ -266,7 +264,7 @@
 ; MAX-COST-NEXT:    [[TMP27:%.*]] = select i1 [[TMP9]], i32 -720, i32 -80
 ; MAX-COST-NEXT:    [[TMP28:%.*]] = add i32 [[TMP26]], [[TMP27]]
 ; MAX-COST-NEXT:    [[TMP29:%.*]] = select i1 [[TMP11]], i32 -720, i32 -80
-; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[TMP8]])
+; MAX-COST-NEXT:    [[TMP9:%.*]] = call i32 @llvm.experimental.vector.reduce.add.i32.v4i32(<4 x i32> [[V10]])
 ; MAX-COST-NEXT:    [[TMP10:%.*]] = add i32 [[TMP9]], [[TMP27]]
 ; MAX-COST-NEXT:    [[TMP11:%.*]] = add i32 [[TMP10]], [[TMP29]]
 ; MAX-COST-NEXT:    [[BIN_EXTRA:%.*]] = add i32 [[TMP11]], -5
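
Note (not part of the patch): a minimal IR sketch of the path this change targets, with a made-up function name. A v4i16 value truncated and stored as v4i8 is legalized into a v4i16-to-v4i8 truncating store, which now hits the custom lowering above; compiling it with llc for an AArch64 triple should produce the xtn plus single str of an s register shown in the LowerTruncateVectorStore comment instead of a scalarized store.

; Hypothetical reproducer, assuming the store is naturally aligned.
define void @store_trunc_v4i8(<4 x i16> %v, <4 x i8>* %p) {
  %t = trunc <4 x i16> %v to <4 x i8>
  store <4 x i8> %t, <4 x i8>* %p
  ret void
}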