Index: lib/Target/ARM/ARMISelDAGToDAG.cpp =================================================================== --- lib/Target/ARM/ARMISelDAGToDAG.cpp +++ lib/Target/ARM/ARMISelDAGToDAG.cpp @@ -1763,12 +1763,14 @@ default: llvm_unreachable("unhandled vld type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; case MVT::v1i64: OpcodeIndex = 3; break; // Quad-register operations: case MVT::v16i8: OpcodeIndex = 0; break; + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 1; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 2; break; @@ -2070,10 +2072,12 @@ default: llvm_unreachable("unhandled vld/vst lane type"); // Double-register operations: case MVT::v8i8: OpcodeIndex = 0; break; + case MVT::v4f16: case MVT::v4i16: OpcodeIndex = 1; break; case MVT::v2f32: case MVT::v2i32: OpcodeIndex = 2; break; // Quad-register operations: + case MVT::v8f16: case MVT::v8i16: OpcodeIndex = 0; break; case MVT::v4f32: case MVT::v4i32: OpcodeIndex = 1; break; Index: lib/Target/ARM/ARMISelLowering.cpp =================================================================== --- lib/Target/ARM/ARMISelLowering.cpp +++ lib/Target/ARM/ARMISelLowering.cpp @@ -5734,16 +5734,16 @@ static SDValue isNEONModifiedImm(uint64_t SplatBits, uint64_t SplatUndef, unsigned SplatBitSize, SelectionDAG &DAG, const SDLoc &dl, EVT &VT, bool is128Bits, - NEONModImmType type) { + NEONModImmType type, bool FP16 = false) { unsigned OpCmode, Imm; // SplatBitSize is set to the smallest size that splats the vector, so a // zero vector will always have SplatBitSize == 8. However, NEON modified // immediate instructions others than VMOV do not support the 8-bit encoding // of a zero vector, and the default encoding of zero is supposed to be the - // 32-bit version. + // 32-bit version, and the 16-bit version for f16 vectors. 
if (SplatBits == 0) - SplatBitSize = 32; + SplatBitSize = FP16 ? 16 : 32; switch (SplatBitSize) { case 8: @@ -6384,10 +6384,11 @@ if (SplatBitSize <= 64) { // Check if an immediate VMOV works. EVT VmovVT; + const bool FP16 = (VT == MVT::v4f16 || VT == MVT::v8f16); SDValue Val = isNEONModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT, VT.is128BitVector(), - VMOVModImm); + VMOVModImm, FP16); if (Val.getNode()) { SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); @@ -6465,9 +6466,10 @@ unsigned EltSize = VT.getScalarSizeInBits(); - // Use VDUP for non-constant splats. For f32 constant splats, reduce to - // i32 and try again. + // Use VDUP for non-constant splats. For f32 and f16 constant splats, reduce to + // i32 and i16 and try again. if (hasDominantValue && EltSize <= 32) { + EVT IntEltType = (EltSize == 32 ? MVT::i32 : MVT::i16); if (!isConstant) { SDValue N; @@ -6514,9 +6516,9 @@ if (VT.getVectorElementType().isFloatingPoint()) { SmallVector Ops; for (unsigned i = 0; i < NumElts; ++i) - Ops.push_back(DAG.getNode(ISD::BITCAST, dl, MVT::i32, + Ops.push_back(DAG.getNode(ISD::BITCAST, dl, IntEltType, Op.getOperand(i))); - EVT VecVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts); + EVT VecVT = EVT::getVectorVT(*DAG.getContext(), IntEltType, NumElts); SDValue Val = DAG.getBuildVector(VecVT, dl, Ops); Val = LowerBUILD_VECTOR(Val, DAG, ST); if (Val.getNode()) Index: lib/Target/ARM/ARMInstrNEON.td =================================================================== --- lib/Target/ARM/ARMInstrNEON.td +++ lib/Target/ARM/ARMInstrNEON.td @@ -7142,6 +7142,7 @@ } def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>; let Predicates = [IsLE] in { + def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (v4i16 DPR:$src)>; def : 
Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (v4i16 DPR:$src)>; @@ -7185,6 +7186,7 @@ } def : Pat<(v4i32 (bitconvert (v4f32 QPR:$src))), (v4i32 QPR:$src)>; let Predicates = [IsLE] in { + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (v8i16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (v8i16 QPR:$src)>; @@ -7223,6 +7225,7 @@ def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>; def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>; + def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>; def : Pat<(v4i16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>; @@ -7256,6 +7259,7 @@ def : Pat<(v8i16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; + def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8f16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>; def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>; def : Pat<(v16i8 (bitconvert (v2i64 QPR:$src))), (VREV64q8 QPR:$src)>; Index: test/CodeGen/ARM/fp16-reduction.ll =================================================================== --- /dev/null +++ test/CodeGen/ARM/fp16-reduction.ll @@ -0,0 +1,227 @@ +; RUN: llc < %s | FileCheck %s +; RUN: llc -mtriple armeb-unknown < %s | FileCheck %s + +target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" +target triple = "armv8.2a-arm-unknown-eabihf" + +define dso_local float @vec8_zero_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { 
+entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x half> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec8_zero_init: +; CHECK: vmov.i16 q8, #0x0 + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 2 + %2 = fadd fast <8 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <8 x half> %2, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx = fadd fast <8 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx12 = fadd fast <8 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <8 x half> %bin.rdx12, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx14 = fadd fast <8 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v8f16 extractelement +; %4 = extractelement <8 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label 
%for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ %Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + + +define dso_local float @vec8_one_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, %for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <8 x half> [ , %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec8_one_init: +; CHECK: adr r2, .LCPI1_2 +; CHECK: .LCPI1_2: +; CHECK-NEXT: .short 15360 @ half 1 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 +; CHECK-NEXT: .short 0 @ half 0 + + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <8 x half>* + %wide.load = load <8 x half>, <8 x half>* %1, align 2 + %2 = fadd fast <8 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, 
%n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <8 x half> %2, <8 x half> undef, <8 x i32> <i32 4, i32 5, i32 6, i32 7, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx = fadd fast <8 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <8 x half> %bin.rdx, <8 x half> undef, <8 x i32> <i32 2, i32 3, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx12 = fadd fast <8 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <8 x half> %bin.rdx12, <8 x half> undef, <8 x i32> <i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + %bin.rdx14 = fadd fast <8 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v8f16 extractelement +; %4 = extractelement <8 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label %for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ %Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + + + +define dso_local float @vec4_zero_init(half* nocapture readonly %V, i32 %N) local_unnamed_addr #0 { +entry: + %cmp6 = icmp sgt i32 %N, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + +for.body.preheader: + %min.iters.check = icmp ult i32 %N, 8 + br i1 %min.iters.check, label %for.body.preheader15, label %vector.ph + +for.body.preheader15: + %i.08.ph = phi i32 [ 0, 
%for.body.preheader ], [ %n.vec, %middle.block ] + %Tmp.07.ph = phi half [ 0xH0000, %for.body.preheader ], [ 0xH8000, %middle.block ] + br label %for.body + +vector.ph: + %n.vec = and i32 %N, -8 + br label %vector.body + +vector.body: + %index = phi i32 [ 0, %vector.ph ], [ %index.next, %vector.body ] + %vec.phi = phi <4 x half> [ zeroinitializer, %vector.ph ], [ %2, %vector.body ] + +; CHECK-LABEL: vec4_zero_init: +; CHECK: vmov.i16 d16, #0x0 + + %0 = getelementptr inbounds half, half* %V, i32 %index + %1 = bitcast half* %0 to <4 x half>* + %wide.load = load <4 x half>, <4 x half>* %1, align 2 + %2 = fadd fast <4 x half> %wide.load, %vec.phi + %index.next = add i32 %index, 8 + %3 = icmp eq i32 %index.next, %n.vec + br i1 %3, label %middle.block, label %vector.body + +middle.block: + %rdx.shuf = shufflevector <4 x half> %2, <4 x half> undef, <4 x i32> + %bin.rdx = fadd fast <4 x half> %2, %rdx.shuf + %rdx.shuf11 = shufflevector <4 x half> %bin.rdx, <4 x half> undef, <4 x i32> + %bin.rdx12 = fadd fast <4 x half> %bin.rdx, %rdx.shuf11 + %rdx.shuf13 = shufflevector <4 x half> %bin.rdx12, <4 x half> undef, <4 x i32> + %bin.rdx14 = fadd fast <4 x half> %bin.rdx12, %rdx.shuf13 + +; TODO: support v4f16 extractelement +; %4 = extractelement <4 x half> %bin.rdx14, i32 0 + %cmp.n = icmp eq i32 %n.vec, %N + br i1 %cmp.n, label %for.cond.cleanup.loopexit, label %for.body.preheader15 + +for.cond.cleanup.loopexit: +; %add.lcssa = phi half [ %4, %middle.block ], [ %add, %for.body ] + %add.lcssa = phi half [ 0.000000e+00, %middle.block ], [ %add, %for.body ] + %phitmp = bitcast half %add.lcssa to i16 + %phitmp9 = zext i16 %phitmp to i32 + %phitmp10 = bitcast i32 %phitmp9 to float + br label %for.cond.cleanup + +for.cond.cleanup: + %Tmp.0.lcssa = phi float [ 0.000000e+00, %entry ], [ %phitmp10, %for.cond.cleanup.loopexit ] + ret float %Tmp.0.lcssa + +for.body: + %i.08 = phi i32 [ %inc, %for.body ], [ %i.08.ph, %for.body.preheader15 ] + %Tmp.07 = phi half [ %add, %for.body ], [ 
%Tmp.07.ph, %for.body.preheader15 ] + %arrayidx = getelementptr inbounds half, half* %V, i32 %i.08 + %V5 = load half, half* %arrayidx, align 2 + %add = fadd fast half %V5, %Tmp.07 + %inc = add nuw nsw i32 %i.08, 1 + %exitcond = icmp eq i32 %inc, %N + br i1 %exitcond, label %for.cond.cleanup.loopexit, label %for.body +} + +attributes #0 = { norecurse nounwind readonly "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "min-legal-vector-width"="0" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="true" "no-jump-tables"="false" "no-nans-fp-math"="true" "no-signed-zeros-fp-math"="true" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+armv8.2-a,+crc,+crypto,+dsp,+fp-armv8,+fullfp16,+hwdiv,+hwdiv-arm,+neon,+ras,+strict-align,-thumb-mode" "unsafe-fp-math"="true" "use-soft-float"="false" }