Index: lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
===================================================================
--- lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -614,6 +614,29 @@
   std::reverse(Parts, Parts + OrigNumParts);
 }
 
+static SDValue widenVectorToPartType(SelectionDAG &DAG,
+                                     SDValue Val, const SDLoc &DL, EVT PartVT) {
+  EVT ValueVT = Val.getValueType();
+  if (PartVT.isVector() &&
+      PartVT.getVectorElementType() == ValueVT.getVectorElementType() &&
+      PartVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
+    EVT ElementVT = PartVT.getVectorElementType();
+    // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
+    // undef elements.
+    SmallVector<SDValue, 16> Ops;
+    DAG.ExtractVectorElements(Val, Ops);
+    SDValue EltUndef = DAG.getUNDEF(ElementVT);
+    for (unsigned i = ValueVT.getVectorNumElements(),
+         e = PartVT.getVectorNumElements(); i != e; ++i)
+      Ops.push_back(EltUndef);
+
+    // FIXME: Use CONCAT for 2x -> 4x.
+    return DAG.getBuildVector(PartVT, DL, Ops);
+  }
+
+  return SDValue();
+}
+
 /// getCopyToPartsVector - Create a series of nodes that contain the specified
 /// value split into legal parts.
 static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
@@ -632,28 +655,8 @@
   } else if (PartVT.getSizeInBits() == ValueVT.getSizeInBits()) {
     // Bitconvert vector->vector case.
     Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
-  } else if (PartVT.isVector() &&
-             PartEVT.getVectorElementType() == ValueVT.getVectorElementType() &&
-             PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements()) {
-    EVT ElementVT = PartVT.getVectorElementType();
-    // Vector widening case, e.g. <2 x float> -> <4 x float>. Shuffle in
-    // undef elements.
-    SmallVector<SDValue, 16> Ops;
-    for (unsigned i = 0, e = ValueVT.getVectorNumElements(); i != e; ++i)
-      Ops.push_back(DAG.getNode(
-          ISD::EXTRACT_VECTOR_ELT, DL, ElementVT, Val,
-          DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
-
-    for (unsigned i = ValueVT.getVectorNumElements(),
-         e = PartVT.getVectorNumElements(); i != e; ++i)
-      Ops.push_back(DAG.getUNDEF(ElementVT));
-
-    Val = DAG.getBuildVector(PartVT, DL, Ops);
-
-    // FIXME: Use CONCAT for 2x -> 4x.
-
-    //SDValue UndefElts = DAG.getUNDEF(VectorTy);
-    //Val = DAG.getNode(ISD::CONCAT_VECTORS, DL, PartVT, Val, UndefElts);
+  } else if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, PartVT)) {
+    Val = Widened;
   } else if (PartVT.isVector() &&
              PartEVT.getVectorElementType().bitsGE(
                  ValueVT.getVectorElementType()) &&
@@ -701,27 +704,35 @@
   NumParts = NumRegs; // Silence a compiler warning.
   assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
 
+  unsigned IntermediateNumElts = IntermediateVT.isVector() ?
+    IntermediateVT.getVectorNumElements() : 1;
+
   // Convert the vector to the appropriate type if necessary.
-  unsigned DestVectorNoElts =
-      NumIntermediates *
-      (IntermediateVT.isVector() ? IntermediateVT.getVectorNumElements() : 1);
+  unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
+
   EVT BuiltVectorTy = EVT::getVectorVT(
       *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
-  if (Val.getValueType() != BuiltVectorTy)
+  MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+  if (ValueVT != BuiltVectorTy) {
+    if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
+      Val = Widened;
+
     Val = DAG.getNode(ISD::BITCAST, DL, BuiltVectorTy, Val);
+  }
 
   // Split the vector into intermediate operands.
   SmallVector<SDValue, 8> Ops(NumIntermediates);
   for (unsigned i = 0; i != NumIntermediates; ++i) {
-    if (IntermediateVT.isVector())
-      Ops[i] =
-          DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
-                      DAG.getConstant(i * (NumElements / NumIntermediates), DL,
-                                      TLI.getVectorIdxTy(DAG.getDataLayout())));
-    else
+    unsigned SubVecIdx = i * IntermediateNumElts;
+
+    if (IntermediateVT.isVector()) {
+      Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+                           DAG.getConstant(SubVecIdx, DL, IdxVT));
+    } else {
       Ops[i] = DAG.getNode(
           ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
-          DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+          DAG.getConstant(i, DL, IdxVT));
+    }
   }
 
   // Split the intermediate operands into legal parts.
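
As a sanity check on the breakdown arithmetic above: for a <3 x half> value on
a subtarget with v2f16 registers, NumIntermediates = (3 + 1) / 2 = 2,
widenVectorToPartType pads the value with one undef lane to 4 elements, and the
two v2f16 subvectors start at elements 0 and 2. Since Val is widened to exactly
NumIntermediates * IntermediateNumElts elements, i * IntermediateNumElts is
always in bounds (the old i * (NumElements / NumIntermediates) form was not,
once NumElements stopped being a multiple of the part size). A minimal
standalone model of that arithmetic; the names mirror the patch, but this is
illustrative C++, not LLVM code:

#include <cassert>
#include <cstdio>

// Standalone model of the split done by the patched getCopyToPartsVector for
// a <3 x half> value broken into v2f16 intermediates (e.g. gfx9).
int main() {
  const unsigned NumElements = 3;          // <3 x half>
  const unsigned IntermediateNumElts = 2;  // v2f16
  const unsigned NumIntermediates = (NumElements + 1) / 2;  // 2 registers

  // widenVectorToPartType pads the value with undef lanes to exactly this
  // many elements, so subvector i always starts at i * IntermediateNumElts.
  const unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
  assert(DestVectorNoElts >= NumElements);

  for (unsigned i = 0; i != NumIntermediates; ++i)
    printf("intermediate %u covers elements [%u, %u)\n", i,
           i * IntermediateNumElts, (i + 1) * IntermediateNumElts);
  return 0;
}
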
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -712,9 +712,7 @@
     if (Size == 64)
       return MVT::i32;
 
-    if (Size == 16 &&
-        Subtarget->has16BitInsts() &&
-        isPowerOf2_32(VT.getVectorNumElements()))
+    if (Size == 16 && Subtarget->has16BitInsts())
       return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
   }
 
@@ -736,8 +734,8 @@
       return 2 * NumElts;
 
     // FIXME: Fails to break down as we want with v3.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
-      return VT.getVectorNumElements() / 2;
+    if (Size == 16 && Subtarget->has16BitInsts())
+      return (VT.getVectorNumElements() + 1) / 2;
   }
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -768,10 +766,10 @@
     // FIXME: We should fix the ABI to be the same on targets without 16-bit
     // support, but unless we can properly handle 3-vectors, it will be still be
     // inconsistent.
-    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+    if (Size == 16 && Subtarget->has16BitInsts()) {
       RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
       IntermediateVT = RegisterVT;
-      NumIntermediates = NumElts / 2;
+      NumIntermediates = (NumElts + 1) / 2;
       return NumIntermediates;
     }
   }
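
A note on the f16 immediates checked in the tests below: IEEE half encodes
1.0, 2.0, and 4.0 as 0x3c00, 0x4000, and 0x4400, and elements 0 and 1 of a
<3 x half> share one VGPR with element 0 in the low 16 bits, which is where
the expected v0 = 0x40003c00 and v1 = 0x4400 come from. A quick standalone
check of those bit patterns (illustrative only, not part of the patch):

#include <cstdint>
#include <cstdio>

// IEEE half: 1 sign bit, 5 exponent bits (bias 15), 10 mantissa bits.
// A power of two has a zero mantissa, so only the exponent field is set.
static uint16_t halfPow2(int log2x) {
  return static_cast<uint16_t>((15 + log2x) << 10);
}

int main() {
  const uint16_t one = halfPow2(0);   // 0x3c00
  const uint16_t two = halfPow2(1);   // 0x4000
  const uint16_t four = halfPow2(2);  // 0x4400
  // Elements 0 and 1 of <3 x half> <1.0, 2.0, 4.0> pack into one VGPR,
  // element 0 in the low 16 bits; element 2 occupies the next VGPR.
  const uint32_t v0 = static_cast<uint32_t>(two) << 16 | one;
  printf("v0 = 0x%08x, v1 = 0x%04x\n", v0, four);  // 0x40003c00, 0x4400
  return 0;
}
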
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -399,18 +399,35 @@
   ret void
 }
 
-; FIXME: materialize constant directly in VGPR
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16() #0 {
+  %val = load <3 x half>, <3 x half> addrspace(1)* undef
+  call void @external_void_func_v3f16(<3 x half> %val)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
-; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
-; GFX9-DAG: s_mov_b32 [[K2:s[0-9]+]], 3
-; GFX9: v_mov_b32_e32 v0, [[K01]]
-; GFX9: v_mov_b32_e32 v1, [[K2]]
+; GFX9: v_mov_b32_e32 v0, 0x20001
+; GFX9: v_mov_b32_e32 v1, 3
 ; GFX9: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
   call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3f16_imm:
+; GFX9: v_mov_b32_e32 v0, 0x40003c00
+; GFX9: v_mov_b32_e32 v1, 0x4400
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3f16_imm() #0 {
+  call void @external_void_func_v3f16(<3 x half> <half 1.0, half 2.0, half 4.0>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
 ; GFX9: buffer_load_dwordx2 v[0:1]
 ; GFX9-NOT: v0
Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -504,16 +504,15 @@
 ; FIXME: Extra 4th component handled
 ; GCN-LABEL: {{^}}v_test_canonicalize_var_v3f16:
 ; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT: v_pk_max_f16 v0, v0, v0
+; GFX9-NEXT: v_pk_max_f16 v1, v1, v1
 ; GFX9-NEXT: s_setpc_b64
 
-; VI-DAG: v_max_f16_sdwa [[CANON_ELT3:v[0-9]+]], v1, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
-; VI-DAG: v_max_f16_e32 [[CANON_ELT2:v[0-9]+]], v1, v1
 ; VI-DAG: v_max_f16_sdwa [[CANON_ELT1:v[0-9]+]], v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:WORD_1
 ; VI-DAG: v_max_f16_e32 [[CANON_ELT0:v[0-9]+]], v0, v0
+; VI-DAG: v_max_f16_e32 v1, v1, v1
 ; VI-DAG: v_or_b32_e32 v0, [[CANON_ELT0]], [[CANON_ELT1]]
-; VI-DAG: v_or_b32_e32 v1, [[CANON_ELT2]], [[CANON_ELT3]]
+
 ; VI: s_setpc_b64
 define <3 x half> @v_test_canonicalize_var_v3f16(<3 x half> %val) #1 {
   %canonicalized = call <3 x half> @llvm.canonicalize.v3f16(<3 x half> %val)
Index: test/CodeGen/AMDGPU/function-args.ll
===================================================================
--- test/CodeGen/AMDGPU/function-args.ll
+++ test/CodeGen/AMDGPU/function-args.ll
@@ -314,8 +314,17 @@
 }
 
 ; GCN-LABEL: {{^}}void_func_v5i16:
-; GCN-DAG: buffer_store_short v4, off,
-; GCN-DAG: buffer_store_dwordx2 v[1:2], off
+; CI: v_lshlrev_b32
+; CI: v_and_b32
+; CI: v_lshlrev_b32
+; CI: v_or_b32
+; CI: v_or_b32
+; CI-DAG: buffer_store_short v
+; CI-DAG: buffer_store_dwordx2 v
+
+; GFX89-DAG: buffer_store_short v2, off,
+; GFX89-DAG: buffer_store_dwordx2 v[0:1], off
+
 define void @void_func_v5i16(<5 x i16> %arg0) #0 {
   store <5 x i16> %arg0, <5 x i16> addrspace(1)* undef
   ret void
Index: test/CodeGen/AMDGPU/function-returns.ll
===================================================================
--- test/CodeGen/AMDGPU/function-returns.ll
+++ test/CodeGen/AMDGPU/function-returns.ll
@@ -317,13 +317,13 @@
   ret <4 x half> %val
 }
 
+; FIXME: Mixing buffer and global
 ; FIXME: Should not scalarize
 ; GCN-LABEL: {{^}}v5i16_func_void:
 ; GFX9: buffer_load_dwordx2 v[0:1]
-; GFX9: buffer_load_ushort v4
-; GFX9: v_lshrrev_b32_e32 v5, 16, v0
-; GFX9: v_lshrrev_b32_e32 v3, 16, v1
-; GCN: s_setpc_b64
+; GFX9-NEXT: global_load_short_d16 v2
+; GFX9-NEXT: s_waitcnt
+; GFX9-NEXT: s_setpc_b64
 define <5 x i16> @v5i16_func_void() #0 {
   %ptr = load volatile <5 x i16> addrspace(1)*, <5 x i16> addrspace(1)* addrspace(4)* undef
   %val = load <5 x i16>, <5 x i16> addrspace(1)* %ptr
Index: test/CodeGen/AMDGPU/mad-mix-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -94,12 +94,10 @@
 
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32:
 ; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v3
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -149,11 +147,11 @@
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_postcvt:
 ; GCN: s_waitcnt
 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
 ; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
@@ -246,15 +244,16 @@
 
 ; FIXME: Handling undef 4th component
 ; GCN-LABEL: {{^}}v_mad_mix_v3f32_clamp_precvt:
-; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
-; GFX9: v_cvt_f16_f32
+; GCN: s_waitcnt
+; GFX9-NEXT: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX9-NEXT: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_cvt_f16_f32_e32 v2, v6
+; GFX9-NEXT: v_cvt_f16_f32_e32 v1, v1
+; GFX9-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GFX9-NEXT: v_lshl_or_b32 v0, v2, 16, v0
+; GFX9-NEXT: s_setpc_b64
 define <3 x half> @v_mad_mix_v3f32_clamp_precvt(<3 x half> %src0, <3 x half> %src1, <3 x half> %src2) #0 {
   %src0.ext = fpext <3 x half> %src0 to <3 x float>
   %src1.ext = fpext <3 x half> %src1 to <3 x float>