Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -6989,27 +6989,42 @@
 
   // TODO: This could be better with wider vectors that will be split to v2f16,
   // and to consider uses since there aren't that many packed operations.
-  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16) {
+  if (N0.getOpcode() == ISD::BUILD_VECTOR && VT == MVT::v2f16 &&
+      isTypeLegal(MVT::v2f16)) {
     SDLoc SL(N);
     SDValue NewElts[2];
     SDValue Lo = N0.getOperand(0);
     SDValue Hi = N0.getOperand(1);
+    EVT EltVT = Lo.getValueType();
+
     if (vectorEltWillFoldAway(Lo) || vectorEltWillFoldAway(Hi)) {
       for (unsigned I = 0; I != 2; ++I) {
         SDValue Op = N0.getOperand(I);
-        EVT EltVT = Op.getValueType();
         if (ConstantFPSDNode *CFP = dyn_cast<ConstantFPSDNode>(Op)) {
           NewElts[I] = getCanonicalConstantFP(DAG, SL, EltVT,
                                               CFP->getValueAPF());
         } else if (Op.isUndef()) {
-          // This would ordinarily be folded to a qNaN. Since this may be half
-          // of a packed operation, it may be cheaper to use a 0.
-          NewElts[I] = DAG.getConstantFP(0.0f, SL, EltVT);
+          // Handled below based on what the other operand is.
+          NewElts[I] = Op;
         } else {
           NewElts[I] = DAG.getNode(ISD::FCANONICALIZE, SL, EltVT, Op);
         }
       }
 
+      // If one half is undef, and one is constant, perfer a splat vector rather
+      // than the normal qNaN. If it's a register, prefer 0.0 since that's
+      // cheaper to use amd may be free with a packed operation.
+      if (NewElts[0].isUndef()) {
+        if (isa<ConstantFPSDNode>(NewElts[1]))
+          NewElts[0] = isa<ConstantFPSDNode>(NewElts[1]) ?
+            NewElts[1]: DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
+      if (NewElts[1].isUndef()) {
+        NewElts[1] = isa<ConstantFPSDNode>(NewElts[0]) ?
+          NewElts[0] : DAG.getConstantFP(0.0f, SL, EltVT);
+      }
+
       return DAG.getBuildVector(VT, SL, NewElts);
     }
   }
Index: test/CodeGen/AMDGPU/clamp.ll
===================================================================
--- test/CodeGen/AMDGPU/clamp.ll
+++ test/CodeGen/AMDGPU/clamp.ll
@@ -688,6 +688,38 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts0:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half 0.0, half undef>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half undef, half 1.0>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
+; GCN-LABEL: {{^}}v_clamp_v2f16_undef_limit_elts1:
+; GCN: {{buffer|flat|global}}_load_dword [[A:v[0-9]+]]
+; GFX9-NOT: [[A]]
+; GFX9: v_pk_max_f16 [[CLAMP:v[0-9]+]], [[A]], [[A]] clamp{{$}}
+define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(<2 x half> addrspace(1)* %out, <2 x half> addrspace(1)* %aptr) #0 {
+  %tid = call i32 @llvm.amdgcn.workitem.id.x()
+  %gep0 = getelementptr <2 x half>, <2 x half> addrspace(1)* %aptr, i32 %tid
+  %out.gep = getelementptr <2 x half>, <2 x half> addrspace(1)* %out, i32 %tid
+  %a = load <2 x half>, <2 x half> addrspace(1)* %gep0
+  %max = call <2 x half> @llvm.maxnum.v2f16(<2 x half> %a, <2 x half> <half undef, half 0.0>)
+  %med = call <2 x half> @llvm.minnum.v2f16(<2 x half> %max, <2 x half> <half 1.0, half undef>)
+
+  store <2 x half> %med, <2 x half> addrspace(1)* %out.gep
+  ret void
+}
+
 ; GCN-LABEL: {{^}}v_clamp_diff_source_f32:
 ; GCN: v_add_f32_e32 [[A:v[0-9]+]]
 ; GCN: v_add_f32_e32 [[B:v[0-9]+]]
Index: test/CodeGen/AMDGPU/fcanonicalize.f16.ll
===================================================================
--- test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -565,20 +565,71 @@
 }
 
 ; GCN-LABEL: {{^}}v_test_canonicalize_undef_reg_v2f16:
-; GFX9: s_waitcnt
-; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
-; GFX9-NEXT: v_lshl_or_b32 v0, v0, 16, 0
-; GFX9-NEXT: s_setpc_b64
-
-; VI: s_waitcnt
-; VI-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; VI-NEXT: s_setpc_b64
+; GFX89: s_waitcnt
+; GFX89-NEXT: v_max_f16_sdwa v0, v0, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX89-NEXT: s_setpc_b64
 define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
   %vec = insertelement <2 x half> undef, half %val, i32 1
   %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
   ret <2 x half> %canonicalized
 }
 
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_imm_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 1.0
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_imm_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_imm_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x3c003c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 1.0
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_imm_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 1.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_undef_lo_k_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
+; CI-NEXT: v_mov_b32_e32 v1, 0x41800000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_undef_lo_k_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 1
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
+; GCN-LABEL: {{^}}v_test_canonicalize_k_lo_undef_hi_v2f16:
+; GCN: s_waitcnt
+; GFX89-NEXT: v_mov_b32_e32 v0, 0x4c004c00
+; GFX89-NEXT: s_setpc_b64
+
+; CI-NEXT: v_mov_b32_e32 v0, 0x41800000
+; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
+; CI-NEXT: s_setpc_b64
+define <2 x half> @v_test_canonicalize_k_lo_undef_hi_v2f16() #1 {
+  %vec = insertelement <2 x half> undef, half 16.0, i32 0
+  %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %vec)
+  ret <2 x half> %canonicalized
+}
+
 ; GCN-LABEL: {{^}}v_test_canonicalize_reg_k_v2f16:
 ; GFX9: s_waitcnt
 ; GFX9-DAG: v_max_f16_e32 v0, v0, v0