Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1351,9 +1351,12 @@
 
 TargetLoweringBase::LegalizeTypeAction
 SITargetLowering::getPreferredVectorAction(MVT VT) const {
-  if (VT.getVectorNumElements() != 1 && VT.getScalarType().bitsLE(MVT::i16))
-    return TypeSplitVector;
-
+  // Multi-element vectors whose scalar type is no wider than i16 are split
+  // when VT is a power-of-2 vector type and widened otherwise (e.g. v3i16);
+  // every other type is left to the generic TargetLoweringBase choice.
+  unsigned NumElts = VT.getVectorNumElements();
+  if (NumElts != 1 && VT.getScalarType().bitsLE(MVT::i16))
+    return VT.isPow2VectorType() ? TypeSplitVector : TypeWidenVector;
   return TargetLoweringBase::getPreferredVectorAction(VT);
 }
 
Index: test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
===================================================================
--- test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
+++ test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll
@@ -165,11 +165,110 @@
   ret { i32, half } %ins1
 }
 
+; The <3 x i16> result of the call in %if.else is merged with undef by a phi
+; in %if.end and then stored; the checks expect the store to be emitted as a
+; short store of v1 and a dword store of v0.
+define amdgpu_kernel void @v3i16_registers(i1 %cond) #0 {
+; GCN-LABEL: v3i16_registers:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT:    s_mov_b32 s33, s9
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s4, 1, s4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT:    s_cbranch_vccz BB4_2
+; GCN-NEXT:  ; %bb.1:
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_branch BB4_3
+; GCN-NEXT:  BB4_2: ; %if.else
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, func_v3i16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, func_v3i16@gotpcrel32@hi+4
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:  BB4_3: ; %if.end
+; GCN-NEXT:    global_store_short v[0:1], v1, off
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+entry:
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call6 = tail call <3 x i16> @func_v3i16() #0
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %call6.sink = phi <3 x i16> [ %call6, %if.else ], [ undef, %if.then ]
+  store <3 x i16> %call6.sink, <3 x i16> addrspace(1)* undef
+  ret void
+}
+
+; Same shape as the <3 x i16> case above, but with <3 x half>: the call result
+; from %if.else is merged with undef by a phi in %if.end and then stored; the
+; checks expect a short store of v1 and a dword store of v0.
+define amdgpu_kernel void @v3f16_registers(i1 %cond) #0 {
+; GCN-LABEL: v3f16_registers:
+; GCN:       ; %bb.0: ; %entry
+; GCN-NEXT:    s_load_dword s4, s[4:5], 0x0
+; GCN-NEXT:    s_mov_b32 s33, s9
+; GCN-NEXT:    s_add_u32 flat_scratch_lo, s6, s33
+; GCN-NEXT:    s_addc_u32 flat_scratch_hi, s7, 0
+; GCN-NEXT:    s_mov_b32 s32, s33
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_and_b32 s4, 1, s4
+; GCN-NEXT:    v_cmp_eq_u32_e64 s[4:5], s4, 1
+; GCN-NEXT:    s_and_b64 vcc, exec, s[4:5]
+; GCN-NEXT:    s_cbranch_vccz BB5_2
+; GCN-NEXT:  ; %bb.1:
+; GCN-NEXT:    s_mov_b32 s4, 0
+; GCN-NEXT:    s_mov_b32 s5, s4
+; GCN-NEXT:    v_mov_b32_e32 v0, s4
+; GCN-NEXT:    v_mov_b32_e32 v1, s5
+; GCN-NEXT:    s_branch BB5_3
+; GCN-NEXT:  BB5_2: ; %if.else
+; GCN-NEXT:    s_getpc_b64 s[4:5]
+; GCN-NEXT:    s_add_u32 s4, s4, func_v3f16@gotpcrel32@lo+4
+; GCN-NEXT:    s_addc_u32 s5, s5, func_v3f16@gotpcrel32@hi+4
+; GCN-NEXT:    s_load_dwordx2 s[4:5], s[4:5], 0x0
+; GCN-NEXT:    s_waitcnt lgkmcnt(0)
+; GCN-NEXT:    s_swappc_b64 s[30:31], s[4:5]
+; GCN-NEXT:  BB5_3: ; %if.end
+; GCN-NEXT:    global_store_short v[0:1], v1, off
+; GCN-NEXT:    global_store_dword v[0:1], v0, off
+; GCN-NEXT:    s_endpgm
+entry:
+  br i1 %cond, label %if.then, label %if.else
+
+if.then:                                          ; preds = %entry
+  br label %if.end
+
+if.else:                                          ; preds = %entry
+  %call6 = tail call <3 x half> @func_v3f16() #0
+  br label %if.end
+
+if.end:                                           ; preds = %if.else, %if.then
+  %call6.sink = phi <3 x half> [ %call6, %if.else ], [ undef, %if.then ]
+  store <3 x half> %call6.sink, <3 x half> addrspace(1)* undef
+  ret void
+}
 
 declare hidden <2 x float> @func_v2f32() #0
 declare hidden <3 x float> @func_v3f32() #0
 declare hidden <4 x float> @func_v4f32() #0
 declare hidden <4 x half> @func_v4f16() #0
+declare <3 x i16> @func_v3i16()
+declare <3 x half> @func_v3f16()
 
 declare hidden { <4 x i32>, <4 x half> } @func_struct() #0