Index: lib/Target/AMDGPU/SIISelLowering.h
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.h
+++ lib/Target/AMDGPU/SIISelLowering.h
@@ -25,6 +25,17 @@
 private:
   const SISubtarget *Subtarget;
 
+public:
+  MVT getRegisterTypeForCallingConv(LLVMContext &Context,
+                                    EVT VT) const override;
+  unsigned getNumRegistersForCallingConv(LLVMContext &Context,
+                                         EVT VT) const override;
+
+  unsigned getVectorTypeBreakdownForCallingConv(
+    LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+    unsigned &NumIntermediates, MVT &RegisterVT) const override;
+
+private:
   SDValue lowerKernArgParameterPtr(SelectionDAG &DAG, const SDLoc &SL,
                                    SDValue Chain, uint64_t Offset) const;
   SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -693,6 +693,46 @@
   return false;
 }
 
+MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
+                                                    EVT VT) const {
+  if (VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return ScalarVT.getSimpleVT();
+  }
+
+  return TargetLowering::getRegisterTypeForCallingConv(Context, VT);
+}
+
+unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
+                                                         EVT VT) const {
+  if (VT.isVector() && VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32)
+      return 3;
+  }
+
+  return TargetLowering::getNumRegistersForCallingConv(Context, VT);
+}
+
+unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
+  LLVMContext &Context, EVT VT, EVT &IntermediateVT,
+  unsigned &NumIntermediates, MVT &RegisterVT) const {
+  if (VT.getVectorNumElements() == 3) {
+    EVT ScalarVT = VT.getScalarType();
+    if (ScalarVT.getSizeInBits() == 32 ||
+        ScalarVT.getSizeInBits() == 64) {
+      RegisterVT = ScalarVT.getSimpleVT();
+      IntermediateVT = RegisterVT;
+      NumIntermediates = 3;
+      return NumIntermediates;
+    }
+  }
+
+  return TargetLowering::getVectorTypeBreakdownForCallingConv(
+    Context, VT, IntermediateVT, NumIntermediates, RegisterVT);
+}
+
 bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
                                           const CallInst &CI,
                                           MachineFunction &MF,
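[Not part of the patch, for context: these three hooks are what SelectionDAG argument and return lowering queries when it splits a value of illegal type across physical registers, so overriding them is what keeps a 3-element vector from being widened to 4 elements at call boundaries. A minimal IR sketch of the intended effect; the function name here is made up for illustration:

  ; With the default TargetLowering behavior, <3 x i32> is widened to
  ; <4 x i32>, so %arg0 occupies v0-v3 and %arg1 arrives in v4. With
  ; the overrides above, %arg0 is split into three i32 pieces in v0-v2
  ; and %arg1 packs into v3.
  define void @v3_packing_sketch(<3 x i32> %arg0, i32 %arg1) {
    store volatile <3 x i32> %arg0, <3 x i32> addrspace(1)* undef
    store volatile i32 %arg1, i32 addrspace(1)* undef
    ret void
  }

The *_wasted_reg tests added to function-args.ll below check exactly this packing.]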
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -27,6 +27,7 @@
 declare void @external_void_func_v2i32(<2 x i32>) #0
 declare void @external_void_func_v3i32(<3 x i32>) #0
+declare void @external_void_func_v3i32_i32(<3 x i32>, i32) #0
 declare void @external_void_func_v4i32(<4 x i32>) #0
 declare void @external_void_func_v8i32(<8 x i32>) #0
 declare void @external_void_func_v16i32(<16 x i32>) #0
@@ -313,15 +314,14 @@
   ret void
 }
 
-; FIXME: Passing 4th
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
 
-; GCN-DAG: v_mov_b32_e32 v0
-; GCN-DAG: v_mov_b32_e32 v1
-; GCN-DAG: v_mov_b32_e32 v2
-; GCN-DAG: v_mov_b32_e32 v3
+; GCN-DAG: v_mov_b32_e32 v0, 3
+; GCN-DAG: v_mov_b32_e32 v1, 4
+; GCN-DAG: v_mov_b32_e32 v2, 5
+; GCN-NOT: v3
 
 ; GCN: s_swappc_b64
 define amdgpu_kernel void @test_call_external_void_func_v3i32_imm(i32) #0 {
@@ -329,6 +329,16 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_i32:
+; GCN-DAG: v_mov_b32_e32 v0, 3
+; GCN-DAG: v_mov_b32_e32 v1, 4
+; GCN-DAG: v_mov_b32_e32 v2, 5
+; GCN-DAG: v_mov_b32_e32 v3, 6
+define amdgpu_kernel void @test_call_external_void_func_v3i32_i32(i32) #0 {
+  call void @external_void_func_v3i32_i32(<3 x i32> <i32 3, i32 4, i32 5>, i32 6)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i32:
 ; GCN: buffer_load_dwordx4 v[0:3]
 ; GCN: s_waitcnt
Index: test/CodeGen/AMDGPU/fmaxnum.ll
===================================================================
--- test/CodeGen/AMDGPU/fmaxnum.ll
+++ test/CodeGen/AMDGPU/fmaxnum.ll
@@ -3,6 +3,7 @@
 declare float @llvm.maxnum.f32(float, float) #0
 declare <2 x float> @llvm.maxnum.v2f32(<2 x float>, <2 x float>) #0
+declare <3 x float> @llvm.maxnum.v3f32(<3 x float>, <3 x float>) #0
 declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) #0
 declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) #0
 declare <16 x float> @llvm.maxnum.v16f32(<16 x float>, <16 x float>) #0
@@ -33,6 +34,17 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_fmax_v3f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI-NOT: v_max_f32
+define amdgpu_kernel void @test_fmax_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16
+  ret void
+}
+
 ; FUNC-LABEL: @test_fmax_v4f32
 ; SI: v_max_f32_e32
 ; SI: v_max_f32_e32
@@ -280,4 +292,14 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_func_fmax_v3f32:
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI: v_max_f32_e32
+; SI-NOT: v_max_f32
+define <3 x float> @test_func_fmax_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.maxnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  ret <3 x float> %val
+}
+
 attributes #0 = { nounwind readnone }
Index: test/CodeGen/AMDGPU/fminnum.ll
===================================================================
--- test/CodeGen/AMDGPU/fminnum.ll
+++ test/CodeGen/AMDGPU/fminnum.ll
@@ -4,6 +4,7 @@
 declare float @llvm.minnum.f32(float, float) #0
 declare <2 x float> @llvm.minnum.v2f32(<2 x float>, <2 x float>) #0
+declare <3 x float> @llvm.minnum.v3f32(<3 x float>, <3 x float>) #0
 declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) #0
 declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) #0
 declare <16 x float> @llvm.minnum.v16f32(<16 x float>, <16 x float>) #0
@@ -278,4 +279,14 @@
   ret void
 }
 
+; FUNC-LABEL: {{^}}test_func_fmin_v3f32:
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI: v_min_f32_e32
+; SI-NOT: v_min_f32
+define <3 x float> @test_func_fmin_v3f32(<3 x float> %a, <3 x float> %b) nounwind {
+  %val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0
+  ret <3 x float> %val
+}
+
 attributes #0 = { nounwind readnone }
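[Coverage note, not part of the patch: fmaxnum.ll gains both a kernel test and a function-return test for v3f32, while fminnum.ll only gains the function-return test. If the kernel-argument path for minnum should be covered symmetrically, a sketch mirroring test_fmax_v3f32 would be:

  ; FUNC-LABEL: {{^}}test_fmin_v3f32:
  ; SI: v_min_f32_e32
  ; SI: v_min_f32_e32
  ; SI: v_min_f32_e32
  ; SI-NOT: v_min_f32
  define amdgpu_kernel void @test_fmin_v3f32(<3 x float> addrspace(1)* %out, <3 x float> %a, <3 x float> %b) nounwind {
    %val = call <3 x float> @llvm.minnum.v3f32(<3 x float> %a, <3 x float> %b) #0
    store <3 x float> %val, <3 x float> addrspace(1)* %out, align 16
    ret void
  }]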
Index: test/CodeGen/AMDGPU/function-args.ll
===================================================================
--- test/CodeGen/AMDGPU/function-args.ll
+++ test/CodeGen/AMDGPU/function-args.ll
@@ -739,6 +739,45 @@
   ret void
 }
 
+; Make sure v3 isn't a wasted register because of v3 types being promoted to v4
+; GCN-LABEL: {{^}}void_func_v3f32_wasted_reg:
+; GCN: s_waitcnt
+; GCN: ds_write_b32 v{{[0-9]+}}, v0
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @void_func_v3f32_wasted_reg(<3 x float> %arg0, i32 %arg1) #0 {
+  %arg0.0 = extractelement <3 x float> %arg0, i32 0
+  %arg0.1 = extractelement <3 x float> %arg0, i32 1
+  %arg0.2 = extractelement <3 x float> %arg0, i32 2
+  store volatile float %arg0.0, float addrspace(3)* undef
+  store volatile float %arg0.1, float addrspace(3)* undef
+  store volatile float %arg0.2, float addrspace(3)* undef
+  store volatile i32 %arg1, i32 addrspace(3)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}void_func_v3i32_wasted_reg:
+; GCN: s_waitcnt
+; GCN: ds_write_b32 v{{[0-9]+}}, v0
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v1
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v2
+; GCN-NEXT: ds_write_b32 v{{[0-9]+}}, v3
+; GCN-NEXT: s_waitcnt
+; GCN-NEXT: s_setpc_b64
+define void @void_func_v3i32_wasted_reg(<3 x i32> %arg0, i32 %arg1) #0 {
+  %arg0.0 = extractelement <3 x i32> %arg0, i32 0
+  %arg0.1 = extractelement <3 x i32> %arg0, i32 1
+  %arg0.2 = extractelement <3 x i32> %arg0, i32 2
+  store volatile i32 %arg0.0, i32 addrspace(3)* undef
+  store volatile i32 %arg0.1, i32 addrspace(3)* undef
+  store volatile i32 %arg0.2, i32 addrspace(3)* undef
+  store volatile i32 %arg1, i32 addrspace(3)* undef
+  ret void
+}
+
 ; Check there is no crash.
 ; GCN-LABEL: {{^}}void_func_v16i8:
 define void @void_func_v16i8(<16 x i8> %arg0) #0 {
Index: test/CodeGen/AMDGPU/function-returns.ll
===================================================================
--- test/CodeGen/AMDGPU/function-returns.ll
+++ test/CodeGen/AMDGPU/function-returns.ll
@@ -531,4 +531,43 @@
   ret { i32, <32 x i32> }%val
 }
 
+; Make sure the last struct component is returned in v3, not v4.
+; GCN-LABEL: {{^}}v3i32_struct_func_void_wasted_reg:
+; GCN: ds_read_b32 v0,
+; GCN: ds_read_b32 v1,
+; GCN: ds_read_b32 v2,
+; GCN: ds_read_b32 v3,
+define { <3 x i32>, i32 } @v3i32_struct_func_void_wasted_reg() #0 {
+  %load0 = load volatile i32, i32 addrspace(3)* undef
+  %load1 = load volatile i32, i32 addrspace(3)* undef
+  %load2 = load volatile i32, i32 addrspace(3)* undef
+  %load3 = load volatile i32, i32 addrspace(3)* undef
+
+  %insert.0 = insertelement <3 x i32> undef, i32 %load0, i32 0
+  %insert.1 = insertelement <3 x i32> %insert.0, i32 %load1, i32 1
+  %insert.2 = insertelement <3 x i32> %insert.1, i32 %load2, i32 2
+  %insert.3 = insertvalue { <3 x i32>, i32 } undef, <3 x i32> %insert.2, 0
+  %insert.4 = insertvalue { <3 x i32>, i32 } %insert.3, i32 %load3, 1
+  ret { <3 x i32>, i32 } %insert.4
+}
+
+; GCN-LABEL: {{^}}v3f32_struct_func_void_wasted_reg:
+; GCN: ds_read_b32 v0,
+; GCN: ds_read_b32 v1,
+; GCN: ds_read_b32 v2,
+; GCN: ds_read_b32 v3,
+define { <3 x float>, i32 } @v3f32_struct_func_void_wasted_reg() #0 {
+  %load0 = load volatile float, float addrspace(3)* undef
+  %load1 = load volatile float, float addrspace(3)* undef
+  %load2 = load volatile float, float addrspace(3)* undef
+  %load3 = load volatile i32, i32 addrspace(3)* undef
+
+  %insert.0 = insertelement <3 x float> undef, float %load0, i32 0
+  %insert.1 = insertelement <3 x float> %insert.0, float %load1, i32 1
+  %insert.2 = insertelement <3 x float> %insert.1, float %load2, i32 2
+  %insert.3 = insertvalue { <3 x float>, i32 } undef, <3 x float> %insert.2, 0
+  %insert.4 = insertvalue { <3 x float>, i32 } %insert.3, i32 %load3, 1
+  ret { <3 x float>, i32 } %insert.4
+}
+
 attributes #0 = { nounwind }
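[A decoding aid for the magic numbers in the ret.ll changes that follow, not part of the patch: 165580 is 0x0286CC and 165584 is 0x0286D0, which I read as the SPI_PS_INPUT_ENA and SPI_PS_INPUT_ADDR config-register writes (worth double-checking against SIDefines.h). As a worked example, the old value 562 is 0b1000110010 (bits 1, 4, 5, 9) and the new value 2242 is 0b100011000010 (bits 1, 6, 7, 11): the same set of enables with everything above bit 1 shifted up two positions, which lines up with the expected mov sources in vgpr_ps_addr119 moving from v6/v8 to v8/v10.]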
Index: test/CodeGen/AMDGPU/ret.ll
===================================================================
--- test/CodeGen/AMDGPU/ret.ll
+++ test/CodeGen/AMDGPU/ret.ll
@@ -33,9 +33,9 @@
 }
 
 ; GCN: .long 165580
-; GCN-NEXT: .long 562
+; GCN-NEXT: .long 2242
 ; GCN-NEXT: .long 165584
-; GCN-NEXT: .long 562
+; GCN-NEXT: .long 2242
 ; GCN-LABEL: {{^}}vgpr_ps_addr0:
 ; GCN-NOT: v_mov_b32_e32 v0
 ; GCN-NOT: v_mov_b32_e32 v1
@@ -74,9 +74,9 @@
 }
 
 ; GCN: .long 165580
-; GCN-NEXT: .long 2081
+; GCN-NEXT: .long 8321
 ; GCN-NEXT: .long 165584
-; GCN-NEXT: .long 2081
+; GCN-NEXT: .long 8321
 ; GCN-LABEL: {{^}}ps_input_ena_pos_w:
 ; GCN-DAG: v_mov_b32_e32 v0, v4
 ; GCN-DAG: v_mov_b32_e32 v1, v2
@@ -91,9 +91,9 @@
 }
 
 ; GCN: .long 165580
-; GCN-NEXT: .long 562
+; GCN-NEXT: .long 2242
 ; GCN-NEXT: .long 165584
-; GCN-NEXT: .long 563
+; GCN-NEXT: .long 2243
 ; GCN-LABEL: {{^}}vgpr_ps_addr1:
 ; GCN-DAG: v_mov_b32_e32 v0, v2
 ; GCN-DAG: v_mov_b32_e32 v1, v3
@@ -120,17 +120,20 @@
 }
 
 ; GCN: .long 165580
-; GCN-NEXT: .long 562
+; GCN-NEXT: .long 2242
 ; GCN-NEXT: .long 165584
-; GCN-NEXT: .long 631
+; GCN-NEXT: .long 2295
 ; GCN-LABEL: {{^}}vgpr_ps_addr119:
 ; GCN-DAG: v_mov_b32_e32 v0, v2
 ; GCN-DAG: v_mov_b32_e32 v1, v3
-; GCN-DAG: v_mov_b32_e32 v2, v6
-; GCN-DAG: v_mov_b32_e32 v3, v8
+; GCN-DAG: v_mov_b32_e32 v2, v8
+; GCN-DAG: v_mov_b32_e32 v3, v10
 ; GCN-DAG: v_mov_b32_e32 v4, v12
 ; GCN-NOT: s_endpgm
-define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #3 {
+define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr119([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2,
+    <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8,
+    <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15,
+    float %arg16, float %arg17, float %arg18) #3 {
 bb:
   %i0 = extractelement <2 x i32> %arg4, i32 0
   %i1 = extractelement <2 x i32> %arg4, i32 1
@@ -140,24 +143,25 @@
   %f1 = bitcast i32 %i1 to float
   %f2 = bitcast i32 %i2 to float
   %f3 = bitcast i32 %i3 to float
-  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0
-  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1
-  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2
-  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3
-  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4
+  %r0 = insertvalue { float, float, float, float, float } undef, float %f0, 0   ; return v0
+  %r1 = insertvalue { float, float, float, float, float } %r0, float %f1, 1     ; return v1
+  %r2 = insertvalue { float, float, float, float, float } %r1, float %f2, 2     ; return v2
+  %r3 = insertvalue { float, float, float, float, float } %r2, float %f3, 3     ; return v3
+  %r4 = insertvalue { float, float, float, float, float } %r3, float %arg12, 4  ; return v4
   ret { float, float, float, float, float } %r4
 }
 
 ; GCN: .long 165580
-; GCN-NEXT: .long 562
+; GCN-NEXT: .long 2242
 ; GCN-NEXT: .long 165584
-; GCN-NEXT: .long 946
+; GCN-NEXT: .long 2530
 ; GCN-LABEL: {{^}}vgpr_ps_addr418:
 ; GCN-NOT: v_mov_b32_e32 v0
 ; GCN-NOT: v_mov_b32_e32 v1
 ; GCN-NOT: v_mov_b32_e32 v2
-; GCN: v_mov_b32_e32 v3, v4
-; GCN: v_mov_b32_e32 v4, v8
+; GCN: v_mov_b32_e32 v2, v3
+; GCN: v_mov_b32_e32 v4, v9
+; GCN: v_mov_b32_e32 v3, v5
 ; GCN-NOT: s_endpgm
 define amdgpu_ps { float, float, float, float, float } @vgpr_ps_addr418([9 x <16 x i8>] addrspace(4)* byval %arg, i32 inreg %arg1, i32 inreg %arg2, <2 x i32> %arg3, <2 x i32> %arg4, <2 x i32> %arg5, <3 x i32> %arg6, <2 x i32> %arg7, <2 x i32> %arg8, <2 x i32> %arg9, float %arg10, float %arg11, float %arg12, float %arg13, float %arg14, float %arg15, float %arg16, float %arg17, float %arg18) #4 {
 bb: