Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -702,6 +702,11 @@
     unsigned Size = ScalarVT.getSizeInBits();
     if (Size == 32 || Size == 64)
       return ScalarVT.getSimpleVT();
+
+    if (Size == 16 &&
+        Subtarget->has16BitInsts() &&
+        isPowerOf2_32(VT.getVectorNumElements()))
+      return VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
   }
 
   return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
@@ -711,10 +716,16 @@
                                                       CallingConv::ID CC,
                                                       EVT VT) const {
   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+    unsigned NumElts = VT.getVectorNumElements();
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
+
     if (Size == 32 || Size == 64)
-      return VT.getVectorNumElements();
+      return NumElts;
+
+    // FIXME: Fails to break down as we want with v3.
+    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts))
+      return VT.getVectorNumElements() / 2;
   }
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -725,12 +736,23 @@
   EVT VT, EVT &IntermediateVT,
   unsigned &NumIntermediates, MVT &RegisterVT) const {
   if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
+    unsigned NumElts = VT.getVectorNumElements();
     EVT ScalarVT = VT.getScalarType();
     unsigned Size = ScalarVT.getSizeInBits();
     if (Size == 32 || Size == 64) {
       RegisterVT = ScalarVT.getSimpleVT();
       IntermediateVT = RegisterVT;
-      NumIntermediates = VT.getVectorNumElements();
+      NumIntermediates = NumElts;
+      return NumIntermediates;
+    }
+
+    // FIXME: We should fix the ABI to be the same on targets without 16-bit
+    // support, but unless we can properly handle 3-vectors, it will still be
+    // inconsistent.
+    if (Size == 16 && Subtarget->has16BitInsts() && isPowerOf2_32(NumElts)) {
+      RegisterVT = VT.isInteger() ? MVT::v2i16 : MVT::v2f16;
+      IntermediateVT = RegisterVT;
+      NumIntermediates = NumElts / 2;
       return NumIntermediates;
     }
   }
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -24,9 +24,14 @@
 declare void @external_void_func_f16(half) #0
 declare void @external_void_func_f32(float) #0
 declare void @external_void_func_f64(double) #0
+declare void @external_void_func_v2f32(<2 x float>) #0
 
 declare void @external_void_func_v2i16(<2 x i16>) #0
 declare void @external_void_func_v2f16(<2 x half>) #0
+declare void @external_void_func_v3i16(<3 x i16>) #0
+declare void @external_void_func_v3f16(<3 x half>) #0
+declare void @external_void_func_v4i16(<4 x i16>) #0
+declare void @external_void_func_v4f16(<4 x half>) #0
 
 declare void @external_void_func_v2i32(<2 x i32>) #0
 declare void @external_void_func_v3i32(<3 x i32>) #0
@@ -319,6 +324,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v2f32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1.0
+; GCN-DAG: v_mov_b32_e32 v1, 2.0
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2f32_imm() #0 {
+  call void @external_void_func_v2f32(<2 x float> <float 1.0, float 2.0>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_f64_imm:
 ; GCN: v_mov_b32_e32 v0, 0{{$}}
 ; GCN: v_mov_b32_e32 v1, 0x40100000
@@ -338,6 +352,49 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i16:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3i16() #0 {
+  %val = load <3 x i16>, <3 x i16> addrspace(1)* undef
+  call void @external_void_func_v3i16(<3 x i16> %val)
+  ret void
+}
+
+; FIXME: materialize constant directly in VGPR
+; GCN-LABEL: {{^}}test_call_external_void_func_v3i16_imm:
+; GFX9-DAG: s_mov_b32 [[K01:s[0-9]+]], 0x20001
+; GFX9-DAG: s_pack_ll_b32_b16 [[K23:s[0-9]+]], 3, s{{[0-9]+}}
+; GFX9: v_mov_b32_e32 v0, [[K01]]
+; GFX9: v_mov_b32_e32 v1, [[K23]]
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v3i16_imm() #0 {
+  call void @external_void_func_v3i16(<3 x i16> <i16 1, i16 2, i16 3>)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i16:
+; GFX9: buffer_load_dwordx2 v[0:1]
+; GFX9-NOT: v0
+; GFX9-NOT: v1
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i16() #0 {
+  %val = load <4 x i16>, <4 x i16> addrspace(1)* undef
+  call void @external_void_func_v4i16(<4 x i16> %val)
+  ret void
+}
+
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i16_imm:
+; GFX9-DAG: v_mov_b32_e32 v0, 0x20001
+; GFX9-DAG: v_mov_b32_e32 v1, 0x40003
+; GFX9: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i16_imm() #0 {
+  call void @external_void_func_v4i16(<4 x i16> <i16 1, i16 2, i16 3, i16 4>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v2f16:
 ; GFX9: buffer_load_dword v0
 ; GFX9-NOT: v0
Index: test/CodeGen/AMDGPU/mad-mix-lo.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -112,12 +112,12 @@
 
 ; GCN-LABEL: {{^}}v_mad_mix_v4f32:
 ; GCN: s_waitcnt
-; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mad_mixlo_f16 v6, v1, v3, v5 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixlo_f16 v7, v0, v2, v4 op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixhi_f16 v7, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mad_mixhi_f16 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX9-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, v6
 ; GFX9-NEXT: s_setpc_b64
 define <4 x half> @v_mad_mix_v4f32(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
@@ -169,11 +169,11 @@
 ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_postcvt:
 ; GCN: s_waitcnt
 ; GFX9-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-NEXT: v_mov_b32_e32 v1, v2
 ; GFX9-NEXT: s_setpc_b64
 define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
   %src0.ext = fpext <4 x half> %src0 to <4 x float>
@@ -267,10 +267,11 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v4f32_clamp_precvt:
-; GFX9: v_mad_mix_f32 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX9: v_mad_mix_f32 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9: v_mad_mix_f32 v6, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
 ; GFX9: v_mad_mix_f32 v1, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GFX9: v_mad_mix_f32 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GFX9: v_mad_mix_f32 v0, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+
 ; GFX9: v_cvt_f16_f32
 ; GFX9: v_cvt_f16_f32
 ; GFX9: v_cvt_f16_f32
Index: test/CodeGen/AMDGPU/mul.i16.ll
===================================================================
--- test/CodeGen/AMDGPU/mul.i16.ll
+++ test/CodeGen/AMDGPU/mul.i16.ll
@@ -90,8 +90,8 @@
 ; VI: v_or_b32_e32
 
 ; GFX9: s_waitcnt
-; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-NEXT: v_pk_mul_lo_u16 v0, v0, v2
+; GFX9-NEXT: v_pk_mul_lo_u16 v1, v1, v3
 ; GFX9-NEXT: s_setpc_b64
 define <4 x i16> @v_mul_v4i16(<4 x i16> %a, <4 x i16> %b) {
   %r.val = mul <4 x i16> %a, %b
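
For intuition, a minimal standalone sketch of the register-count rule the SIISelLowering.cpp change introduces for non-kernel calling conventions: 32- and 64-bit elements keep one register per element, while 16-bit elements pack two per v2i16/v2f16 register when the subtarget has 16-bit instructions and the element count is a power of two. This is not LLVM API code; the helper name is hypothetical, and the final fallback only approximates the generic TargetLowering path (which, per the FIXMEs above, still does not break 3-element vectors down as desired).

#include <cassert>

// True iff N is a nonzero power of two (mirrors llvm::isPowerOf2_32).
static bool isPowerOf2(unsigned N) { return N && !(N & (N - 1)); }

// Hypothetical helper: number of argument registers used by an
// <NumElts x EltBits> vector under the rule added in this patch.
static unsigned numRegsForVectorArg(unsigned NumElts, unsigned EltBits,
                                    bool Has16BitInsts) {
  if (EltBits == 32 || EltBits == 64)
    return NumElts;      // one 32- or 64-bit register per element
  if (EltBits == 16 && Has16BitInsts && isPowerOf2(NumElts))
    return NumElts / 2;  // two elements per packed v2i16/v2f16 register
  return NumElts;        // rough stand-in for the generic fallback path
}

int main() {
  assert(numRegsForVectorArg(4, 16, true) == 2);  // <4 x i16>   -> v0, v1
  assert(numRegsForVectorArg(2, 16, true) == 1);  // <2 x half>  -> v0
  assert(numRegsForVectorArg(2, 32, true) == 2);  // <2 x float> -> v0, v1
  // <3 x i16> is not a power of two, so it falls through to the generic
  // breakdown; see the FIXME comments in the patch.
  return 0;
}

This matches the new tests: the v4i16/v4f16 calls pass their argument in v[0:1] as two packed registers, and v2f16 fits in a single VGPR.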