Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -696,10 +696,11 @@
 MVT SITargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
                                                     CallingConv::ID CC,
                                                     EVT VT) const {
-  if (CC != CallingConv::AMDGPU_KERNEL &&
-      VT.isVector() && VT.getVectorNumElements() == 3) {
+  // TODO: Consider splitting all arguments into 32-bit pieces.
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32)
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64)
       return ScalarVT.getSimpleVT();
   }
 
@@ -709,11 +710,11 @@
 unsigned SITargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
                                                          CallingConv::ID CC,
                                                          EVT VT) const {
-  if (CC != CallingConv::AMDGPU_KERNEL &&
-      VT.isVector() && VT.getVectorNumElements() == 3) {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32)
-      return 3;
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64)
+      return VT.getVectorNumElements();
   }
 
   return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
@@ -723,14 +724,13 @@
   LLVMContext &Context, CallingConv::ID CC, EVT VT, EVT &IntermediateVT,
   unsigned &NumIntermediates, MVT &RegisterVT) const {
-
-  if (CC != CallingConv::AMDGPU_KERNEL && VT.getVectorNumElements() == 3) {
+  if (CC != CallingConv::AMDGPU_KERNEL && VT.isVector()) {
     EVT ScalarVT = VT.getScalarType();
-    if (ScalarVT.getSizeInBits() == 32 ||
-        ScalarVT.getSizeInBits() == 64) {
+    unsigned Size = ScalarVT.getSizeInBits();
+    if (Size == 32 || Size == 64) {
       RegisterVT = ScalarVT.getSimpleVT();
       IntermediateVT = RegisterVT;
-      NumIntermediates = 3;
+      NumIntermediates = VT.getVectorNumElements();
       return NumIntermediates;
     }
   }
 
@@ -1313,6 +1313,8 @@
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
 
+    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
         !Arg->Flags.isInReg() && !Arg->Flags.isByVal() && PSInputNum <= 15) {
@@ -1346,25 +1348,7 @@
       ++PSInputNum;
     }
 
-    // Second split vertices into their elements.
-    if (Arg->VT.isVector()) {
-      ISD::InputArg NewArg = *Arg;
-      NewArg.Flags.setSplit();
-      NewArg.VT = Arg->VT.getVectorElementType();
-
-      // We REALLY want the ORIGINAL number of vertex elements here, e.g. a
-      // three or five element vertex only needs three or five registers,
-      // NOT four or eight.
-      Type *ParamType = FType->getParamType(Arg->getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      for (unsigned J = 0; J != NumElements; ++J) {
-        Splits.push_back(NewArg);
-        NewArg.PartOffset += NewArg.VT.getStoreSize();
-      }
-    } else {
-      Splits.push_back(*Arg);
-    }
+    Splits.push_back(*Arg);
   }
 }
 
Index: test/CodeGen/AMDGPU/bfi_int.ll
===================================================================
--- test/CodeGen/AMDGPU/bfi_int.ll
+++ test/CodeGen/AMDGPU/bfi_int.ll
@@ -54,8 +54,8 @@
 
 ; FUNC-LABEL: {{^}}v_bitselect_v2i32_pat1:
 ; GCN: s_waitcnt
-; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
 ; GCN-NEXT: v_bfi_b32 v0, v2, v0, v4
+; GCN-NEXT: v_bfi_b32 v1, v3, v1, v5
 ; GCN-NEXT: s_setpc_b64
 define <2 x i32> @v_bitselect_v2i32_pat1(<2 x i32> %a, <2 x i32> %b, <2 x i32> %mask) {
   %xor.0 = xor <2 x i32> %a, %mask
Index: test/CodeGen/AMDGPU/call-argument-types.ll
===================================================================
--- test/CodeGen/AMDGPU/call-argument-types.ll
+++ test/CodeGen/AMDGPU/call-argument-types.ll
@@ -286,10 +286,10 @@
 ; FIXME: Immedites should fold directly into v_mov_b32s
 ; GCN-LABEL: {{^}}test_call_external_void_func_v4i64:
 ; GCN: buffer_load_dwordx4 v[0:3]
-; GCN: v_mov_b32_e32 v4, s
-; GCN: v_mov_b32_e32 v5, s
-; GCN: v_mov_b32_e32 v6, s
-; GCN: v_mov_b32_e32 v7, s
+; GCN-DAG: v_mov_b32_e32 v4, s
+; GCN-DAG: v_mov_b32_e32 v5, s
+; GCN-DAG: v_mov_b32_e32 v6, s
+; GCN-DAG: v_mov_b32_e32 v7, s
 ; GCN: s_waitcnt
 ; GCN-NEXT: s_swappc_b64
 
@@ -358,6 +358,15 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v2i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v2i32_imm() #0 {
+  call void @external_void_func_v2i32(<2 x i32> <i32 1, i32 2>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v3i32_imm:
 ; HSA-DAG: s_mov_b32 s33, s9
 ; MESA-DAG: s_mov_b32 s33, s3{{$}}
@@ -393,6 +402,17 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v4i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v4i32_imm() #0 {
+  call void @external_void_func_v4i32(<4 x i32> <i32 1, i32 2, i32 3, i32 4>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v8i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
@@ -405,6 +425,21 @@
   ret void
 }
 
+; GCN-LABEL: {{^}}test_call_external_void_func_v8i32_imm:
+; GCN-DAG: v_mov_b32_e32 v0, 1
+; GCN-DAG: v_mov_b32_e32 v1, 2
+; GCN-DAG: v_mov_b32_e32 v2, 3
+; GCN-DAG: v_mov_b32_e32 v3, 4
+; GCN-DAG: v_mov_b32_e32 v4, 5
+; GCN-DAG: v_mov_b32_e32 v5, 6
+; GCN-DAG: v_mov_b32_e32 v6, 7
+; GCN-DAG: v_mov_b32_e32 v7, 8
+; GCN: s_swappc_b64
+define amdgpu_kernel void @test_call_external_void_func_v8i32_imm() #0 {
+  call void @external_void_func_v8i32(<8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8>)
+  ret void
+}
+
 ; GCN-LABEL: {{^}}test_call_external_void_func_v16i32:
 ; GCN-DAG: buffer_load_dwordx4 v[0:3], off
 ; GCN-DAG: buffer_load_dwordx4 v[4:7], off
Index: test/CodeGen/AMDGPU/mad-mix.ll
===================================================================
--- test/CodeGen/AMDGPU/mad-mix.ll
+++ test/CodeGen/AMDGPU/mad-mix.ll
@@ -54,13 +54,13 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32:
-; GFX900: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v1, v3
 
-; GFX906: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel_hi:[1,1,1]
+; GFX906: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v0, v0, v1, v2 op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v1, v3
 
 ; CIVI: v_mac_f32
 define <2 x float> @v_mad_mix_v2f32(<2 x half> %src0, <2 x half> %src1, <2 x half> %src2) #0 {
@@ -73,14 +73,14 @@
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_shuffle:
 ; GCN: s_waitcnt
-; GFX900-NEXT: v_mov_b32_e32 v3, v1
-; GFX900-NEXT: v_mad_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX900-NEXT: v_mad_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900: v_mad_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mad_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX900-NEXT: v_mov_b32_e32 v0, v3
 ; GFX900-NEXT: s_setpc_b64
 
-; GFX906-NEXT: v_mov_b32_e32 v3, v1
-; GFX906-NEXT: v_fma_mix_f32 v1, v0, v3, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
-; GFX906-NEXT: v_fma_mix_f32 v0, v0, v3, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v3, v0, v1, v2 op_sel:[1,0,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_fma_mix_f32 v1, v0, v1, v2 op_sel:[0,1,1] op_sel_hi:[1,1,1]
+; GFX906-NEXT: v_mov_b32_e32 v0, v3
 ; GFX906-NEXT: s_setpc_b64
 
 ; CIVI: v_mac_f32
@@ -274,13 +274,14 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imm1:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 1.0
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
 
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imm1(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -289,13 +290,15 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_cvtf16imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 0x3e230000
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
+
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 
 define <2 x float> @v_mad_mix_v2f32_cvtf16imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>
@@ -305,14 +308,15 @@
 }
 
 ; GCN-LABEL: {{^}}v_mad_mix_v2f32_f32imminv2pi:
-; GFX9: v_mov_b32_e32 v2, v1
 ; GFX9: v_mov_b32_e32 v3, 0.15915494
-; GFX900: v_mad_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX900: v_mad_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mad_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX900: v_mov_b32_e32 v1, v2
 
-; GFX906: v_fma_mix_f32 v1, v0, v2, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
-; GFX906: v_fma_mix_f32 v0, v0, v2, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v2, v0, v1, v3 op_sel:[1,1,0] op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_fma_mix_f32 v0, v0, v1, v3 op_sel_hi:[1,1,0] ; encoding
+; GFX906: v_mov_b32_e32 v1, v2
 define <2 x float> @v_mad_mix_v2f32_f32imminv2pi(<2 x half> %src0, <2 x half> %src1) #0 {
   %src0.ext = fpext <2 x half> %src0 to <2 x float>
   %src1.ext = fpext <2 x half> %src1 to <2 x float>