Index: lib/Target/AMDGPU/AMDGPUCallingConv.td
===================================================================
--- lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -19,7 +19,7 @@
 // Calling convention for SI
 def CC_SI : CallingConv<[
 
-  CCIfInReg<CCIfType<[f32, i32, f16, i16] , CCAssignToReg<[
+  CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
     SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
     SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
     SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -28,7 +28,7 @@
   ]>>>,
 
   // 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
-  CCIfNotInReg<CCIfType<[f32, i32, f16, i16] , CCAssignToReg<[
+  CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -59,7 +59,7 @@
   ]>>,
 
   // 32*4 + 4 is the minimum for a fetch shader with 32 outputs.
-  CCIfType<[f32, f16] , CCAssignToReg<[
+  CCIfType<[f32, f16, v2f16] , CCAssignToReg<[
     VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
     VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
     VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
Index: lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- lib/Target/AMDGPU/SIISelLowering.cpp
+++ lib/Target/AMDGPU/SIISelLowering.cpp
@@ -1349,7 +1349,8 @@
 
   for (unsigned I = 0, E = Ins.size(), PSInputNum = 0; I != E; ++I) {
     const ISD::InputArg *Arg = &Ins[I];
-    assert(!Arg->VT.isVector() && "vector type argument should have been split");
+    assert((!Arg->VT.isVector() || Arg->VT.getScalarSizeInBits() == 16) &&
+           "vector type argument should have been split");
 
     // First check if it's a PS input addr.
     if (CallConv == CallingConv::AMDGPU_PS &&
@@ -1951,29 +1952,6 @@
       llvm_unreachable("Unknown loc info!");
     }
 
-    if (IsShader && Arg.VT.isVector()) {
-      // Build a vector from the registers
-      Type *ParamType = FType->getParamType(Arg.getOrigArgIndex());
-      unsigned NumElements = ParamType->getVectorNumElements();
-
-      SmallVector<SDValue, 4> Regs;
-      Regs.push_back(Val);
-      for (unsigned j = 1; j != NumElements; ++j) {
-        Reg = ArgLocs[ArgIdx++].getLocReg();
-        Reg = MF.addLiveIn(Reg, RC);
-
-        SDValue Copy = DAG.getCopyFromReg(Chain, DL, Reg, VT);
-        Regs.push_back(Copy);
-      }
-
-      // Fill up the missing vector elements
-      NumElements = Arg.VT.getVectorNumElements() - NumElements;
-      Regs.append(NumElements, DAG.getUNDEF(VT));
-
-      InVals.push_back(DAG.getBuildVector(Arg.VT, DL, Regs));
-      continue;
-    }
-
     InVals.push_back(Val);
   }
 
@@ -2037,48 +2015,19 @@
 
   bool IsShader = AMDGPU::isShader(CallConv);
 
-  Info->setIfReturnsVoid(Outs.size() == 0);
+  Info->setIfReturnsVoid(Outs.empty());
   bool IsWaveEnd = Info->returnsVoid() && IsShader;
 
-  SmallVector<ISD::OutputArg, 48> Splits;
-  SmallVector<SDValue, 48> SplitVals;
-
-  // Split vectors into their elements.
-  for (unsigned i = 0, e = Outs.size(); i != e; ++i) {
-    const ISD::OutputArg &Out = Outs[i];
-
-    if (IsShader && Out.VT.isVector()) {
-      MVT VT = Out.VT.getVectorElementType();
-      ISD::OutputArg NewOut = Out;
-      NewOut.Flags.setSplit();
-      NewOut.VT = VT;
-
-      // We want the original number of vector elements here, e.g.
-      // three or five, not four or eight.
-      unsigned NumElements = Out.ArgVT.getVectorNumElements();
-
-      for (unsigned j = 0; j != NumElements; ++j) {
-        SDValue Elem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, OutVals[i],
-                                   DAG.getConstant(j, DL, MVT::i32));
-        SplitVals.push_back(Elem);
-        Splits.push_back(NewOut);
-        NewOut.PartOffset += NewOut.VT.getStoreSize();
-      }
-    } else {
-      SplitVals.push_back(OutVals[i]);
-      Splits.push_back(Out);
-    }
-  }
-
   // CCValAssign - represent the assignment of the return value to a location.
   SmallVector<CCValAssign, 48> RVLocs;
+  SmallVector<ISD::OutputArg, 48> Splits;
 
   // CCState - Info about the registers and stack slots.
   CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
                  *DAG.getContext());
 
   // Analyze outgoing return values.
-  CCInfo.AnalyzeReturn(Splits, CCAssignFnForReturn(CallConv, isVarArg));
+  CCInfo.AnalyzeReturn(Outs, CCAssignFnForReturn(CallConv, isVarArg));
 
   SDValue Flag;
   SmallVector<SDValue, 48> RetOps;
@@ -2103,14 +2052,12 @@
   }
 
   // Copy the result values into the output registers.
-  for (unsigned i = 0, realRVLocIdx = 0;
-       i != RVLocs.size();
-       ++i, ++realRVLocIdx) {
-    CCValAssign &VA = RVLocs[i];
+  for (unsigned I = 0, RealRVLocIdx = 0, E = RVLocs.size(); I != E;
+       ++I, ++RealRVLocIdx) {
+    CCValAssign &VA = RVLocs[I];
     assert(VA.isRegLoc() && "Can only return in registers!");
     // TODO: Partially return in registers if return values don't fit.
-
-    SDValue Arg = SplitVals[realRVLocIdx];
+    SDValue Arg = OutVals[RealRVLocIdx];
 
     // Copied from other backends.
     switch (VA.getLocInfo()) {
Index: test/CodeGen/AMDGPU/calling-conventions.ll
===================================================================
--- test/CodeGen/AMDGPU/calling-conventions.ll
+++ test/CodeGen/AMDGPU/calling-conventions.ll
@@ -1,5 +1,5 @@
-; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=SI %s
-; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GCN -check-prefix=VI %s
+; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=SI %s
+; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefix=GCN -check-prefix=VI %s
 
 ; Make sure we don't crash or assert on spir_kernel calling convention.
 
@@ -88,8 +88,8 @@
 ; Mesa pixel shader: check for 45096 (SPI_SHADER_PGM_RSRC1_PS) in .AMDGPU.config
 ; GCN-LABEL: .AMDGPU.config
 ; GCN: .long 45096
-; GCN-LABEL: {{^}}ps_mesa:
-define amdgpu_ps half @ps_mesa(half %arg0) {
+; GCN-LABEL: {{^}}ps_mesa_f16:
+define amdgpu_ps half @ps_mesa_f16(half %arg0) {
   %add = fadd half %arg0, 1.0
   ret half %add
 }
@@ -121,4 +121,83 @@
   ret half %add
 }
 
+; FIXME: Inconsistent ABI between targets
+; GCN-LABEL: {{^}}ps_mesa_v2f16:
+; VI: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v1, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI-NEXT: v_add_f16_e32 v0, 1.0, v0
+; VI-NEXT: v_or_b32_e32 v0, v0, v1
+; VI-NEXT: ; return
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], v0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], v1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_v2f16(<2 x half> %arg0) {
+  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+  ret <2 x half> %add
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2f16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI-NEXT: v_mov_b32_e32 v0, s1
+; VI-NEXT: v_mov_b32_e32 v1, 0x3c00
+; VI-NEXT: v_add_f16_sdwa v0, v0, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; VI-NEXT: v_add_f16_e64 v1, s0, 1.0
+; VI-NEXT: v_or_b32_e32 v0, v1, v0
+; VI-NEXT: ; return to shader part epilog
+
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT0:v[0-9]+]], s0
+; SI-DAG: v_cvt_f16_f32_e32 [[CVT_ELT1:v[0-9]+]], s1
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT0:v[0-9]+]], [[CVT_ELT0]]
+; SI-DAG: v_cvt_f32_f16_e32 [[RECVT_ELT1:v[0-9]+]], [[CVT_ELT1]]
+; SI-DAG: v_add_f32_e32 v0, 1.0, [[RECVT_ELT0]]
+; SI-DAG: v_add_f32_e32 v1, 1.0, [[RECVT_ELT1]]
+; SI: ; return to shader part epilog
+define amdgpu_ps <2 x half> @ps_mesa_inreg_v2f16(<2 x half> inreg %arg0) {
+  %add = fadd <2 x half> %arg0, <half 1.0, half 1.0>
+  ret <2 x half> %add
+}
+
+; GCN-LABEL: {{^}}ps_mesa_v2i16:
+; VI: v_mov_b32_e32 v2, 1
+; VI: v_add_u16_e32 v1, 1, v0
+; VI: v_add_u16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; VI: v_or_b32_e32 v0, v1, v0
+
+
+; SI: v_lshlrev_b32_e32 v1, 16, v1
+; SI: v_add_i32_e32 v0, vcc, 1, v0
+; SI: v_add_i32_e32 v1, vcc, 0x10000, v1
+; SI: v_and_b32
+; SI: v_or_b32
+define amdgpu_ps void @ps_mesa_v2i16(<2 x i16> %arg0) {
+  %add = add <2 x i16> %arg0, <i16 1, i16 1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+  ret void
+}
+
+; GCN-LABEL: {{^}}ps_mesa_inreg_v2i16:
+; VI: s_lshr_b32 s1, s0, 16
+; VI: s_add_i32 s1, s1, 1
+; VI: s_add_i32 s0, s0, 1
+; VI: s_and_b32 s0, s0, 0xffff
+; VI: s_lshl_b32 s1, s1, 16
+; VI: s_or_b32 s0, s0, s1
+; VI: v_mov_b32_e32 v0, s0
+
+; SI: s_lshl_b32 s1, s1, 16
+; SI: s_add_i32 s0, s0, 1
+; SI: s_add_i32 s1, s1, 0x10000
+; SI: s_and_b32 s0, s0, 0xffff
+; SI: s_or_b32 s0, s0, s1
+define amdgpu_ps void @ps_mesa_inreg_v2i16(<2 x i16> inreg %arg0) {
+  %add = add <2 x i16> %arg0, <i16 1, i16 1>
+  store <2 x i16> %add, <2 x i16> addrspace(1)* undef
+  ret void
+}
+
 attributes #0 = { nounwind noinline }