Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -32,13 +32,16 @@
   /// A function of this type is used to perform value split action.
   using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
 
-  void splitToValueTypes(MachineIRBuilder &B,
-                         const ArgInfo &OrigArgInfo,
-                         unsigned OrigArgIdx,
+  void splitToValueTypes(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
                          SmallVectorImpl<ArgInfo> &SplitArgs,
-                         const DataLayout &DL,
-                         CallingConv::ID CallConv,
-                         SplitArgTy SplitArg) const;
+                         const DataLayout &DL, CallingConv::ID CallConv) const;
+
+  void processSplitArgs(MachineIRBuilder &B, const ArgInfo &OrigArgInfo,
+                        unsigned OrigArgIdx,
+                        const SmallVectorImpl<ArgInfo> &SplitArg,
+                        SmallVectorImpl<ArgInfo> &SplitArgs,
+                        const DataLayout &DL, CallingConv::ID CallConv,
+                        SplitArgTy PerformArgSplit) const;
 
   bool lowerReturnVal(MachineIRBuilder &B, const Value *Val,
                       ArrayRef<Register> VRegs, MachineInstrBuilder &Ret) const;
Index: llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
===================================================================
--- llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -179,48 +179,63 @@
   }
 }
 
-void AMDGPUCallLowering::splitToValueTypes(
-    MachineIRBuilder &B,
-    const ArgInfo &OrigArg, unsigned OrigArgIdx,
-    SmallVectorImpl<ArgInfo> &SplitArgs,
-    const DataLayout &DL, CallingConv::ID CallConv,
-    SplitArgTy PerformArgSplit) const {
+// FIXME: This should move to generic code.
+void AMDGPUCallLowering::splitToValueTypes(MachineIRBuilder &B,
+                                           const ArgInfo &OrigArg,
+                                           SmallVectorImpl<ArgInfo> &SplitArgs,
+                                           const DataLayout &DL,
+                                           CallingConv::ID CallConv) const {
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
   LLVMContext &Ctx = OrigArg.Ty->getContext();
 
-  if (OrigArg.Ty->isVoidTy())
-    return;
-
   SmallVector<EVT, 4> SplitVTs;
   ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs);
 
   assert(OrigArg.Regs.size() == SplitVTs.size());
 
-  int SplitIdx = 0;
-  for (EVT VT : SplitVTs) {
-    Register Reg = OrigArg.Regs[SplitIdx];
-    Type *Ty = VT.getTypeForEVT(Ctx);
-    LLT LLTy = getLLTForType(*Ty, DL);
+  if (SplitVTs.size() == 0)
+    return;
 
-    if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
-      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
-      if (OrigArg.Flags[0].isSExt()) {
-        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
-        ExtendOp = TargetOpcode::G_SEXT;
-      } else if (OrigArg.Flags[0].isZExt()) {
-        assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
-        ExtendOp = TargetOpcode::G_ZEXT;
-      }
+  if (SplitVTs.size() == 1) {
+    // No splitting to do, but we want to replace the original type (e.g. [1 x
+    // double] -> double).
+    SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
+                           OrigArg.Flags[0], OrigArg.IsFixed);
+    return;
+  }
 
-      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
-                                          extOpcodeToISDExtOpcode(ExtendOp));
-      if (ExtVT != VT) {
-        VT = ExtVT;
-        Ty = ExtVT.getTypeForEVT(Ctx);
-        LLTy = getLLTForType(*Ty, DL);
-        Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
-      }
-    }
+  // Create one ArgInfo for each virtual register in the original ArgInfo.
+  assert(OrigArg.Regs.size() == SplitVTs.size() && "Regs / types mismatch");
+
+  bool NeedsRegBlock = TLI.functionArgumentNeedsConsecutiveRegisters(
+      OrigArg.Ty, CallConv, false);
+  for (unsigned i = 0, e = SplitVTs.size(); i < e; ++i) {
+    Type *SplitTy = SplitVTs[i].getTypeForEVT(Ctx);
+    SplitArgs.emplace_back(OrigArg.Regs[i], SplitTy, OrigArg.Flags[0],
+                           OrigArg.IsFixed);
+    if (NeedsRegBlock)
+      SplitArgs.back().Flags[0].setInConsecutiveRegs();
+  }
+
+  SplitArgs.back().Flags[0].setInConsecutiveRegsLast();
+}
+
+void AMDGPUCallLowering::processSplitArgs(
+    MachineIRBuilder &B, const ArgInfo &OrigArg, unsigned OrigArgIdx,
+    const SmallVectorImpl<ArgInfo> &SplitArg,
+    SmallVectorImpl<ArgInfo> &SplitArgs, const DataLayout &DL,
+    CallingConv::ID CallConv, SplitArgTy PerformArgSplit) const {
+  LLVMContext &Ctx = OrigArg.Ty->getContext();
+  const SITargetLowering &TLI = *getTLI<SITargetLowering>();
+
+  // FIXME: This is mostly nasty pre-processing before handleAssignments. Most
+  // of this should be performed by handleAssignments.
+
+  int SplitIdx = 0;
+  for (const ArgInfo &SplitArg : SplitArg) {
+    Register Reg = OrigArg.Regs[SplitIdx];
+    EVT VT = EVT::getEVT(SplitArg.Ty);
+    LLT LLTy = getLLTForType(*SplitArg.Ty, DL);
 
     unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
     MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
@@ -228,7 +243,7 @@
     if (NumParts == 1) {
       // No splitting to do, but we want to replace the original type (e.g. [1 x
       // double] -> double).
-      SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);
+      SplitArgs.emplace_back(Reg, SplitArg.Ty, OrigArg.Flags, OrigArg.IsFixed);
 
       ++SplitIdx;
       continue;
@@ -314,22 +329,68 @@
   const auto &F = MF.getFunction();
   const DataLayout &DL = MF.getDataLayout();
   MachineRegisterInfo *MRI = B.getMRI();
+  LLVMContext &Ctx = F.getContext();
 
   CallingConv::ID CC = F.getCallingConv();
   const SITargetLowering &TLI = *getTLI<SITargetLowering>();
 
-  ArgInfo OrigRetInfo(VRegs, Val->getType());
-  setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
-  SmallVector<ArgInfo, 8> SplitRetInfos;
+  SmallVector<EVT, 8> SplitEVTs;
+  ComputeValueVTs(TLI, DL, Val->getType(), SplitEVTs);
+  assert(VRegs.size() == SplitEVTs.size() &&
+         "For each split Type there should be exactly one VReg.");
+
+  // We pre-process the return value decomposed into EVTs.
+  SmallVector<ArgInfo, 8> PreSplitRetInfos;
+
+  // Further processing is applied to split the arguments from PreSplitRetInfos
+  // into 32-bit pieces in SplitRetInfos before passing off to
+  // handleAssignments.
+  SmallVector<ArgInfo, 8> SplitRetInfos;
 
-  splitToValueTypes(
-    B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
-    [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
-        int VTSplitIdx) {
-      unpackRegsToOrigType(B, Regs, SrcReg,
-                           SplitRetInfos[VTSplitIdx],
-                           LLTy, PartLLT);
-    });
+  for (unsigned i = 0; i < SplitEVTs.size(); ++i) {
+    EVT VT = SplitEVTs[i];
+    Register Reg = VRegs[i];
+    ArgInfo RetInfo(Reg, VT.getTypeForEVT(Ctx));
+    setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+
+    if (VT.isScalarInteger()) {
+      unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+      if (RetInfo.Flags[0].isSExt()) {
+        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
+        ExtendOp = TargetOpcode::G_SEXT;
+      } else if (RetInfo.Flags[0].isZExt()) {
+        assert(RetInfo.Regs.size() == 1 && "expect only simple return values");
+        ExtendOp = TargetOpcode::G_ZEXT;
+      }
+
+      EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
+                                          extOpcodeToISDExtOpcode(ExtendOp));
+      if (ExtVT != VT) {
+        RetInfo.Ty = ExtVT.getTypeForEVT(Ctx);
+        LLT ExtTy = getLLTForType(*RetInfo.Ty, DL);
+        Reg = B.buildInstr(ExtendOp, {ExtTy}, {Reg}).getReg(0);
+      }
+    }
+
+    if (Reg != RetInfo.Regs[0]) {
+      RetInfo.Regs[0] = Reg;
+      // Reset the arg flags after modifying Reg.
+      setArgFlags(RetInfo, AttributeList::ReturnIndex, DL, F);
+    }
+
+    splitToValueTypes(B, RetInfo, PreSplitRetInfos, DL, CC);
+
+    // FIXME: This splitting should mostly be done by handleAssignments
+    processSplitArgs(B, RetInfo, AttributeList::ReturnIndex,
+                     PreSplitRetInfos, SplitRetInfos, DL, CC,
+                     [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy,
+                         LLT PartLLT, int VTSplitIdx) {
+                       unpackRegsToOrigType(B, Regs, SrcReg,
+                                            PreSplitRetInfos[VTSplitIdx], LLTy,
+                                            PartLLT);
+                     });
+    PreSplitRetInfos.clear();
+  }
 
   CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
   OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
@@ -684,7 +745,7 @@
     CCInfo.AllocateReg(ImplicitBufferPtrReg);
   }
 
-
+  SmallVector<ArgInfo, 8> SplitArg;
   SmallVector<ArgInfo, 32> SplitArgs;
   unsigned Idx = 0;
   unsigned PSInputNum = 0;
@@ -729,16 +790,18 @@
     const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
     setArgFlags(OrigArg, OrigArgIdx, DL, F);
 
-    splitToValueTypes(
-      B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
-      // FIXME: We should probably be passing multiple registers to
-      // handleAssignments to do this
-      [&](ArrayRef<Register> Regs, Register DstReg,
-          LLT LLTy, LLT PartLLT, int VTSplitIdx) {
-        assert(DstReg == VRegs[Idx][VTSplitIdx]);
-        packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
-                                LLTy, PartLLT);
-      });
+    SplitArg.clear();
+    splitToValueTypes(B, OrigArg, SplitArg, DL, CC);
+
+    processSplitArgs(B, OrigArg, OrigArgIdx, SplitArg, SplitArgs, DL, CC,
+                     // FIXME: We should probably be passing multiple registers
+                     // to handleAssignments to do this
+                     [&](ArrayRef<Register> Regs, Register DstReg, LLT LLTy,
+                         LLT PartLLT, int VTSplitIdx) {
+                       assert(DstReg == VRegs[Idx][VTSplitIdx]);
+                       packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
+                                               LLTy, PartLLT);
+                     });
 
     ++Idx;
   }
Index: llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
===================================================================
--- llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
+++ llvm/test/CodeGen/AMDGPU/GlobalISel/function-returns.ll
@@ -24,9 +24,10 @@
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load 1 from `i1 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s1)
-  ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ZEXT]](s32)
+  ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %val = load i1, i1 addrspace(1)* undef
   ret i1 %val
 }
@@ -39,99 +40,94 @@
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK: [[LOAD:%[0-9]+]]:_(s1) = G_LOAD [[DEF]](p1) :: (load 1 from `i1 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s1)
-  ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXT]](s32)
+  ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %val = load i1, i1 addrspace(1)* undef
   ret i1 %val
 }
 
 define i7 @i7_func_void() #0 {
   ; CHECK-LABEL: name: i7_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load 1 from `i7 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s7)
-  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
 
 define zeroext i7 @i7_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i7_zeroext_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load 1 from `i7 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s7)
-  ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
 
 define signext i7 @i7_signext_func_void() #0 {
   ; CHECK-LABEL: name: i7_signext_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s7) = G_LOAD [[DEF]](p1) :: (load 1 from `i7 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s7)
-  ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i7, i7 addrspace(1)* undef
   ret i7 %val
 }
 
 define i8 @i8_func_void() #0 {
   ; CHECK-LABEL: name: i8_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load 1 from `i8 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[LOAD]](s8)
-  ; CHECK: $vgpr0 = COPY [[ANYEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
 
 define zeroext i8 @i8_zeroext_func_void() #0 {
   ; CHECK-LABEL: name: i8_zeroext_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load 1 from `i8 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s8)
-  ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
 
 define signext i8 @i8_signext_func_void() #0 {
   ; CHECK-LABEL: name: i8_signext_func_void
-  ; CHECK: bb.1 (%ir-block.0):
+  ; CHECK: bb.0:
+  ; CHECK: successors: %bb.1(0x80000000)
   ; CHECK: liveins: $sgpr30_sgpr31
   ; CHECK: [[COPY:%[0-9]+]]:sgpr_64 = COPY $sgpr30_sgpr31
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
+  ; CHECK: bb.1 (%ir-block.0):
   ; CHECK: [[LOAD:%[0-9]+]]:_(s8) = G_LOAD [[DEF]](p1) :: (load 1 from `i8 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s8)
-  ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
   %val = load i8, i8 addrspace(1)* undef
   ret i8 %val
 }
@@ -159,9 +155,10 @@
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load 2 from `i16 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[LOAD]](s16)
-  ; CHECK: $vgpr0 = COPY [[ZEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[ZEXT]](s32)
+  ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %val = load i16, i16 addrspace(1)* undef
   ret i16 %val
 }
@@ -174,9 +171,10 @@
   ; CHECK: [[DEF:%[0-9]+]]:_(p1) = G_IMPLICIT_DEF
   ; CHECK: [[LOAD:%[0-9]+]]:_(s16) = G_LOAD [[DEF]](p1) :: (load 2 from `i16 addrspace(1)* undef`, addrspace 1)
   ; CHECK: [[SEXT:%[0-9]+]]:_(s32) = G_SEXT [[LOAD]](s16)
-  ; CHECK: $vgpr0 = COPY [[SEXT]](s32)
-  ; CHECK: [[COPY1:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
-  ; CHECK: S_SETPC_B64_return [[COPY1]], implicit $vgpr0
+  ; CHECK: [[COPY1:%[0-9]+]]:_(s32) = COPY [[SEXT]](s32)
+  ; CHECK: $vgpr0 = COPY [[COPY1]](s32)
+  ; CHECK: [[COPY2:%[0-9]+]]:ccr_sgpr_64 = COPY [[COPY]]
+  ; CHECK: S_SETPC_B64_return [[COPY2]], implicit $vgpr0
   %val = load i16, i16 addrspace(1)* undef
   ret i16 %val
 }