Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -100,16 +100,8 @@
   SDValue LowerDIVREM24(SDValue Op, SelectionDAG &DAG, bool sign) const;
   void LowerUDIVREM64(SDValue Op, SelectionDAG &DAG,
                       SmallVectorImpl<SDValue> &Results) const;
-  /// The SelectionDAGBuilder will automatically promote function arguments
-  /// with illegal types. However, this does not work for the AMDGPU targets
-  /// since the function arguments are stored in memory as these illegal types.
-  /// In order to handle this properly we need to get the origianl types sizes
-  /// from the LLVM IR Function and fixup the ISD:InputArg values before
-  /// passing them to AnalyzeFormalArguments()
-  void getOriginalFunctionArgs(SelectionDAG &DAG,
-                               const Function *F,
-                               const SmallVectorImpl<ISD::InputArg> &Ins,
-                               SmallVectorImpl<ISD::InputArg> &OrigIns) const;
+  void analyzeFormalArgumentsCompute(CCState &State,
+                                     const SmallVectorImpl<ISD::InputArg> &Ins) const;
   void AnalyzeFormalArguments(CCState &State,
                               const SmallVectorImpl<ISD::InputArg> &Ins) const;
   void AnalyzeReturn(CCState &State,
Index: llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -37,7 +37,7 @@
   MachineFunction &MF = State.getMachineFunction();
   AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();

-  uint64_t Offset = MFI->allocateKernArg(ValVT.getStoreSize(),
+  uint64_t Offset = MFI->allocateKernArg(LocVT.getStoreSize(),
                                          ArgFlags.getOrigAlign());
   State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT, LocInfo));
   return true;
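The ValVT-to-LocVT switch in this hunk is a prerequisite for the new analysis below: analyzeFormalArgumentsCompute passes the deduced memory type as LocVT and the register type as ValVT, so the kernarg slot has to be sized by the memory type (the shader paths, which go through CC_AMDGPU, have matching types, so they are unaffected). For orientation, allocateKernArg is essentially an align-and-bump allocator over the kernarg segment; a minimal sketch, with the struct and field names assumed for illustration rather than quoted from AMDGPUMachineFunction:

#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// Align-and-bump allocation of kernel-argument slots (illustrative sketch).
struct KernArgAllocator {
  uint64_t KernArgSize = 0; // bytes allocated so far

  uint64_t allocateKernArg(uint64_t Size, unsigned Align) {
    assert(llvm::isPowerOf2_32(Align) && "alignment must be a power of two");
    KernArgSize = llvm::alignTo(KernArgSize, Align); // round up to alignment
    uint64_t Offset = KernArgSize;                   // this argument's offset
    KernArgSize += Size;                             // reserve Size bytes
    return Offset;
  }
};

With LocVT sizing, the eight i8 parts of a v8i8 argument reserve roughly one byte each (only the first part carries the original alignment) instead of the four bytes their i32 register type would claim.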
@@ -626,9 +626,104 @@
 // TargetLowering Callbacks
 //===---------------------------------------------------------------------===//

-void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+/// The SelectionDAGBuilder will automatically promote function arguments
+/// with illegal types. However, this does not work for the AMDGPU targets
+/// since the function arguments are stored in memory as these illegal types.
+/// In order to handle this properly we need to get the original type sizes
+/// from the LLVM IR Function and fix up the ISD::InputArg values before
+/// passing them to AnalyzeFormalArguments().
+
+/// When the SelectionDAGBuilder computes the Ins, it takes care of splitting
+/// input values across multiple registers. Each item in the Ins array
+/// represents a single value that will be stored in registers. Ins[x].VT is
+/// the value type of the value that will be stored in the register, so
+/// whatever SDNode we lower the argument to needs to be this type.
+///
+/// In order to correctly lower the arguments we need to know the size of each
+/// argument. Since Ins[x].VT gives us the size of the register that will
+/// hold the value, we need to look at Ins[x].ArgVT to see the 'real' type
+/// for the original function argument so that we can deduce the correct memory
+/// type to use for Ins[x]. In most cases the correct memory type will be
+/// Ins[x].ArgVT. However, this will not always be the case. If, for example,
+/// we have a kernel argument of type v8i8, this argument will be split into
+/// 8 parts and each part will be represented by its own item in the Ins array.
+/// For each part the Ins[x].ArgVT will be v8i8, which is the full type of
+/// the argument before it was split. From this, we deduce that the memory type
+/// for each individual part is i8. We pass the memory type as LocVT to the
+/// calling convention analysis function and the register type (Ins[x].VT) as
+/// the ValVT.
+void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(CCState &State,
                              const SmallVectorImpl<ISD::InputArg> &Ins) const {
+  for (unsigned i = 0, e = Ins.size(); i != e; ++i) {
+    const ISD::InputArg &In = Ins[i];
+    EVT MemVT;
+
+    unsigned NumRegs = getNumRegisters(State.getContext(), In.ArgVT);
+
+    if (In.ArgVT == MVT::i16 || In.ArgVT == MVT::i8 || In.ArgVT == MVT::f16) {
+      // The ABI says the caller will extend these values to 32-bits.
+      MemVT = In.ArgVT.isInteger() ? MVT::i32 : MVT::f32;
+    } else if (NumRegs == 1) {
+      // This argument is not split, so the IR type is the memory type.
+      assert(!In.Flags.isSplit());
+      if (In.ArgVT.isExtended()) {
+        // We have an extended type, like i24, so we should just use the
+        // register type.
+        MemVT = In.VT;
+      } else {
+        MemVT = In.ArgVT;
+      }
+    } else if (In.ArgVT.isVector() && In.VT.isVector() &&
+               In.ArgVT.getScalarType() == In.VT.getScalarType()) {
+      assert(In.ArgVT.getVectorNumElements() > In.VT.getVectorNumElements());
+      // We have a vector value which has been split into a vector with
+      // the same scalar type, but fewer elements. This should handle
+      // all the floating-point vector types.
+      MemVT = In.VT;
+    } else if (In.ArgVT.isVector() &&
+               In.ArgVT.getVectorNumElements() == NumRegs) {
+      // This arg has been split so that each element is stored in a separate
+      // register.
+      MemVT = In.ArgVT.getScalarType();
+    } else if (In.ArgVT.isExtended()) {
+      // We have an extended type, like i65.
+      MemVT = In.VT;
+    } else {
+      unsigned MemoryBits = In.ArgVT.getStoreSizeInBits() / NumRegs;
+      assert(In.ArgVT.getStoreSizeInBits() % NumRegs == 0);
+      if (In.VT.isInteger()) {
+        MemVT = EVT::getIntegerVT(State.getContext(), MemoryBits);
+      } else if (In.VT.isVector()) {
+        assert(!In.VT.getScalarType().isFloatingPoint());
+        unsigned NumElements = In.VT.getVectorNumElements();
+        assert(MemoryBits % NumElements == 0);
+        // This vector type has been split into another vector type with
+        // a different element size.
+        EVT ScalarVT = EVT::getIntegerVT(State.getContext(),
+                                         MemoryBits / NumElements);
+        MemVT = EVT::getVectorVT(State.getContext(), ScalarVT, NumElements);
+      } else {
+        llvm_unreachable("cannot deduce memory type.");
+      }
+    }
+
+    // Convert one-element vectors to scalar.
+    if (MemVT.isVector() && MemVT.getVectorNumElements() == 1)
+      MemVT = MemVT.getScalarType();
+
+    if (MemVT.isExtended()) {
+      // This should really only happen if we have vec3 arguments.
+      assert(MemVT.isVector() && MemVT.getVectorNumElements() == 3);
+      MemVT = MemVT.getPow2VectorType(State.getContext());
+    }
+
+    assert(MemVT.isSimple());
+    allocateKernArg(i, In.VT, MemVT.getSimpleVT(), CCValAssign::Full, In.Flags,
+                    State);
+  }
+}
+
+void AMDGPUTargetLowering::AnalyzeFormalArguments(CCState &State,
+                             const SmallVectorImpl<ISD::InputArg> &Ins) const {
   State.AnalyzeFormalArguments(Ins, CC_AMDGPU);
 }
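Since the deduced MemVT drives both the kernarg offsets and the eventual load types, the normalizations at the end of the loop are worth seeing in isolation. A minimal standalone sketch (illustrative, not part of the patch; it assumes LLVM's ValueTypes headers are available to compile against):

#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;

  // v8i8 split one element per register: the per-part memory type is the
  // scalar element type.
  EVT V8I8 = MVT::v8i8;
  outs() << V8I8.getEVTString() << " part type: "
         << V8I8.getScalarType().getEVTString() << "\n"; // v8i8 -> i8

  // One-element vectors collapse to their scalar type (the v1i64 test
  // re-enabled at the end of this patch exercises this path).
  EVT V1I64 = MVT::v1i64;
  if (V1I64.isVector() && V1I64.getVectorNumElements() == 1)
    outs() << V1I64.getEVTString() << " -> "
           << V1I64.getScalarType().getEVTString() << "\n"; // v1i64 -> i64

  // vec3 memory types are widened to the next power-of-two vector before
  // being handed to the calling-convention code.
  EVT V3I32 = EVT::getVectorVT(Ctx, MVT::i32, 3);
  outs() << V3I32.getEVTString() << " -> "
         << V3I32.getPow2VectorType(Ctx).getEVTString() << "\n"; // -> v4i32

  return 0;
}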
@@ -2617,38 +2712,6 @@
 // Helper functions
 //===----------------------------------------------------------------------===//

-void AMDGPUTargetLowering::getOriginalFunctionArgs(
-    SelectionDAG &DAG,
-    const Function *F,
-    const SmallVectorImpl<ISD::InputArg> &Ins,
-    SmallVectorImpl<ISD::InputArg> &OrigIns) const {
-
-  for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
-    if (Ins[i].ArgVT == Ins[i].VT) {
-      OrigIns.push_back(Ins[i]);
-      continue;
-    }
-
-    EVT VT;
-    if (Ins[i].ArgVT.isVector() && !Ins[i].VT.isVector()) {
-      // Vector has been split into scalars.
-      VT = Ins[i].ArgVT.getVectorElementType();
-    } else if (Ins[i].VT.isVector() && Ins[i].ArgVT.isVector() &&
-               Ins[i].ArgVT.getVectorElementType() !=
-               Ins[i].VT.getVectorElementType()) {
-      // Vector elements have been promoted
-      VT = Ins[i].ArgVT;
-    } else {
-      // Vector has been spilt into smaller vectors.
-      VT = Ins[i].VT;
-    }
-
-    ISD::InputArg Arg(Ins[i].Flags, VT, VT, Ins[i].Used,
-                      Ins[i].OrigArgIndex, Ins[i].PartOffset);
-    OrigIns.push_back(Arg);
-  }
-}
-
 SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
                                                    const TargetRegisterClass *RC,
                                                    unsigned Reg, EVT VT) const {
Index: llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -1512,9 +1512,11 @@

   SmallVector<ISD::InputArg, 8> LocalIns;

-  getOriginalFunctionArgs(DAG, MF.getFunction(), Ins, LocalIns);
-
-  AnalyzeFormalArguments(CCInfo, LocalIns);
+  if (AMDGPU::isShader(CallConv)) {
+    AnalyzeFormalArguments(CCInfo, Ins);
+  } else {
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  }

   for (unsigned i = 0, e = Ins.size(); i < e; ++i) {
     CCValAssign &VA = ArgLocs[i];
Index: llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
===================================================================
--- llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
+++ llvm/trunk/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -685,9 +685,6 @@
   }

   if (!AMDGPU::isShader(CallConv)) {
-    getOriginalFunctionArgs(DAG, DAG.getMachineFunction().getFunction(), Ins,
-                            Splits);
-
     assert(Info->hasWorkGroupIDX() && Info->hasWorkItemIDX());
   } else {
     assert(!Info->hasPrivateSegmentBuffer() && !Info->hasDispatchPtr() &&
@@ -735,7 +732,10 @@
     CCInfo.AllocateReg(FlatScratchInitReg);
   }

-  AnalyzeFormalArguments(CCInfo, Splits);
+  if (!AMDGPU::isShader(CallConv))
+    analyzeFormalArgumentsCompute(CCInfo, Ins);
+  else
+    AnalyzeFormalArguments(CCInfo, Splits);

   SmallVector<SDValue, 16> Chains;

@@ -752,7 +752,7 @@

     if (VA.isMemLoc()) {
       VT = Ins[i].VT;
-      EVT MemVT = Splits[i].VT;
+      EVT MemVT = VA.getLocVT();
       const unsigned Offset = Subtarget->getExplicitKernelArgOffset() +
                               VA.getLocMemOffset();
       // The first 36 bytes of the input buffer contains information about
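With the memory type now recorded in the CCValAssign, the SI argument loader reads it back via VA.getLocVT() instead of consulting a second, re-derived Ins array. In rough outline, the load it builds from MemVT looks like the sketch below (the helper name and details are assumptions; the real code also chooses the extension kind from the argument flags and uses constant-address-space pointer info):

#include "llvm/CodeGen/SelectionDAG.h"

using namespace llvm;

// Load one kernel-argument part: read MemVT (e.g. i8 for a v8i8 part) from
// the kernarg segment and extend it to the register type VT (e.g. i32).
static SDValue loadKernArgPart(SelectionDAG &DAG, const SDLoc &DL,
                               SDValue Chain, SDValue KernArgBase,
                               EVT VT, EVT MemVT, uint64_t Offset) {
  SDValue Ptr = DAG.getNode(ISD::ADD, DL, MVT::i64, KernArgBase,
                            DAG.getConstant(Offset, DL, MVT::i64));
  // An extending load: MemVT wide in memory, VT wide in the register, so a
  // one-byte part is not widened into a four-byte memory access.
  return DAG.getExtLoad(ISD::ZEXTLOAD, DL, VT, Chain, Ptr,
                        MachinePointerInfo(), MemVT);
}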
Index: llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
===================================================================
--- llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
+++ llvm/trunk/test/CodeGen/AMDGPU/v1i64-kernel-arg.ll
@@ -1,5 +1,3 @@
-; REQUIRES: asserts
-; XFAIL: *
 ; RUN: llc -march=r600 -mcpu=cypress < %s | FileCheck %s

 ; CHECK-LABEL: {{^}}kernel_arg_i64: